Performance Optimization Techniques
Performance optimization in Go requires understanding both algorithmic efficiency and the behavior of the Go runtime. This guide covers practical techniques for improving your application's speed and resource usage.
Algorithmic Optimization
The most impactful optimizations often come from better algorithms.
Good: Efficient Algorithm
package main

import (
	"fmt"
	"sort"
)

// O(n log n) solution using sorting
func findDuplicates(nums []int) []int {
	if len(nums) == 0 {
		return []int{}
	}
	// Sort a copy so the caller's slice is not reordered
	sorted := make([]int, len(nums))
	copy(sorted, nums)
	sort.Ints(sorted)
	seen := make(map[int]bool)
	result := make([]int, 0)
	for i := 1; i < len(sorted); i++ {
		if sorted[i] == sorted[i-1] && !seen[sorted[i]] {
			result = append(result, sorted[i])
			seen[sorted[i]] = true
		}
	}
	return result
}

func main() {
	nums := []int{1, 2, 2, 3, 3, 3, 4}
	fmt.Println(findDuplicates(nums)) // [2 3]
}
Bad: Inefficient Algorithm
// ❌ AVOID: O(n²) nested loop solution
package main

import (
	"fmt"
)

func findDuplicates(nums []int) []int {
	result := make([]int, 0)
	// Nested loops - O(n²)
	for i := 0; i < len(nums); i++ {
		for j := i + 1; j < len(nums); j++ {
			if nums[i] == nums[j] {
				// Check if already in result
				found := false
				for _, v := range result {
					if v == nums[i] {
						found = true
						break
					}
				}
				if !found {
					result = append(result, nums[i])
				}
			}
		}
	}
	return result
}

func main() {
	nums := []int{1, 2, 2, 3, 3, 3, 4}
	fmt.Println(findDuplicates(nums))
}
Memory Optimization
Reduce allocations and improve memory usage patterns.
Good: Preallocate Slices
package main

import (
	"fmt"
)

func processData(count int) []int {
	// Preallocate with exact capacity
	result := make([]int, 0, count)
	for i := 0; i < count; i++ {
		result = append(result, i*2)
	}
	return result
}

func main() {
	data := processData(1000)
	fmt.Println("Processed:", len(data))
}
Bad: Repeated Allocations
// ❌ AVOID: Slice grows repeatedly
package main

func processData(count int) []int {
	var result []int // No capacity hint
	for i := 0; i < count; i++ {
		// Growth causes repeated allocations and copying
		result = append(result, i*2)
	}
	return result
}
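The same preallocation idea applies to maps: make accepts a size hint that avoids incremental growth and rehashing as entries are added. A small sketch (buildIndex is a name used here for illustration):

```go
package main

import "fmt"

// Passing a size hint to make sizes the map's internal buckets up
// front, avoiding rehashing as it fills.
func buildIndex(keys []string) map[string]int {
	index := make(map[string]int, len(keys)) // size hint
	for i, k := range keys {
		index[k] = i
	}
	return index
}

func main() {
	index := buildIndex([]string{"a", "b", "c"})
	fmt.Println(index["c"]) // 2
}
```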
Good: Reuse Buffers
package main

import (
	"bytes"
	"fmt"
)

type DataProcessor struct {
	buffer *bytes.Buffer
}

func NewDataProcessor() *DataProcessor {
	return &DataProcessor{
		buffer: new(bytes.Buffer),
	}
}

func (dp *DataProcessor) Process(data []byte) string {
	dp.buffer.Reset() // Reuse buffer
	dp.buffer.Write(data)
	dp.buffer.WriteString("_processed")
	return dp.buffer.String()
}

func main() {
	processor := NewDataProcessor()
	for i := 0; i < 1000; i++ {
		result := processor.Process([]byte("data"))
		_ = result
	}
	fmt.Println("Processing complete")
}
Bad: Creating New Buffers
// ❌ AVOID: New buffer for each operation
package main

import (
	"bytes"
)

func processData(data []byte) string {
	// Creates a new buffer each time
	buffer := new(bytes.Buffer)
	buffer.Write(data)
	buffer.WriteString("_processed")
	return buffer.String()
}

func main() {
	for i := 0; i < 1000; i++ {
		processData([]byte("data"))
	}
}
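A single reusable buffer like the one above is not safe for concurrent use. For that case, the standard library's sync.Pool hands out recycled buffers without a global lock; a minimal sketch:

```go
package main

import (
	"bytes"
	"fmt"
	"sync"
)

// sync.Pool recycles buffers across goroutines; Get returns a pooled
// buffer (or calls New), and Put makes it available for reuse.
var bufPool = sync.Pool{
	New: func() any { return new(bytes.Buffer) },
}

func process(data []byte) string {
	buf := bufPool.Get().(*bytes.Buffer)
	defer bufPool.Put(buf)
	buf.Reset() // pooled buffers keep their old contents
	buf.Write(data)
	buf.WriteString("_processed")
	return buf.String()
}

func main() {
	fmt.Println(process([]byte("data"))) // data_processed
}
```

Note that the pool may drop items under GC pressure, so it suits transient scratch space rather than long-lived state.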
String Optimization
Strings are immutable in Go; optimize string operations carefully.
Good: Use strings.Builder
package main

import (
	"fmt"
	"strings"
)

func buildString(parts []string) string {
	var builder strings.Builder
	for i, part := range parts {
		builder.WriteString(part)
		if i < len(parts)-1 {
			builder.WriteString(", ")
		}
	}
	return builder.String()
}

func main() {
	parts := []string{"hello", "world", "go"}
	result := buildString(parts)
	fmt.Println(result) // hello, world, go
}
Bad: String Concatenation
// ❌ AVOID: String concatenation in loops
package main

import (
	"fmt"
)

func buildString(parts []string) string {
	result := ""
	for i, part := range parts {
		// Each += allocates a new string and copies both halves
		result += part
		if i < len(parts)-1 {
			result += ", "
		}
	}
	return result
}

func main() {
	parts := []string{"hello", "world", "go"}
	result := buildString(parts)
	fmt.Println(result)
}
Concurrency Optimization
Use goroutines and channels efficiently.
Good: Worker Pool Pattern
package main

import (
	"fmt"
	"sync"
)

type Job struct {
	id   int
	data string
}

type Result struct {
	job    Job
	result string
}

func worker(jobs <-chan Job, results chan<- Result, wg *sync.WaitGroup) {
	defer wg.Done()
	for job := range jobs {
		// Process job
		result := Result{
			job:    job,
			result: fmt.Sprintf("processed_%s", job.data),
		}
		results <- result
	}
}

func main() {
	numWorkers := 4
	numJobs := 100
	jobs := make(chan Job, numJobs)
	results := make(chan Result, numJobs)
	var wg sync.WaitGroup

	// Start workers
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go worker(jobs, results, &wg)
	}

	// Send jobs
	go func() {
		for i := 0; i < numJobs; i++ {
			jobs <- Job{id: i, data: fmt.Sprintf("job_%d", i)}
		}
		close(jobs)
	}()

	// Close results once all workers finish
	go func() {
		wg.Wait()
		close(results)
	}()

	count := 0
	for result := range results {
		count++
		_ = result
	}
	fmt.Printf("Processed %d jobs\n", count)
}
Bad: Goroutine Per Task
// ❌ AVOID: Creating a goroutine for each task
package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	// Spawns 10000 goroutines at once - scheduling and memory overhead
	for i := 0; i < 10000; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			// Process task
			_ = id
		}(i)
	}
	wg.Wait()
	fmt.Println("Done")
}
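If a full worker pool feels heavyweight, a buffered channel used as a semaphore caps how many goroutines are in flight at once. A sketch (runBounded is a name used here for illustration):

```go
package main

import (
	"fmt"
	"sync"
)

// A buffered channel acts as a counting semaphore: sends block once
// maxConcurrent tasks are in flight, receives free a slot.
func runBounded(n, maxConcurrent int) []int {
	sem := make(chan struct{}, maxConcurrent)
	var wg sync.WaitGroup
	results := make([]int, n)
	for i := 0; i < n; i++ {
		wg.Add(1)
		sem <- struct{}{} // blocks while maxConcurrent tasks are running
		go func(id int) {
			defer wg.Done()
			defer func() { <-sem }() // release the slot
			results[id] = id * 2     // stand-in for real work
		}(i)
	}
	wg.Wait()
	return results
}

func main() {
	results := runBounded(10000, 4)
	fmt.Println("last:", results[len(results)-1]) // last: 19998
}
```

Each goroutine writes only its own slice element, so no extra locking is needed.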
Lock Optimization
Minimize lock contention and critical sections.
Good: Fine-Grained Locking
package main

import (
	"fmt"
	"sync"
)

type Cache struct {
	mu    sync.RWMutex
	items map[string]string
}

func NewCache() *Cache {
	return &Cache{
		items: make(map[string]string),
	}
}

func (c *Cache) Get(key string) (string, bool) {
	c.mu.RLock() // Read lock - allows concurrent reads
	defer c.mu.RUnlock()
	val, ok := c.items[key]
	return val, ok
}

func (c *Cache) Set(key, value string) {
	c.mu.Lock() // Write lock - exclusive access
	defer c.mu.Unlock()
	c.items[key] = value
}

func main() {
	cache := NewCache()
	cache.Set("key1", "value1")
	val, ok := cache.Get("key1")
	fmt.Printf("Got: %s (exists: %v)\n", val, ok)
}
Bad: Coarse-Grained Locking
// ❌ AVOID: Holding a lock during I/O
package main

import (
	"sync"
	"time"
)

type Cache struct {
	mu    sync.Mutex
	items map[string]string
}

func (c *Cache) GetAndFetch(key string) string {
	c.mu.Lock()
	defer c.mu.Unlock()
	// Holding the lock during I/O blocks every other operation!
	time.Sleep(100 * time.Millisecond)
	return c.items[key]
}
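A common fix for the pattern above is to hold the lock only around map access and do the slow fetch with no lock held. A sketch (fetchRemote is a hypothetical stand-in for real I/O); note that two goroutines may still fetch the same key concurrently, which a production cache might deduplicate:

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

type Cache struct {
	mu    sync.Mutex
	items map[string]string
}

// Check the cache under the lock, then release it before doing I/O so
// other goroutines are never blocked behind the fetch.
func (c *Cache) GetAndFetch(key string) string {
	c.mu.Lock()
	val, ok := c.items[key]
	c.mu.Unlock()
	if ok {
		return val
	}
	fetched := fetchRemote(key) // I/O happens with no lock held
	c.mu.Lock()
	c.items[key] = fetched
	c.mu.Unlock()
	return fetched
}

// fetchRemote is a hypothetical slow lookup used for illustration.
func fetchRemote(key string) string {
	time.Sleep(10 * time.Millisecond) // simulated I/O
	return "fetched_" + key
}

func main() {
	c := &Cache{items: make(map[string]string)}
	fmt.Println(c.GetAndFetch("k")) // fetched_k
}
```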
Caching Strategies
Reduce redundant computations with caching.
Good: Memoization
package main

import (
	"fmt"
	"sync"
)

type Fibonacci struct {
	mu    sync.Mutex
	cache map[int]int
}

func NewFibonacci() *Fibonacci {
	return &Fibonacci{
		cache: make(map[int]int),
	}
}

func (f *Fibonacci) Calculate(n int) int {
	f.mu.Lock()
	if val, ok := f.cache[n]; ok {
		f.mu.Unlock()
		return val
	}
	f.mu.Unlock()
	var result int
	if n <= 1 {
		result = n
	} else {
		result = f.Calculate(n-1) + f.Calculate(n-2)
	}
	f.mu.Lock()
	f.cache[n] = result
	f.mu.Unlock()
	return result
}

func main() {
	fib := NewFibonacci()
	fmt.Println(fib.Calculate(40))
}
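For read-heavy caches, sync.Map is an alternative to a mutex-guarded map; its LoadOrStore method handles check-and-insert in one call. A sketch with a deliberately trivial computation standing in for expensive work (SquareCache is a name used here for illustration):

```go
package main

import (
	"fmt"
	"sync"
)

// sync.Map is optimized for caches where keys are written once and
// read many times; no explicit locking is needed.
type SquareCache struct {
	cache sync.Map
}

func (s *SquareCache) Square(n int) int {
	if v, ok := s.cache.Load(n); ok {
		return v.(int) // cache hit
	}
	result := n * n // stand-in for an expensive computation
	// LoadOrStore returns the existing value if another goroutine
	// stored one first, so all callers agree on the cached result.
	actual, _ := s.cache.LoadOrStore(n, result)
	return actual.(int)
}

func main() {
	var sc SquareCache
	fmt.Println(sc.Square(12)) // 144
	fmt.Println(sc.Square(12)) // 144 (cached)
}
```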
Loop Optimization
Optimize hot loops for better performance.
Good: Efficient Loop
package main

import (
	"fmt"
)

func sumArray(arr []int) int {
	sum := 0
	// Simple, efficient loop
	for _, v := range arr {
		sum += v
	}
	return sum
}

func main() {
	arr := make([]int, 1000000)
	for i := range arr {
		arr[i] = i
	}
	result := sumArray(arr)
	fmt.Println("Sum:", result)
}
Bad: Inefficient Loop
Note that indexing with i < len(arr) is not itself slow - the compiler hoists the slice length and usually eliminates bounds checks. Real waste comes from redundant, loop-invariant work:
// ❌ AVOID: Repeating loop-invariant work
package main

import (
	"strings"
)

func countMatches(lines []string, words []string) int {
	count := 0
	for _, line := range lines {
		// Rebuilds the same search string on every iteration -
		// hoist strings.Join above the loop instead
		pattern := strings.Join(words, " ")
		if strings.Contains(line, pattern) {
			count++
		}
	}
	return count
}
Inlining Optimization
Small, simple leaf functions can be inlined by the compiler, which removes call overhead. Functions containing loops, defer, or recover are much less likely to be inlined; you can inspect inlining decisions with go build -gcflags=-m.
Good: Inline-Friendly Functions
package main

import (
	"fmt"
)

// Small leaf function - likely to be inlined
func add(a, b int) int {
	return a + b
}

// Contains a loop - unlikely to be inlined
func sumTo(n int) int {
	total := 0
	for i := 1; i <= n; i++ {
		total += i
	}
	return total
}

func main() {
	fmt.Println(add(5, 3)) // 8
	fmt.Println(sumTo(10)) // 55
}
Benchmarking for Optimization
Use benchmarks to measure improvements; run them with go test -bench=. -benchmem to see allocation counts alongside timings.
Benchmark Example
// In a file ending in _test.go
package main

import (
	"strings"
	"testing"
)

func BenchmarkStringConcat(b *testing.B) {
	for i := 0; i < b.N; i++ {
		result := ""
		for j := 0; j < 100; j++ {
			result += "x"
		}
		_ = result
	}
}

func BenchmarkStringBuilder(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var builder strings.Builder
		for j := 0; j < 100; j++ {
			builder.WriteString("x")
		}
		_ = builder.String()
	}
}
Best Practices
- Profile First: Use profiling to identify actual bottlenecks
- Measure Changes: Always benchmark before and after optimizations
- Optimize Algorithms: Focus on algorithmic improvements first
- Reduce Allocations: Minimize memory allocations in hot paths
- Use Appropriate Data Structures: Choose the right data structure for your use case
- Leverage Concurrency: Use goroutines for I/O-bound operations
- Minimize Lock Contention: Use fine-grained locking and RWMutex
- Cache Strategically: Cache expensive computations
Common Pitfalls
- Premature Optimization: Optimize based on profiling, not assumptions
- Over-Optimization: Don’t sacrifice readability for marginal gains
- Ignoring Allocations: Memory allocations are often the bottleneck
- Lock Contention: Coarse-grained locking can kill performance
- Goroutine Overhead: Creating too many goroutines wastes resources
Summary
Performance optimization in Go requires a systematic approach: profile to identify bottlenecks, optimize algorithms first, then focus on memory and concurrency. Use benchmarks to validate improvements and avoid premature optimization. Remember that readable, maintainable code is often more valuable than micro-optimizations.