Performance Tuning for Systems in Go
Introduction
System-level performance tuning requires understanding bottlenecks and optimization techniques. This guide covers profiling, benchmarking, and optimization strategies.
Effective performance tuning ensures your system applications run efficiently under load.
Profiling
CPU Profiling
package main
import (
"fmt"
"os"
"runtime/pprof"
)
// StartCPUProfile creates the named file and begins streaming CPU
// profile samples into it. The returned *os.File must later be handed
// to StopCPUProfile; on any error no file handle is left open.
func StartCPUProfile(filename string) (*os.File, error) {
	out, err := os.Create(filename)
	if err != nil {
		return nil, err
	}
	if startErr := pprof.StartCPUProfile(out); startErr != nil {
		// Profiling never started, so release the file before reporting.
		out.Close()
		return nil, startErr
	}
	return out, nil
}
// StopCPUProfile flushes any buffered samples, stops the CPU profiler,
// and closes the profile file, returning the close error if any.
func StopCPUProfile(f *os.File) error {
	// Stop first so all remaining samples reach f before it is closed.
	pprof.StopCPUProfile()
	closeErr := f.Close()
	return closeErr
}
// CPUProfilingExample profiles a tight arithmetic loop, writing the
// samples to cpu.prof in the current directory. It demonstrates the
// StartCPUProfile/StopCPUProfile pairing.
func CPUProfilingExample() {
	f, err := StartCPUProfile("cpu.prof")
	if err != nil {
		// The original ignored this error and passed a nil file to
		// StopCPUProfile; report and bail out instead.
		fmt.Fprintln(os.Stderr, "could not start CPU profile:", err)
		return
	}
	defer StopCPUProfile(f)
	// Workload to profile.
	for i := 0; i < 1000000; i++ {
		_ = i * i
	}
}
Memory Profiling
package main
import (
"os"
"runtime"
"runtime/pprof"
)
// WriteMemProfile writes a heap profile of the current process to
// filename. A GC is forced first so the profile reflects up-to-date
// allocation statistics.
func WriteMemProfile(filename string) error {
	f, err := os.Create(filename)
	if err != nil {
		return err
	}
	// Get up-to-date heap statistics before snapshotting.
	runtime.GC()
	if err := pprof.WriteHeapProfile(f); err != nil {
		f.Close()
		return err
	}
	// Surface close errors: a failed flush would corrupt the profile,
	// and the original's deferred Close silently discarded them.
	return f.Close()
}
// WriteGoroutineProfile writes a snapshot of all current goroutine
// stacks to filename in pprof format (debug level 0).
func WriteGoroutineProfile(filename string) error {
	f, err := os.Create(filename)
	if err != nil {
		return err
	}
	// The "goroutine" profile is one of pprof's predefined profiles, so
	// Lookup is guaranteed to return a non-nil *Profile here.
	if err := pprof.Lookup("goroutine").WriteTo(f, 0); err != nil {
		f.Close()
		return err
	}
	// Return the close error instead of discarding it via defer: a
	// failed flush would leave a truncated profile on disk.
	return f.Close()
}
Good: Proper Performance Tuning
package main
import (
"fmt"
"runtime"
"sync"
"time"
)
// PerformanceMonitor captures a baseline of wall-clock time and heap
// usage so the cost of a code section can be reported afterwards.
type PerformanceMonitor struct {
	startTime time.Time // when monitoring began
	startMem  uint64    // heap bytes allocated at start (MemStats.Alloc)
}

// NewPerformanceMonitor snapshots the current time and heap usage and
// returns a monitor ready for a later Report call.
func NewPerformanceMonitor() *PerformanceMonitor {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)
	return &PerformanceMonitor{
		startTime: time.Now(),
		startMem:  m.Alloc,
	}
}

// Report prints elapsed time, heap growth, the current goroutine
// count, and cumulative GC runs since the monitor was created.
func (pm *PerformanceMonitor) Report() {
	var m runtime.MemStats
	runtime.ReadMemStats(&m)
	duration := time.Since(pm.startTime)
	// Alloc can shrink if a GC ran in between; clamp to zero instead of
	// letting the unsigned subtraction wrap to an enormous value.
	var memUsed uint64
	if m.Alloc > pm.startMem {
		memUsed = m.Alloc - pm.startMem
	}
	fmt.Printf("Duration: %v\n", duration)
	fmt.Printf("Memory used: %d MB\n", memUsed/1024/1024)
	fmt.Printf("Goroutines: %d\n", runtime.NumGoroutine())
	fmt.Printf("GC runs: %d\n", m.NumGC)
}

// Benchmark runs fn once and prints its performance metrics under name.
func Benchmark(name string, fn func()) {
	monitor := NewPerformanceMonitor()
	fn()
	fmt.Printf("=== %s ===\n", name)
	monitor.Report()
}
// OptimizedLoop returns the sum of the integers 0..n-1 using a plain
// counting loop: no allocations and no per-iteration function calls.
func OptimizedLoop(n int) int64 {
	total := int64(0)
	for v := 0; v < n; v++ {
		total += int64(v)
	}
	return total
}
// UnoptimizedLoop demonstrates unoptimized loop
// NOTE: deliberately wasteful for demonstration — the loop condition
// semantically re-evaluates make([]int, n) on every iteration just to
// read its length (the compiler may elide the allocation, but the
// pattern is the anti-pattern being taught). The result is the same
// sum of 0..n-1 that OptimizedLoop computes.
func UnoptimizedLoop(n int) int64 {
var sum int64
// Function call in loop - slower
for i := 0; i < len(make([]int, n)); i++ {
sum += int64(i)
}
return sum
}
// ConcurrencyOptimization sums data by splitting it into contiguous
// chunks processed by concurrent worker goroutines. workers values
// below 1 are treated as 1, and the worker count is capped at
// len(data) so no goroutine is created with an empty chunk.
func ConcurrencyOptimization(data []int, workers int) int64 {
	if len(data) == 0 {
		return 0
	}
	if workers < 1 {
		// The original divided by workers unchecked, panicking on 0.
		workers = 1
	}
	if workers > len(data) {
		workers = len(data)
	}
	chunkSize := len(data) / workers
	results := make(chan int64, workers)
	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		// Pass i explicitly: the original closure read the shared loop
		// variable i, a data race (and wrong last-chunk detection)
		// before Go 1.22's per-iteration loop variables.
		go func(idx, start int) {
			defer wg.Done()
			end := start + chunkSize
			if idx == workers-1 {
				// The last worker also takes the remainder elements.
				end = len(data)
			}
			var sum int64
			for j := start; j < end; j++ {
				sum += int64(data[j])
			}
			results <- sum
		}(i, i*chunkSize)
	}
	wg.Wait()
	close(results)
	var total int64
	for partial := range results {
		total += partial
	}
	return total
}
// CacheOptimized holds its elements in one contiguous slice so that
// sequential scans stay cache-friendly.
type CacheOptimized struct {
	data []int
}

// NewCacheOptimized allocates a zero-filled structure of size ints.
func NewCacheOptimized(size int) *CacheOptimized {
	co := &CacheOptimized{data: make([]int, size)}
	return co
}

// Process returns the sum of all elements, visiting them in memory
// order to exploit cache locality.
func (co *CacheOptimized) Process() int64 {
	var total int64
	for _, v := range co.data {
		total += int64(v)
	}
	return total
}
Bad: Improper Performance Tuning
package main
// BAD: Function calls in loop
// BadLoop routes every element through a separate function; when the
// callee is too large to inline, the per-iteration call overhead
// dominates a body this trivial. (In practice the compiler would
// inline expensiveFunction — the pattern, not this exact code, is the
// anti-pattern.) Returns the sum of squares 0²..(n-1)².
func BadLoop(n int) int64 {
var sum int64
for i := 0; i < n; i++ {
sum += int64(expensiveFunction(i))
}
return sum
}
// expensiveFunction stands in for a costly callee; it returns x squared.
func expensiveFunction(x int) int {
return x * x
}
// BAD: No concurrency
// BadSequential sums data on a single goroutine; contrast with
// ConcurrencyOptimization, which divides the same work across workers.
func BadSequential(data []int) int64 {
	var total int64
	for idx := 0; idx < len(data); idx++ {
		total += int64(data[idx])
	}
	return total
}
// BAD: Poor cache locality
// BadCacheLocality sums a matrix by walking columns in the outer loop,
// deliberately striding across rows so consecutive accesses land in
// different cache lines. (Panics on an empty matrix, like the original.)
func BadCacheLocality(matrix [][]int) int64 {
	var total int64
	cols := len(matrix[0])
	rows := len(matrix)
	for col := 0; col < cols; col++ {
		for row := 0; row < rows; row++ {
			total += int64(matrix[row][col])
		}
	}
	return total
}
Problems:
- Per-element function calls in hot loops add avoidable call overhead
- Strictly sequential processing leaves available CPU cores idle
- Column-major traversal of row-major data defeats the CPU cache
- Nothing is measured, so there is no evidence of where the time actually goes
Benchmarking
package main
import (
"testing"
)
// BenchmarkOptimized measures the allocation-free summing loop.
// Run with: go test -bench BenchmarkOptimized -benchmem
func BenchmarkOptimized(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		OptimizedLoop(1000)
	}
}
// BenchmarkUnoptimized measures the slice-allocating loop so its cost
// can be compared directly against BenchmarkOptimized via benchstat.
func BenchmarkUnoptimized(b *testing.B) {
	for iter := 0; iter < b.N; iter++ {
		UnoptimizedLoop(1000)
	}
}
// BenchmarkConcurrency measures the 4-worker concurrent sum over a
// 10k-element slice. Fixture construction happens before ResetTimer
// so only the summing itself is timed.
func BenchmarkConcurrency(b *testing.B) {
	const size = 10000
	data := make([]int, size)
	for idx := 0; idx < size; idx++ {
		data[idx] = idx
	}
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		ConcurrencyOptimization(data, 4)
	}
}
Best Practices
1. Profile Before Optimizing
monitor := NewPerformanceMonitor()
// Run code
monitor.Report()
2. Benchmark Changes
// Compare before and after
3. Optimize Bottlenecks
// Focus on hot paths
4. Monitor in Production
// Continuous monitoring
Common Pitfalls
1. Premature Optimization
Profile first.
2. No Benchmarking
Always benchmark changes.
3. Ignoring Cache
Consider cache locality.
4. No Monitoring
Monitor production performance.
Resources
Summary
Performance tuning is essential. Key takeaways:
- Profile before optimizing
- Benchmark changes
- Optimize hot paths
- Consider cache locality
- Use concurrency appropriately
- Monitor production
- Test on target hardware
By mastering performance tuning, you can build efficient systems.
Comments