Distributed Tracing: OpenTelemetry, Jaeger, and Zipkin Implementation
TL;DR: This guide covers implementing distributed tracing in microservices. Learn OpenTelemetry, Jaeger, Zipkin, trace context propagation, and building observable distributed systems.
Introduction
Distributed tracing tracks requests across service boundaries, enabling:
- Root cause analysis - Find where failures occur
- Performance optimization - Identify slow operations
- Dependency mapping - Understand service relationships
- User journey tracking - Follow requests through the system
OpenTelemetry Basics
Installation
go get go.opentelemetry.io/otel \
  go.opentelemetry.io/otel/exporters/jaeger \
  go.opentelemetry.io/otel/sdk
Note: the dedicated Jaeger exporter is deprecated in opentelemetry-go (removed after v1.16); for new projects prefer the OTLP exporter (go.opentelemetry.io/otel/exporters/otlp/otlptrace), which Jaeger ingests natively.
Basic Tracing Setup
package main
import (
"context"
"fmt"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/jaeger"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
"go.opentelemetry.io/otel/sdk/trace"
semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)
// initTracer builds a Jaeger-backed TracerProvider, installs it as the
// global provider, and registers W3C TraceContext + Baggage propagation.
// The caller must call Shutdown on the returned provider at teardown to
// flush buffered spans.
func initTracer() (*trace.TracerProvider, error) {
	// Export spans to a local Jaeger agent over UDP (thrift-compact).
	exp, err := jaeger.New(
		jaeger.WithAgentEndpoint(
			jaeger.WithAgentHost("localhost"),
			jaeger.WithAgentPort("6831"),
		),
	)
	if err != nil {
		return nil, err
	}
	tp := trace.NewTracerProvider(
		// Batch spans before export to reduce network overhead.
		trace.WithBatcher(exp),
		trace.WithResource(resource.NewWithAttributes(
			// resource.NewWithAttributes takes the semconv schema URL as
			// its first argument; omitting it does not compile.
			semconv.SchemaURL,
			semconv.ServiceName("my-service"),
		)),
	)
	otel.SetTracerProvider(tp)
	otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
		propagation.TraceContext{},
		propagation.Baggage{},
	))
	return tp, nil
}
Creating Spans
HTTP Handler Tracing
package main
import (
"net/http"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
)
// tracedHandler wraps an incoming HTTP request in a server span,
// annotates it with basic HTTP attributes, and delegates the real work
// to processRequest under the span-carrying context.
func tracedHandler(w http.ResponseWriter, r *http.Request) {
	tracer := otel.Tracer("http-server")
	ctx, span := tracer.Start(
		r.Context(),
		"http.server.request",
		trace.WithAttributes(
			attribute.String("http.method", r.Method),
			attribute.String("http.url", r.URL.String()),
			attribute.String("http.target", r.URL.Path),
		),
	)
	defer span.End()

	// Business logic runs under the span's context so child spans nest.
	processRequest(ctx)

	// Record the response outcome on the span before replying.
	span.SetAttributes(attribute.Int("http.status_code", 200))
	w.WriteHeader(http.StatusOK)
}
// processRequest performs the request's business logic under its own
// child span so the work appears as a distinct node in the trace.
func processRequest(ctx context.Context) {
	tracer := otel.Tracer("application")
	ctx, span := tracer.Start(ctx, "processRequest")
	defer span.End()

	// Tag the span so traces can be filtered by operation type.
	span.SetAttributes(attribute.String("operation", "data-processing"))
}
Database Tracing
import (
"context"
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
)
// TracedDB decorates *sql.DB so that every query is recorded as a span.
type TracedDB struct {
	db *sql.DB
}

// QueryContext executes the query through the wrapped DB inside a
// "db.query" span annotated with the database system, the SQL text,
// and — on failure — the error.
func (t *TracedDB) QueryContext(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) {
	ctx, span := otel.Tracer("database").Start(ctx, "db.query")
	defer span.End()

	span.SetAttributes(
		attribute.String("db.system", "postgresql"),
		attribute.String("db.query", query),
	)

	rows, err := t.db.QueryContext(ctx, query, args...)
	if err == nil {
		return rows, nil
	}

	// Surface the failure on the span so it is visible in the trace UI.
	span.RecordError(err)
	span.SetAttributes(attribute.Bool("db.error", true))
	return rows, err
}
Context Propagation
Propagating Trace Context
import (
"net/http"
"go.opentelemetry.io/otel/propagation"
)
// HTTP propagation
propagator := propagation.NewCompositeTextMapPropagator(
propagation.TraceContext{},
propagation.Baggage{},
)
// injectTrace writes the trace context from ctx into the outgoing
// request's headers so the downstream service can continue the trace.
// propagation.HeaderCarrier is the http.Header adapter for the
// TextMapCarrier interface (propagation.HTTPHeaders does not exist).
func injectTrace(ctx context.Context, req *http.Request) {
	propagator.Inject(ctx, propagation.HeaderCarrier(req.Header))
}
// extractTrace reads any incoming trace context from the request
// headers and returns a context carrying it. The correct carrier type
// is propagation.HeaderCarrier; propagation.HTTPCarrier does not exist.
func extractTrace(ctx context.Context, req *http.Request) context.Context {
	return propagator.Extract(ctx, propagation.HeaderCarrier(req.Header))
}
Custom Propagator
// CustomPropagator carries only the trace ID through a vendor-specific
// "x-trace-id" header.
type CustomPropagator struct{}

// Inject writes the current span's trace ID into the carrier, if any.
func (cp *CustomPropagator) Inject(ctx context.Context, carrier propagation.TextMapCarrier) {
	span := trace.SpanFromContext(ctx)
	if span.SpanContext().HasTraceID() {
		carrier.Set("x-trace-id", span.SpanContext().TraceID().String())
	}
}

// Extract reads the trace ID from the carrier and, when it parses as a
// valid 16-byte hex ID, returns a context carrying a remote span
// context with that trace ID.
func (cp *CustomPropagator) Extract(ctx context.Context, carrier propagation.TextMapCarrier) context.Context {
	raw := carrier.Get("x-trace-id")
	if raw == "" {
		return ctx
	}
	// trace.ParseTraceID returns (TraceID, error); ignoring the error
	// return, as the original code did, does not compile.
	traceID, err := trace.ParseTraceID(raw)
	if err != nil {
		return ctx
	}
	sc := trace.NewSpanContext(trace.SpanContextConfig{
		TraceID: traceID,
		Remote:  true, // the ID arrived from another process
	})
	return trace.ContextWithSpanContext(ctx, sc)
}

// Fields lists the header keys this propagator reads and writes; it is
// required to satisfy the propagation.TextMapPropagator interface.
func (cp *CustomPropagator) Fields() []string {
	return []string{"x-trace-id"}
}
Jaeger Integration
Complete Setup
package main
import (
	"context"
	"log"

	"github.com/gin-gonic/gin"
	"go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/jaeger"
	"go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
	"go.opentelemetry.io/otel/sdk/resource"
	"go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)
// main initializes tracing, wires the OpenTelemetry middleware into a
// Gin router, and serves the API on :8080.
func main() {
	tp, err := tracerProvider()
	if err != nil {
		log.Fatal(err)
	}
	// Shutdown flushes buffered spans. It takes a context and returns
	// an error — both were ignored by the original `defer tp.Shutdown()`.
	defer func() {
		if err := tp.Shutdown(context.Background()); err != nil {
			log.Printf("tracer shutdown: %v", err)
		}
	}()

	// Every route handled by the router gets a server span automatically.
	r := gin.Default()
	r.Use(otelgin.Middleware("my-service"))
	r.GET("/api/users", getUsers)
	r.GET("/api/users/:id", getUser)
	if err := r.Run(":8080"); err != nil {
		log.Fatal(err)
	}
}
// tracerProvider builds a TracerProvider that batches spans to a Jaeger
// agent (default localhost:6831) and tags them with service identity.
// The caller owns shutdown of the returned provider.
func tracerProvider() (*trace.TracerProvider, error) {
	jaegerExp, err := jaeger.New(jaeger.WithAgentEndpoint())
	if err != nil {
		return nil, err
	}
	tp := trace.NewTracerProvider(
		trace.WithBatcher(jaegerExp),
		trace.WithResource(resource.NewWithAttributes(
			// The semconv schema URL is a required first argument to
			// resource.NewWithAttributes.
			semconv.SchemaURL,
			semconv.ServiceName("my-service"),
			semconv.ServiceVersion("1.0.0"),
		)),
	)
	return tp, nil
}
Sampling Strategies
Probabilistic Sampling
import "go.opentelemetry.io/otel/sdk/trace"
// probabilisticSampler returns a sampler that keeps roughly the given
// fraction (0.0–1.0) of traces. The decision is derived from the trace
// ID, so every span of a given trace shares the same sampling decision.
func probabilisticSampler(fraction float64) trace.Sampler {
return trace.TraceIDRatioBased(fraction)
}
// Usage
tp := trace.NewTracerProvider(
trace.WithSampler(trace.TraceIDRatioBased(0.1)), // Sample 10%
)
Adaptive Sampling
// AdaptiveSampler varies the sampling decision with observed service
// health: already-failed parent contexts are always kept, while traffic
// from healthy (low error-rate) services is dropped.
type AdaptiveSampler struct {
	sampleRate    float64 // base rate reported on sampled spans
	errorRate     float64 // recent error rate of the service
	slowThreshold float64 // latency threshold — not yet used; TODO wire in
}

// ShouldSample implements trace.Sampler. Note the parameter type is
// trace.SamplingParameters — the original trace.SamplingParams does not
// exist in the SDK.
func (as *AdaptiveSampler) ShouldSample(params trace.SamplingParameters) trace.SamplingResult {
	// Increase sampling for errors: keep anything whose parent context
	// has already been cancelled or failed.
	if params.ParentContext.Err() != nil {
		return trace.SamplingResult{Decision: trace.RecordAndSample}
	}
	// Decrease sampling for healthy services to save storage.
	if as.errorRate < 0.01 {
		return trace.SamplingResult{Decision: trace.Drop}
	}
	return trace.SamplingResult{
		Decision: trace.RecordAndSample,
		Attributes: []attribute.KeyValue{
			attribute.Float64("sampling.rate", as.sampleRate),
		},
	}
}

// Description implements trace.Sampler; without it AdaptiveSampler does
// not satisfy the interface and cannot be passed to trace.WithSampler.
func (as *AdaptiveSampler) Description() string {
	return "AdaptiveSampler{error-rate-aware}"
}
Trace Analysis
Performance Dashboard Queries
-- Find slowest operations
SELECT
service_name,
operation_name,
avg(duration_ms) as avg_duration,
percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) as p95_duration,
count(*) as request_count
FROM spans
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY service_name, operation_name
ORDER BY avg_duration DESC
LIMIT 20;
-- Find error rates by service
SELECT
service_name,
count(*) FILTER (WHERE status_code >= 500) as error_count,
count(*) as total_requests,
count(*) FILTER (WHERE status_code >= 500)::float / count(*) as error_rate
FROM spans
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY service_name
HAVING count(*) > 100;
Conclusion
Distributed tracing enables:
- End-to-end visibility - Track requests across services
- Performance optimization - Find slow operations
- Root cause analysis - Locate failure points
- Dependency mapping - Understand service relationships
- User journey analysis - Follow user requests
Comments