Skip to main content
⚡ Calmops

Distributed Tracing: OpenTelemetry, Jaeger, and Zipkin Implementation

Distributed Tracing: OpenTelemetry, Jaeger, and Zipkin Implementation

TL;DR: This guide covers implementing distributed tracing in microservices. Learn OpenTelemetry, Jaeger, Zipkin, trace context propagation, and building observable distributed systems.


Introduction

Distributed tracing tracks requests across service boundaries, enabling:

  • Root cause analysis - Find where failures occur
  • Performance optimization - Identify slow operations
  • Dependency mapping - Understand service relationships
  • User journey tracking - Follow requests through the system

OpenTelemetry Basics

Installation

# Fetch the OpenTelemetry API, the Jaeger exporter and the SDK in one command.
go get go.opentelemetry.io/otel \
    go.opentelemetry.io/otel/exporters/jaeger \
    go.opentelemetry.io/otel/sdk

Basic Tracing Setup

package main

import (
    "context"
    "fmt"
    
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/propagation"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)

// initTracer sets up tracing for the process.
//
// It creates a Jaeger exporter that sends to the agent at localhost:6831,
// builds a TracerProvider that batches spans to that exporter and labels them
// with the service name "my-service", installs the provider as the global one,
// and registers the TraceContext and Baggage propagators globally.
//
// It returns the provider so the caller can shut it down, or an error if the
// exporter could not be created.
func initTracer() (*trace.TracerProvider, error) {
    exp, err := jaeger.New(
        jaeger.WithAgentEndpoint(
            jaeger.WithAgentHost("localhost"),
            jaeger.WithAgentPort("6831"),
        ),
    )
    if err != nil {
        return nil, fmt.Errorf("creating jaeger exporter: %w", err)
    }

    // NewWithAttributes expects the schema URL first; the attributes follow.
    res := resource.NewWithAttributes(
        semconv.SchemaURL,
        semconv.ServiceName("my-service"),
    )

    tp := trace.NewTracerProvider(
        trace.WithBatcher(exp),
        trace.WithResource(res),
    )

    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
        propagation.TraceContext{},
        propagation.Baggage{},
    ))

    return tp, nil
}

Creating Spans

HTTP Handler Tracing

package main

import (
    "net/http"
    
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

// tracedHandler serves a request inside a span named "http.server.request".
//
// The span is started from the request's context and tagged with the HTTP
// method, full URL and path. The span-carrying context is handed to
// processRequest, after which the status code attribute is recorded and a
// 200 response is written.
func tracedHandler(w http.ResponseWriter, r *http.Request) {
    tracer := otel.Tracer("http-server")

    requestAttrs := []attribute.KeyValue{
        attribute.String("http.method", r.Method),
        attribute.String("http.url", r.URL.String()),
        attribute.String("http.target", r.URL.Path),
    }

    ctx, span := tracer.Start(
        r.Context(),
        "http.server.request",
        trace.WithAttributes(requestAttrs...),
    )
    defer span.End()

    // Run the business logic under the request span.
    processRequest(ctx)

    // Record the response status on the span before replying.
    span.SetAttributes(attribute.Int("http.status_code", 200))
    w.WriteHeader(http.StatusOK)
}

// processRequest runs the business logic inside a span named
// "processRequest" that is started from ctx, and tags that span with the
// operation it performs.
func processRequest(ctx context.Context) {
    tracer := otel.Tracer("application")

    // The derived context is discarded: nothing below starts a further span.
    _, span := tracer.Start(ctx, "processRequest")
    defer span.End()

    // Business logic
    span.SetAttributes(attribute.String("operation", "data-processing"))
}

Database Tracing

import (
    "context"
    
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/attribute"
)

// TracedDB wraps a *sql.DB so that queries issued through it are recorded as
// spans.
type TracedDB struct {
    db *sql.DB // underlying handle that queries are delegated to
}

// QueryContext runs query against the wrapped database inside a span named
// "db.query".
//
// The span is tagged with the database system and the query text. When the
// underlying call fails, the error is recorded on the span and the span is
// flagged with db.error=true. The rows and error from the underlying call are
// returned unchanged.
func (t *TracedDB) QueryContext(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) {
    tracer := otel.Tracer("database")
    ctx, span := tracer.Start(ctx, "db.query")
    defer span.End()

    queryAttrs := []attribute.KeyValue{
        attribute.String("db.system", "postgresql"),
        attribute.String("db.query", query),
    }
    span.SetAttributes(queryAttrs...)

    rows, err := t.db.QueryContext(ctx, query, args...)
    if err == nil {
        return rows, nil
    }

    // Failure path: annotate the span, then hand the result back as-is.
    span.RecordError(err)
    span.SetAttributes(attribute.Bool("db.error", true))
    return rows, err
}

Context Propagation

Propagating Trace Context

import (
    "net/http"
    
    "go.opentelemetry.io/otel/propagation"
)

// HTTP propagation
propagator := propagation.NewCompositeTextMapPropagator(
    propagation.TraceContext{},
    propagation.Baggage{},
)

// injectTrace writes the propagation data carried by ctx into the headers of
// the outgoing request req.
//
// http.Header is adapted to a TextMapCarrier with propagation.HeaderCarrier.
func injectTrace(ctx context.Context, req *http.Request) {
    propagator.Inject(ctx, propagation.HeaderCarrier(req.Header))
}

// extractTrace reads propagation data from the headers of the incoming request
// req and returns a context derived from ctx that carries it.
//
// http.Header is adapted to a TextMapCarrier with propagation.HeaderCarrier.
func extractTrace(ctx context.Context, req *http.Request) context.Context {
    return propagator.Extract(ctx, propagation.HeaderCarrier(req.Header))
}

Custom Propagator

// CustomPropagator is a minimal propagator that carries only the trace ID,
// using the carrier key "x-trace-id".
type CustomPropagator struct{}

// Fields returns the carrier keys this propagator reads and writes. Together
// with Inject and Extract it completes the method set that a text-map
// propagator has to provide.
func (cp *CustomPropagator) Fields() []string {
    return []string{"x-trace-id"}
}

// Inject stores the trace ID of the span found in ctx in the carrier under
// "x-trace-id". Nothing is written when that span has no trace ID.
func (cp *CustomPropagator) Inject(ctx context.Context, carrier propagation.TextMapCarrier) {
    sc := trace.SpanFromContext(ctx).SpanContext()
    if !sc.HasTraceID() {
        return
    }
    carrier.Set("x-trace-id", sc.TraceID().String())
}

// Extract reads "x-trace-id" from the carrier and, when it holds a valid
// hex-encoded trace ID, returns a context derived from ctx that carries a span
// context with that trace ID.
//
// ctx is returned unchanged when the key is absent or its value cannot be
// parsed, so a malformed value from a peer never breaks request handling.
func (cp *CustomPropagator) Extract(ctx context.Context, carrier propagation.TextMapCarrier) context.Context {
    raw := carrier.Get("x-trace-id")
    if raw == "" {
        return ctx
    }

    // TraceIDFromHex returns an error for input that is not a valid trace ID.
    traceID, err := trace.TraceIDFromHex(raw)
    if err != nil {
        return ctx
    }

    sc := trace.NewSpanContext(trace.SpanContextConfig{
        TraceID: traceID,
    })
    return trace.ContextWithSpanContext(ctx, sc)
}

Jaeger Integration

Complete Setup

package main

import (
    "context"
    "log"
    
    "github.com/gin-gonic/gin"
    "go.opentelemetry.io/contrib/instrumentation/github.com/gin-gonic/gin/otelgin"
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/exporters/stdout/stdouttrace"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)

// main starts a Gin HTTP server on :8080 whose requests are traced through
// the otelgin middleware.
//
// The tracer provider is created first and installed as the global provider so
// that the middleware, which is given no explicit provider here, uses it. The
// provider is shut down when main returns.
func main() {
    // Initialize tracer
    tp, err := tracerProvider()
    if err != nil {
        log.Fatal(err)
    }
    // Shutdown needs a context and reports an error; log it rather than drop it.
    defer func() {
        if err := tp.Shutdown(context.Background()); err != nil {
            log.Printf("shutting down tracer provider: %v", err)
        }
    }()

    // Without this the provider built above is never used by instrumentation
    // that looks up the global provider.
    otel.SetTracerProvider(tp)

    // Create Gin router with OpenTelemetry
    r := gin.Default()
    r.Use(otelgin.Middleware("my-service"))

    r.GET("/api/users", getUsers)
    r.GET("/api/users/:id", getUser)

    // Printf instead of Fatal so the deferred shutdown above still runs.
    if err := r.Run(":8080"); err != nil {
        log.Printf("running server: %v", err)
    }
}

// tracerProvider builds a TracerProvider that batches spans to a Jaeger agent
// exporter created with the default agent endpoint options, and labels them
// with service name "my-service" and version "1.0.0".
//
// It returns the error from creating the exporter unchanged. The provider is
// not installed globally here; that is left to the caller.
func tracerProvider() (*trace.TracerProvider, error) {
    jaegerExp, err := jaeger.New(jaeger.WithAgentEndpoint())
    if err != nil {
        return nil, err
    }

    // NewWithAttributes expects the schema URL first; the attributes follow.
    res := resource.NewWithAttributes(
        semconv.SchemaURL,
        semconv.ServiceName("my-service"),
        semconv.ServiceVersion("1.0.0"),
    )

    tp := trace.NewTracerProvider(
        trace.WithBatcher(jaegerExp),
        trace.WithResource(res),
    )

    return tp, nil
}

Sampling Strategies

Probabilistic Sampling

import "go.opentelemetry.io/otel/sdk/trace"

// probabilisticSampler returns the SDK's trace-ID-ratio sampler configured
// with the given fraction (for example 0.1 for the 10% used below).
func probabilisticSampler(fraction float64) trace.Sampler {
    sampler := trace.TraceIDRatioBased(fraction)
    return sampler
}

// Usage: build a TracerProvider whose sampler is TraceIDRatioBased(0.1).
// NOTE(review): this is an illustrative fragment; the short variable
// declaration below is only valid inside a function body.
tp := trace.NewTracerProvider(
    trace.WithSampler(trace.TraceIDRatioBased(0.1)), // Sample 10%
)

Adaptive Sampling

// AdaptiveSampler is a sampler whose decisions depend on the observed error
// rate of the service.
type AdaptiveSampler struct {
    sampleRate    float64 // reported on sampled spans as "sampling.rate"
    errorRate     float64 // compared against the 0.01 threshold in ShouldSample
    slowThreshold float64 // NOTE(review): not read by any code shown here
}

// Description returns a short name for the sampler. Together with
// ShouldSample it completes the method set of the SDK's Sampler interface.
func (as *AdaptiveSampler) Description() string {
    return "AdaptiveSampler"
}

// ShouldSample decides whether the span described by params is recorded and
// sampled. The checks run in this order:
//
//  1. If the parent context reports an error, the span is always sampled.
//     Note that Err() on a context is non-nil only once the context has been
//     cancelled or its deadline has passed; it does not reflect application
//     errors.
//  2. If the configured errorRate is below 0.01, the span is dropped.
//  3. Otherwise the span is sampled and tagged with "sampling.rate".
func (as *AdaptiveSampler) ShouldSample(params trace.SamplingParameters) trace.SamplingResult {
    // Always keep spans whose parent context is already cancelled or expired.
    if params.ParentContext.Err() != nil {
        return trace.SamplingResult{Decision: trace.RecordAndSample}
    }

    // Healthy service: drop the span.
    if as.errorRate < 0.01 {
        return trace.SamplingResult{Decision: trace.Drop}
    }

    return trace.SamplingResult{
        Decision: trace.RecordAndSample,
        Attributes: []attribute.KeyValue{
            attribute.Float64("sampling.rate", as.sampleRate),
        },
    }
}

Trace Analysis

Performance Dashboard Queries

-- Find slowest operations
-- Per service and operation over the last hour: mean duration, 95th-percentile
-- duration and span count; the 20 slowest by mean duration are returned.
SELECT 
    service_name,
    operation_name,
    avg(duration_ms) as avg_duration,
    -- percentile_cont is an ordered-set aggregate: the column it ranks is
    -- named in the WITHIN GROUP clause.
    percentile_cont(0.95) WITHIN GROUP (ORDER BY duration_ms) as p95_duration,
    count(*) as request_count
FROM spans
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY service_name, operation_name
ORDER BY avg_duration DESC
LIMIT 20;
-- Find error rates by service
-- Per service over the last hour: spans with status_code >= 500, total spans,
-- and their ratio. Services with 100 spans or fewer are left out.
SELECT
    service_name,
    COUNT(*) FILTER (WHERE status_code >= 500) AS error_count,
    COUNT(*) AS total_requests,
    COUNT(*) FILTER (WHERE status_code >= 500)::float / COUNT(*) AS error_rate
FROM spans
WHERE timestamp > NOW() - INTERVAL '1 hour'
GROUP BY service_name
HAVING COUNT(*) > 100;

Conclusion

Distributed tracing enables:

  1. End-to-end visibility - Track requests across services
  2. Performance optimization - Find slow operations
  3. Root cause analysis - Locate failure points
  4. Dependency mapping - Understand service relationships
  5. User journey analysis - Follow user requests

External Resources


Comments