Observability for Microservices: Building Observable Distributed Systems
TL;DR: This guide covers building observable microservices: the three pillars of observability, distributed tracing, correlating signals across services, and defining SLOs so reliability can be measured.
Introduction
Observability in microservices requires:
- Logs - Discrete events with timestamps
- Metrics - Aggregatable measurements
- Traces - Request flow through the system
Together, they provide complete visibility.
The Three Pillars
1. Structured Logging
// Correlated logging
func ProcessOrder(ctx context.Context, order Order) error {
    logger := zerolog.Ctx(ctx).
        With().
        Str("order_id", order.ID).
        Str("user_id", order.UserID).
        Logger()

    logger.Info().Msg("Processing order")

    if err := validateOrder(order); err != nil {
        logger.Error().Err(err).Msg("Order validation failed")
        return err
    }
    if err := chargePayment(ctx, order); err != nil {
        logger.Error().Err(err).Msg("Payment failed")
        return err
    }

    logger.Info().Msg("Order processed successfully")
    return nil
}
2. Distributed Tracing
// Trace context propagation
func chargePayment(ctx context.Context, order Order) error {
    ctx, span := tracer.Start(ctx, "chargePayment")
    defer span.End()

    span.SetAttributes(
        attribute.String("order_id", order.ID),
        attribute.Float64("amount", order.Amount),
    )

    // All downstream calls automatically carry trace context
    return paymentClient.Charge(ctx, order)
}
3. Metrics Collection
// Application metrics
var (
    ordersProcessed = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "orders_processed_total",
            Help: "Total orders processed",
        },
        []string{"status", "service"},
    )

    orderDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "order_processing_duration_seconds",
            Help:    "Order processing duration",
            Buckets: []float64{0.1, 0.5, 1, 2, 5, 10},
        },
        []string{"service"},
    )
)
Service Mesh Integration
Istio Telemetry
# Istio VirtualService routing on the correlation header
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: orders-service
spec:
  hosts:
    - orders
  http:
    - match:
        - headers:
            x-request-id: {}   # empty match checks for header presence
      route:
        - destination:
            host: orders
# Note: the Envoy sidecars generate and propagate the x-request-id and
# x-b3-* trace headers automatically; no manual header rewriting is needed.
OpenTelemetry Collector
# otel-collector.yaml
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
  prometheus:
    config:
      scrape_configs:
        - job_name: 'application'
          static_configs:
            - targets: ['localhost:8080']

processors:
  batch:
    timeout: 5s
    send_batch_size: 1000
  memory_limiter:
    check_interval: 1s
    limit_mib: 400
  tail_sampling:
    decision_wait: 10s
    policies:
      - name: errors-policy
        type: status_code
        status_code: {status_codes: [ERROR]}
      - name: slow-traces-policy
        type: latency
        latency: {threshold_ms: 1000}

exporters:
  otlp:
    endpoint: jaeger:4317
  prometheus:
    endpoint: 0.0.0.0:8889
  loki:
    endpoint: http://loki:3100/loki/api/v1/push

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, tail_sampling, batch]
      exporters: [otlp]
    metrics:
      receivers: [prometheus]
      processors: [memory_limiter, batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [loki]
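The tail_sampling processor buffers each trace for `decision_wait`, then keeps it if any policy matches: any span with an ERROR status (errors-policy) or any span slower than 1000 ms (slow-traces-policy). The decision those two policies encode can be sketched in plain Go (the `Span` struct here is a simplified stand-in, not the collector's internal type):

```go
package main

import "fmt"

// Span is a simplified stand-in for a span collected during decision_wait.
type Span struct {
    Err        bool    // maps to status_code: ERROR
    DurationMS float64 // maps to the latency policy
}

// keepTrace mirrors the two policies above: keep the whole trace if any
// span errored, or if any span exceeded the 1000ms latency threshold.
func keepTrace(spans []Span) bool {
    for _, s := range spans {
        if s.Err || s.DurationMS > 1000 {
            return true
        }
    }
    return false
}

func main() {
    fmt.Println(keepTrace([]Span{{DurationMS: 120}}))                 // false: fast and healthy, sampled out
    fmt.Println(keepTrace([]Span{{DurationMS: 50}, {Err: true}}))     // true: one span errored
    fmt.Println(keepTrace([]Span{{DurationMS: 1500}}))                // true: slow span
}
```

This is why tail sampling is worth the buffering cost: the interesting traces (errors and outliers) are kept deterministically, while routine healthy traffic is dropped.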
Correlation Across Services
Trace Context Propagation
// HTTP client with tracing
func callService(ctx context.Context, url string) (*http.Response, error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
    if err != nil {
        return nil, err
    }
    // Inject trace context into the outgoing headers
    p := propagation.TraceContext{}
    p.Inject(ctx, propagation.HeaderCarrier(req.Header))
    return http.DefaultClient.Do(req)
}

// HTTP server with tracing
func handler(w http.ResponseWriter, r *http.Request) {
    // Extract trace context from the incoming headers
    p := propagation.TraceContext{}
    ctx := p.Extract(r.Context(), propagation.HeaderCarrier(r.Header))
    ctx, span := tracer.Start(ctx, "handleRequest")
    defer span.End()
    processRequest(ctx, r)
}
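On the wire, `propagation.TraceContext` writes a single W3C `traceparent` header with four dash-separated fields: version, trace ID, parent span ID, and trace flags. A stdlib sketch of building and parsing that format (illustrative; the real otel propagator does stricter validation):

```go
package main

import (
    "fmt"
    "strings"
)

// buildTraceparent assembles a W3C traceparent value:
// 2-hex version, 32-hex trace ID, 16-hex parent span ID, 2-hex flags.
func buildTraceparent(traceID, spanID string, sampled bool) string {
    flags := "00"
    if sampled {
        flags = "01" // bit 0 = sampled
    }
    return fmt.Sprintf("00-%s-%s-%s", traceID, spanID, flags)
}

// parseTraceparent splits the header back into its fields, rejecting
// values with the wrong shape.
func parseTraceparent(h string) (traceID, spanID string, sampled bool, ok bool) {
    parts := strings.Split(h, "-")
    if len(parts) != 4 || len(parts[1]) != 32 || len(parts[2]) != 16 {
        return "", "", false, false
    }
    return parts[1], parts[2], parts[3] == "01", true
}

func main() {
    // Example IDs taken from the W3C Trace Context specification.
    h := buildTraceparent("4bf92f3577b34da6a3ce929d0e0e4736", "00f067aa0ba902b7", true)
    fmt.Println(h) // 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
    tid, _, sampled, _ := parseTraceparent(h)
    fmt.Println(tid, sampled)
}
```

The trace ID stays constant across every hop, while each service replaces the span ID with its own current span, which is exactly what lets a backend like Jaeger stitch the hops into one trace.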
Unified Dashboard Queries
# Correlate metrics with traces:
# find services whose p95 latency is high while they are also emitting errors
histogram_quantile(0.95,
  sum by (service, le) (rate(http_request_duration_seconds_bucket[5m])))
and on(service)
rate(orders_processed_total{status="error"}[5m]) > 0

# Correlate logs with traces (LogQL): error logs that carry a trace ID
{service="orders"} | json | trace_id != "" | level = "error"
Service Level Objectives
SLO Definitions
# Illustrative SLO resource; real tooling (e.g. Sloth, OpenSLO) defines its own schema
apiVersion: monitoring.coreos.com/v1
kind: ServiceLevelObjective
metadata:
  name: orders-availability
  namespace: monitoring
spec:
  service: orders.svc.cluster.local
  objective:
    availability: "99.9%"
    errorBudget: "0.1%"
  alerting:
    name: orders-availability-alert
    expr: |
      1 - (
        sum(rate(orders_processed_total{status="success"}[5m])) /
        sum(rate(orders_processed_total[5m]))
      ) > 0.001
Error Budget Burn Rate
# Burn rate = observed error ratio / error budget (0.1% => 0.001).
# Slow burn: 1-hour window
(
  sum(rate(orders_processed_total{status="error"}[1h])) /
  sum(rate(orders_processed_total[1h]))
) / 0.001
# Fast burn: 5-minute window; alert when both windows exceed the threshold
# to avoid firing on short blips
(
  sum(rate(orders_processed_total{status="error"}[5m])) /
  sum(rate(orders_processed_total[5m]))
) / 0.001
Incident Correlation
Alert to Trace Workflow
from datetime import timedelta

# jaeger, loki, and create_incident are illustrative client helpers.
def on_alert(alert):
    """When a high-error-rate alert fires, gather traces and logs for context."""
    # 1. Get service from alert labels
    service = alert.labels['service']

    # 2. Query recent error traces in Jaeger
    errors = jaeger.query(
        service=service,
        tags={'error': True},
        start_time=alert.fired_at - timedelta(minutes=15),
        end_time=alert.fired_at,
    )

    # 3. Get related logs
    logs = loki.query(
        f'{{service="{service}"}} |= "error"',
        start=alert.fired_at - timedelta(minutes=15),
        end=alert.fired_at,
    )

    # 4. Create incident with all context attached
    incident = create_incident(
        title=f"High error rate in {service}",
        severity=alert.severity,
        links={
            'jaeger': jaeger.get_trace_url(errors[0].trace_id),
            'logs': logs.link,
            'dashboard': f"https://grafana.com/d/{service}",
        },
    )
    return incident
Building Observable Services
Best Practices Checklist
- Use structured logging (JSON)
- Include correlation IDs in all logs
- Add trace context to all requests
- Expose application metrics (RED metrics)
- Export resource metrics (USE metrics)
- Define SLOs for critical user journeys
- Create runbooks for all alerts
- Build unified dashboards
Conclusion
Observable microservices require:
- Three pillars - Logs, metrics, traces working together
- Correlation - Link data across services
- Service mesh - Automate collection
- SLOs - Define and measure reliability
- Unified tooling - Correlate during incidents