Skip to main content
โšก Calmops

Observability for Microservices: Building Observable Distributed Systems

Observability for Microservices: Building Observable Distributed Systems

TL;DR: This guide covers building observable microservices. Learn the three pillars of observability, distributed tracing, metrics correlation, and building services that provide visibility.


Introduction

Observability in microservices requires:

  1. Logs - Discrete events with timestamps
  2. Metrics - Aggregatable measurements
  3. Traces - Request flow through system

Together, they provide complete visibility.


The Three Pillars

1. Structured Logging

// Correlated logging
// ProcessOrder validates and charges an order, emitting logs that carry
// the order and user identifiers so every line can be correlated across
// services by those fields.
func ProcessOrder(ctx context.Context, order Order) error {
    // Derive a child logger from the context-scoped one so the
    // correlation fields appear on every subsequent log line.
    log := zerolog.Ctx(ctx).
        With().
        Str("order_id", order.ID).
        Str("user_id", order.UserID).
        Logger()

    log.Info().Msg("Processing order")

    err := validateOrder(order)
    if err != nil {
        log.Error().Err(err).Msg("Order validation failed")
        return err
    }

    err = chargePayment(ctx, order)
    if err != nil {
        log.Error().Err(err).Msg("Payment failed")
        return err
    }

    log.Info().Msg("Order processed successfully")
    return nil
}

2. Distributed Tracing

// Trace context propagation
// chargePayment charges the payment for order inside its own child span so
// the call shows up as a distinct step in the distributed trace. The span
// carries the order id and amount as attributes, and any charge failure is
// recorded on the span so error-based sampling policies can select it.
func chargePayment(ctx context.Context, order Order) error {
    ctx, span := tracer.Start(ctx, "chargePayment")
    defer span.End()

    span.SetAttributes(
        attribute.String("order_id", order.ID),
        attribute.Float64("amount", order.Amount),
    )

    // All downstream calls automatically carry trace context.
    err := paymentClient.Charge(ctx, order)
    if err != nil {
        // Attach the failure to the span; without this the trace looks
        // healthy even when the charge failed.
        span.RecordError(err)
    }
    return err
}

3. Metrics Collection

// Application metrics
// Application metrics for the order-processing path. These implement the
// RED-style signals (rate via the counter, errors via its "status" label,
// duration via the histogram).
//
// NOTE(review): NewCounterVec/NewHistogramVec only construct the collectors;
// they still must be registered (e.g. prometheus.MustRegister in init or via
// promauto) before they are scraped — registration is not shown here.
var (
    // ordersProcessed counts processed orders, partitioned by outcome
    // ("status") and the emitting service.
    ordersProcessed = prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "orders_processed_total",
            Help: "Total orders processed",
        },
        []string{"status", "service"},
    )
    
    // orderDuration tracks end-to-end order processing latency in seconds.
    // Buckets span 100ms to 10s — presumably tuned to the expected latency
    // envelope; confirm against real percentiles before relying on them.
    orderDuration = prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "order_processing_duration_seconds",
            Help:    "Order processing duration",
            Buckets: []float64{0.1, 0.5, 1, 2, 5, 10},
        },
        []string{"service"},
    )
)

Service Mesh Integration

Istio Telemetry

# Istio VirtualService with tracing
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: orders-service
spec:
  hosts:
    - orders
  http:
    - match:
        - headers:
            x-request-id:
              present: true
      route:
        - destination:
            host: orders
      headers:
        request:
          set:
            x-b3-traceid: "%[1]s"
            x-b3-spanid: "%[2]s"

OpenTelemetry Collector

# otel-collector.yaml
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
        
  prometheus:
    config:
      scrape_configs:
        - job_name: 'application'
          static_configs:
            - targets: ['localhost:8080']

processors:
  batch:
    timeout: 5s
    send_batch_size: 1000
    
  memory_limiter:
    check_interval: 1s
    limit_mib: 400
    
  tail_sampling:
    decision_wait: 10s
    policies:
      - name: errors-policy
        type: status_code
        status_code: {status_codes: [ERROR]}
      - name: slow-traces-policy
        type: latency
        latency: {threshold_ms: 1000}

exporters:
  otlp:
    endpoint: jaeger:4317
    
  prometheus:
    endpoint: 0.0.0.0:8889
    
  loki:
    endpoint: http://loki:3100/loki/api/v1/push

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, tail_sampling, batch]
      exporters: [otlp]
    metrics:
      receivers: [prometheus]
      processors: [memory_limiter, batch]
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [loki]

Correlation Across Services

Trace Context Propagation

// HTTP client with tracing
func callService(ctx context.Context, url string) (*http.Response, error) {
    req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
    
    // Inject trace context into headers
    p := propagation.TraceContext{}
    p.Inject(ctx, propagation.HeaderCarrier(req.Header))
    
    return http.DefaultClient.Do(req)
}

// HTTP server with tracing.
//
// handler extracts any incoming trace context from the request headers,
// opens a server-side span, and hands the request off for processing with
// the enriched context.
func handler(w http.ResponseWriter, r *http.Request) {
    // Pull the upstream trace context (if any) out of the incoming headers
    // so this span joins the caller's trace instead of starting a new one.
    carrier := propagation.HeaderCarrier(r.Header)
    ctx := propagation.TraceContext{}.Extract(r.Context(), carrier)
    
    ctx, span := tracer.Start(ctx, "handleRequest")
    defer span.End()
    
    processRequest(ctx, r)
}

Unified Dashboard Queries

# Correlate metrics with traces
# Find slow requests correlated with error rates
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
  and on(service)
rate(orders_processed_total{status="error"}[5m]) > 0

# Correlate logs with traces
{service="orders"} | json | trace_id!="" | level="error"

Service Level Objectives

SLO Definitions

apiVersion: monitoring.coreos.com/v1
kind: ServiceLevelObjective
metadata:
  name: orders-availability
  namespace: monitoring
spec:
  service: orders.svc.cluster.local
  objective:
    availability: 99.9%
    errorBudget: 0.1%
    alerting:
      name: orders-availability-alert
      expr: |
        1 - (
          sum(rate(orders_success_total[5m])) /
          sum(rate(orders_total[5m]))
        ) > 0.001

Error Budget Burn Rate

# Burn rate = observed error ratio / error budget (0.1% from the SLO above).
# A value of 1 means the budget is being consumed exactly on schedule.

# Burn rate over last hour (long window)
(sum(rate(errors_total[1h])) / sum(rate(requests_total[1h]))) / 0.001

# Burn rate over last 5 minutes (short window)
(sum(rate(errors_total[5m])) / sum(rate(requests_total[5m]))) / 0.001

Incident Correlation

Alert to Trace Workflow

def on_alert(alert):
    """Correlate a firing alert with traces and logs, then open an incident.

    Looks back 15 minutes from the moment the alert fired, pulls matching
    error traces from Jaeger and error logs from Loki, and attaches links
    to all of them so responders start with full context.

    Returns the created incident.
    """
    from datetime import timedelta

    lookback = timedelta(minutes=15)

    # 1. Get service from alert labels
    service = alert.labels['service']

    # 2. Query recent errors in Jaeger
    errors = jaeger.query(
        service=service,
        tags={'error': True},
        start_time=alert.fired_at - lookback,
        end_time=alert.fired_at
    )

    # 3. Get related logs
    logs = loki.query(
        f'{{service="{service}"}} |= "error"',
        start=alert.fired_at - lookback,
        end=alert.fired_at
    )

    # 4. Create incident with all context. The Jaeger link is optional:
    # trace ingestion can lag alerting, so an empty result must not crash
    # the alert handler.
    links = {
        'logs': logs.link,
        'dashboard': f"https://grafana.com/d/{service}"
    }
    if errors:
        links['jaeger'] = jaeger.get_trace_url(errors[0].trace_id)

    return create_incident(
        title=f"High error rate in {service}",
        severity=alert.severity,
        links=links
    )

Building Observable Services

Best Practices Checklist

  • Use structured logging (JSON)
  • Include correlation IDs in all logs
  • Add trace context to all requests
  • Expose application metrics (RED metrics)
  • Export resource metrics (USE metrics)
  • Define SLOs for critical user journeys
  • Create runbooks for all alerts
  • Build unified dashboards

Conclusion

Observable microservices require:

  1. Three pillars - Logs, metrics, traces working together
  2. Correlation - Link data across services
  3. Service mesh - Automate collection
  4. SLOs - Define and measure reliability
  5. Unified tooling - Correlate during incidents

External Resources


Comments