Skip to main content
⚡ Calmops

Modern Observability: Tracing, Metrics, and Logs

Introduction

Modern applications require modern observability. With microservices, Kubernetes, and distributed systems, traditional monitoring isn’t enough. This guide covers the three pillars of observability and how to implement them.


The Three Pillars

Metrics, Logs, and Traces

| Pillar | What It Shows | Tools |
|--------|---------------|-------|
| Metrics | Quantitative measurements over time | Prometheus, InfluxDB |
| Logs | Discrete events with timestamps | ELK, Loki |
| Traces | Request flow through the system | Jaeger, Zipkin |

Together, they provide complete visibility into system behavior.


Metrics with Prometheus

Setup

# prometheus-config.yaml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often recording/alerting rules are evaluated

scrape_configs:
  # Scrape the Kubernetes API server via the endpoints of the
  # default/kubernetes service. Assumes Prometheus runs inside the cluster
  # under a service account that is allowed to read /metrics.
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    # The API server only speaks authenticated HTTPS. Without these settings
    # Prometheus would attempt plain, unauthenticated HTTP and every scrape
    # would fail.
    scheme: https
    tls_config:
      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    authorization:
      credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    relabel_configs:
      - source_labels: [__meta_kubernetes_namespace]
        regex: default
        action: keep
      # Keep only the API server's own service so the service-account token
      # is not sent to unrelated services that expose a port named "https".
      - source_labels: [__meta_kubernetes_service_name]
        regex: kubernetes
        action: keep
      - source_labels: [__meta_kubernetes_endpoint_port_name]
        regex: https
        action: keep

  # Statically configured application target.
  - job_name: 'myapp'
    static_configs:
      - targets: ['myapp:8080']
    metrics_path: '/metrics'

Custom Metrics

package main

import (
    "net/http"
    "strconv"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

// requestsTotal is the "http_requests_total" counter, partitioned by the
// method, endpoint and status labels.
var requestsTotal = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Name: "http_requests_total",
        Help: "Total HTTP requests",
    },
    []string{"method", "endpoint", "status"},
)

// requestDuration is the "http_request_duration_seconds" histogram,
// partitioned by the method and endpoint labels and using the client
// library's default bucket layout.
var requestDuration = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{
        Name:    "http_request_duration_seconds",
        Help:    "HTTP request latency",
        Buckets: prometheus.DefBuckets,
    },
    []string{"method", "endpoint"},
)

// init registers both collectors with the default Prometheus registry before
// main runs. MustRegister panics if either registration fails.
func init() {
    prometheus.MustRegister(requestsTotal, requestDuration)
}

// statusRecorder wraps an http.ResponseWriter and remembers the final status
// code sent by the wrapped handler so it can be used as a metric label.
type statusRecorder struct {
    http.ResponseWriter
    status      int
    wroteHeader bool
}

// WriteHeader records the first final (non-1xx) status code and forwards the
// call to the wrapped writer.
func (sr *statusRecorder) WriteHeader(code int) {
    // Informational 1xx responses may be followed by the real status, and
    // net/http ignores any WriteHeader after the final one.
    if code >= 200 && !sr.wroteHeader {
        sr.status = code
        sr.wroteHeader = true
    }
    sr.ResponseWriter.WriteHeader(code)
}

// Write forwards to the wrapped writer. A Write without a prior WriteHeader
// implicitly sends 200, so the recorded status is frozen at that point.
func (sr *statusRecorder) Write(b []byte) (int, error) {
    sr.wroteHeader = true
    return sr.ResponseWriter.Write(b)
}

// Unwrap exposes the wrapped writer so http.ResponseController can still
// reach optional interfaces such as http.Flusher.
func (sr *statusRecorder) Unwrap() http.ResponseWriter {
    return sr.ResponseWriter
}

// middleware wraps next so that every request is observed in the
// requestDuration histogram and counted in requestsTotal, labelled with the
// request method, URL path and the status code the handler actually sent.
//
// NOTE: the raw URL path is used as the "endpoint" label. Paths that embed
// identifiers create one time series per distinct path; prefer a route
// template where one is available.
func middleware(next http.Handler) http.Handler {
    return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
        timer := prometheus.NewTimer(requestDuration.WithLabelValues(r.Method, r.URL.Path))
        defer timer.ObserveDuration()

        // Default to 200: net/http sends 200 when the handler never calls
        // WriteHeader explicitly.
        rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}
        next.ServeHTTP(rec, r)

        requestsTotal.WithLabelValues(r.Method, r.URL.Path, strconv.Itoa(rec.status)).Inc()
    })
}

// main exposes the registered metrics on /metrics and serves HTTP on :8080.
//
// Only the metrics endpoint is registered here; application handlers must be
// wrapped with middleware for the request metrics to be recorded.
func main() {
    http.Handle("/metrics", promhttp.Handler())

    // ListenAndServe only returns on failure (for example when the port is
    // already in use). Surface that instead of exiting silently with status 0.
    if err := http.ListenAndServe(":8080", nil); err != nil {
        panic(err)
    }
}

Distributed Tracing

OpenTelemetry

package main

import (
    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/propagation"
    "go.opentelemetry.io/otel/sdk/resource"
    "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.21.0"
)

// initTracer builds a TracerProvider that batches spans to a Jaeger agent on
// host "jaeger", tags them with the service name "my-service", and installs
// the provider together with the Trace Context and Baggage propagators as the
// OpenTelemetry globals.
//
// It returns the provider so the caller can shut it down on exit; the error
// is non-nil if either the resource or the exporter cannot be created.
//
// NOTE: this function calls context.Background, so the enclosing file must
// also import the standard "context" package.
// NOTE(review): the go.opentelemetry.io/otel/exporters/jaeger module appears
// to be deprecated upstream in favour of OTLP exporters; confirm before
// adopting this in new code.
func initTracer() (*trace.TracerProvider, error) {
    // Build the resource first: it holds no external handles, so a failure
    // here cannot leave an already-created exporter behind unclosed.
    res, err := resource.New(context.Background(),
        resource.WithAttributes(
            semconv.ServiceName("my-service"),
        ),
    )
    if err != nil {
        return nil, err
    }

    exporter, err := jaeger.New(
        jaeger.WithAgentEndpoint(jaeger.WithAgentHost("jaeger")),
    )
    if err != nil {
        return nil, err
    }

    tp := trace.NewTracerProvider(
        trace.WithBatcher(exporter),
        trace.WithResource(res),
    )

    // Register globals so otel.Tracer and otel.GetTextMapPropagator pick up
    // this configuration everywhere in the process.
    otel.SetTracerProvider(tp)
    otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(
        propagation.TraceContext{},
        propagation.Baggage{},
    ))

    return tp, nil
}

Instrumentation

// handleRequest runs processData inside a span named after operation, taken
// from the "my-service" tracer.
//
// The span is annotated with the HTTP method and route before the work
// starts. On failure the error is recorded on the span together with status
// code 500 and returned with an empty result; on success status code 200 is
// recorded and the result is returned.
func handleRequest(ctx context.Context, operation string) (string, error) {
    ctx, span := otel.Tracer("my-service").Start(ctx, operation)
    defer span.End()

    span.SetAttributes(semconv.HTTPMethod("GET"), semconv.HTTPRoute("/api/users"))

    // Business logic runs with the span-carrying context so that any child
    // spans are attached to this one.
    output, procErr := processData(ctx)
    if procErr != nil {
        span.RecordError(procErr)
        span.SetAttributes(semconv.HTTPStatusCode(500))
        return "", procErr
    }

    span.SetAttributes(semconv.HTTPStatusCode(200))
    return output, nil
}

Structured Logging

JSON Logging

package main

import (
    "encoding/json"
    "os"
    "time"
)

// LogEntry is the JSON document emitted for every log line.
type LogEntry struct {
    Timestamp string                 `json:"timestamp"`
    Level     string                 `json:"level"`
    Message   string                 `json:"message"`
    Service   string                 `json:"service"`
    Fields    map[string]interface{} `json:"fields,omitempty"`
}

// logJSON writes a single JSON-encoded LogEntry, terminated by a newline, to
// standard output.
//
// level and msg are copied verbatim into the entry, the timestamp is the
// current time in RFC 3339 format, and fields is attached as structured
// context (omitted from the output when empty).
//
// If fields contains a value that encoding/json cannot represent (such as a
// channel or a function), the entry is still emitted: the fields are replaced
// by a single "log_error" field describing the encoding failure, so the
// message itself is never silently lost.
func logJSON(level, msg string, fields map[string]interface{}) {
    entry := LogEntry{
        Timestamp: time.Now().Format(time.RFC3339),
        Level:     level,
        Message:   msg,
        Service:   "my-service",
        Fields:    fields,
    }

    jsonBytes, err := json.Marshal(entry)
    if err != nil {
        // Only the caller-supplied fields can make marshalling fail; drop
        // them and record why, rather than writing an empty line.
        entry.Fields = map[string]interface{}{"log_error": err.Error()}
        jsonBytes, err = json.Marshal(entry)
        if err != nil {
            return
        }
    }

    // Logging is best-effort: there is nowhere useful to report a failed
    // write to stdout, so the result is deliberately ignored.
    _, _ = os.Stdout.Write(append(jsonBytes, '\n'))
}

// Usage: emit one informational entry and one error entry, each carrying its
// own structured fields.
func main() {
    requestFields := map[string]interface{}{
        "request_id":  "abc123",
        "duration_ms": 45,
        "user_id":     "user456",
    }
    logJSON("info", "Request processed", requestFields)

    failureFields := map[string]interface{}{
        "error": "connection refused",
        "host":  "db.example.com",
    }
    logJSON("error", "Database connection failed", failureFields)
}

Integration

The Observability Stack

# docker-compose.yaml
# Local observability stack: Prometheus (metrics), Grafana (dashboards),
# Jaeger (traces), Loki + Promtail (logs).
# NOTE: every image uses the floating "latest" tag; pin explicit versions for
# anything beyond local experimentation.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:latest
    environment:
      # Demo credential only; change it before exposing Grafana to a network.
      - GF_SECURITY_ADMIN_PASSWORD=admin
    ports:
      - "3000:3000"
    volumes:
      - ./dashboards:/etc/grafana/provisioning/dashboards

  jaeger:
    image: jaegertracing/all-in-one:latest
    ports:
      - "16686:16686"   # web UI
      - "6831:6831/udp" # agent port that receives spans

  loki:
    image: grafana/loki:latest
    ports:
      - "3100:3100"

  promtail:
    image: grafana/promtail:latest
    volumes:
      - ./logs:/var/log
      - ./promtail.yml:/etc/promtail/promtail.yml
    # The image's default command reads /etc/promtail/config.yml, so the file
    # mounted above would be ignored unless it is referenced explicitly.
    command: -config.file=/etc/promtail/promtail.yml

Best Practices

1. Use Standard Conventions

// Follow OpenTelemetry semantic conventions: build attributes with the
// semconv helpers instead of hand-written key strings, so attribute names
// stay consistent across services.
span.SetAttributes(
    semconv.HTTPMethod("GET"),
    semconv.HTTPRoute("/api/users"),
    semconv.HTTPTarget("/api/users?id=123"),
    semconv.HTTPStatusCode(200),
    // NOTE(review): unlike its neighbours this is passed as a value, not
    // called as a function; confirm that the imported semconv version
    // actually exports NetTransportIP as a ready-made attribute.
    semconv.NetTransportIP,
    semconv.NetHostName("service.example.com"),
)

2. Keep Context

// Propagate context across boundaries
func callDownstreamService(ctx context.Context, url string) (*Response, error) {
    req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
    
    // Inject trace context into headers
    otel.GetTextMapPropagator().Inject(ctx, propagation.HeaderCarrier(req.Header))
    
    return http.DefaultClient.Do(req)
}

3. Alert on Symptoms, Not Causes

# Good alerts
- alert: HighErrorRate
  # Share of 5xx responses among all responses over the last 5 minutes.
  # Comparing the raw 5xx rate with 0.1 would test "more than 0.1 errors per
  # second", not the 10% that the summary describes.
  expr: |
    sum(rate(http_requests_total{status=~"5.."}[5m]))
      /
    sum(rate(http_requests_total[5m])) > 0.1
  annotations:
    summary: "Error rate is above 10%"

# Fires when the 95th-percentile request latency, estimated from the
# histogram buckets over the last 5 minutes, exceeds 1 second. The expression
# is not aggregated, so the quantile is evaluated separately for each label
# combination of the histogram.
- alert: HighLatency
  expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
  annotations:
    summary: "P95 latency above 1 second"

External Resources

Tools

Learning


Key Takeaways

  • Three pillars: metrics, logs, and traces work together
  • OpenTelemetry provides vendor-neutral instrumentation
  • Prometheus handles metrics collection
  • Structured logging enables searching
  • Distributed tracing tracks requests across services
  • Alert on symptoms, not causes
  • Keep context propagated through the system

Comments