Introduction
Custom metrics provide deep insights into application behavior beyond standard infrastructure metrics. This article covers OpenTelemetry instrumentation patterns, metric types, and implementation best practices.
Key Statistics:
- Custom metrics: 3-5x more actionable than infrastructure metrics
- Proper instrumentation reduces MTTR by 60%
- OpenTelemetry: 500+ supported integrations
Metric Types
OpenTelemetry Metric Types

Counter (monotonic)
├── Always increases (requests, errors)
├── Use for: counts, totals, cumulative values
└── Example: total_requests, total_errors

Gauge (point-in-time)
├── Can increase or decrease (memory, CPU)
├── Use for: current values, snapshots
└── Example: memory_usage, active_connections

Histogram (distribution)
├── Buckets for percentiles
├── Use for: latency, sizes, durations
└── Example: request_duration, response_size

UpDownCounter (bidirectional)
├── A counter that can also decrease
├── Use for: queue depth, concurrent requests
└── Example: queue_size, active_workers

Observable (callback)
├── Values provided by a callback function
├── Use for: system metrics, derived values
└── Example: disk_usage, cpu_temp
OpenTelemetry Python SDK
Basic Instrumentation
#!/usr/bin/env python3
"""OpenTelemetry Python instrumentation."""
import time

from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

resource = Resource.create({SERVICE_NAME: "my-service"})

# Setup tracing
tracer_provider = TracerProvider(resource=resource)
tracer_provider.add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="localhost:4317", insecure=True))
)
trace.set_tracer_provider(tracer_provider)

# Setup metrics
metric_reader = PeriodicExportingMetricReader(
    OTLPMetricExporter(endpoint="localhost:4317", insecure=True)
)
metrics.set_meter_provider(
    MeterProvider(resource=resource, metric_readers=[metric_reader])
)

# Get tracer and meter
tracer = trace.get_tracer(__name__)
meter = metrics.get_meter(__name__)

# ============== Custom Metrics ==============

# Counter: monotonically increasing
request_counter = meter.create_counter(
    name="http.requests.total",
    description="Total number of HTTP requests",
    unit="1",
)

# Gauge: current value (synchronous gauge, available in recent SDK versions)
active_connections = meter.create_gauge(
    name="http.connections.active",
    description="Number of active HTTP connections",
    unit="1",
)

# Histogram: distribution
request_duration = meter.create_histogram(
    name="http.request.duration",
    description="HTTP request duration in seconds",
    unit="s",
)

# UpDownCounter: bidirectional
queue_size = meter.create_up_down_counter(
    name="queue.size",
    description="Current queue size",
    unit="1",
)

# Example instrumentation in HTTP handler
def handle_request(request):
    # Count the request
    request_counter.add(1, {"method": request.method, "path": request.path})

    # Trace the request and record its duration
    with tracer.start_as_current_span("handle_request") as span:
        span.set_attribute("http.method", request.method)
        span.set_attribute("http.url", request.path)
        start_time = time.perf_counter()
        try:
            result = process_request(request)
            span.set_attribute("http.status_code", 200)
            return result
        except Exception as e:
            span.set_attribute("http.status_code", 500)
            span.record_exception(e)
            raise
        finally:
            duration = time.perf_counter() - start_time
            request_duration.record(duration, {"method": request.method})
Advanced Custom Metrics
#!/usr/bin/env python3
"""Advanced OpenTelemetry metrics patterns."""
import time


class BusinessMetrics:
    """Business-level custom metrics."""

    def __init__(self, meter):
        self.meter = meter

        # Revenue tracking
        self.revenue = meter.create_counter(
            name="business.revenue.total",
            description="Total revenue in USD",
            unit="USD",
        )

        # User metrics
        self.active_users = meter.create_up_down_counter(
            name="business.users.active",
            description="Number of active users",
            unit="1",
        )

        # Order metrics
        self.order_value = meter.create_histogram(
            name="business.order.value",
            description="Order value in USD",
            unit="USD",
        )

        # Conversion funnel
        self.funnel_steps = meter.create_counter(
            name="business.funnel.step",
            description="Funnel step completions",
            unit="1",
        )

    def record_revenue(self, amount: float, currency: str, product: str):
        """Record revenue event."""
        self.revenue.add(amount, {"currency": currency, "product": product})

    def track_funnel(self, step: str):
        """Track funnel progression.

        Note: do not attach user_id as an attribute -- per-user labels
        explode cardinality (see Cardinality Management below).
        """
        self.funnel_steps.add(1, {"step": step})


class PerformanceMetrics:
    """Performance-focused custom metrics."""

    def __init__(self, meter):
        self.meter = meter

        # Latency percentiles. Explicit bucket boundaries are configured
        # via a View on the MeterProvider, not on the instrument itself,
        # e.g. boundaries [0, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
        self.latency = meter.create_histogram(
            name="app.latency",
            description="Operation latency in milliseconds",
            unit="ms",
        )

        # Throughput
        self.throughput = meter.create_counter(
            name="app.throughput",
            description="Operations per second",
            unit="ops",
        )

        # Error rate
        self.errors = meter.create_counter(
            name="app.errors.total",
            description="Total number of errors",
            unit="1",
        )

        # Size metrics
        self.payload_size = meter.create_histogram(
            name="app.payload.size",
            description="Request/response payload size",
            unit="By",
        )

    def record_latency(self, operation: str, duration_ms: float, success: bool):
        """Record operation latency."""
        self.latency.record(
            duration_ms,
            {"operation": operation, "success": str(success)},
        )
        if not success:
            self.errors.add(1, {"operation": operation})


class CustomMetricsMiddleware:
    """ASGI middleware for automatic instrumentation."""

    def __init__(self, app, meter):
        self.app = app
        self.request_counter = meter.create_counter(
            name="http.server.requests.total",
            description="Total HTTP requests",
            unit="1",
        )
        self.request_duration = meter.create_histogram(
            name="http.server.request.duration",
            description="HTTP request duration",
            unit="ms",
        )

    async def __call__(self, scope, receive, send):
        """Process HTTP request with instrumentation."""
        start_time = time.perf_counter()

        # Extract request info (ASGI headers are a list of byte pairs,
        # not a dict)
        method = scope.get("method", "GET")
        path = scope.get("path", "/")
        headers = dict(scope.get("headers", []))
        host = headers.get(b"host", b"").decode("latin-1")

        # Count the request
        self.request_counter.add(
            1,
            {"method": method, "path": path, "host": host},
        )

        # Process request
        await self.app(scope, receive, send)

        # Record duration
        duration_ms = (time.perf_counter() - start_time) * 1000
        self.request_duration.record(
            duration_ms,
            {"method": method, "path": path},
        )
OpenTelemetry Collector
# OpenTelemetry Collector configuration for custom metrics
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
  prometheus:
    config:
      scrape_configs:
        - job_name: 'node'
          static_configs:
            - targets: ['localhost:9090']
        - job_name: 'custom-metrics'
          static_configs:
            - targets: ['localhost:8080']

processors:
  batch:
    timeout: 10s
    send_batch_size: 1000

  # Custom metrics transformations
  metricstransform:
    transforms:
      - include: "http.requests.total"
        action: update
        operations:
          - action: add_label
            new_label: "environment"
            value: "production"
          - action: update_label
            label: "method"
            new_label: "http.method"
      - include: "app.latency"
        action: insert
        new_name: "app.latency.histogram"

  # Filter unwanted metrics
  filter:
    metrics:
      include:
        match_type: regexp
        metric_names:
          - 'http\..*'
          - 'app\..*'
          - 'business\..*'
      exclude:
        match_type: regexp
        metric_names:
          - 'internal\..*'

exporters:
  otlp:
    endpoint: "https://otel-backend.example.com:4317"
    tls:
      insecure: false
  prometheus:
    endpoint: "0.0.0.0:8889"
    namespace: "custom"
  loki:
    endpoint: "https://loki.example.com/loki/api/v1/push"

service:
  pipelines:
    metrics:
      receivers: [otlp, prometheus]
      processors: [batch, metricstransform, filter]
      exporters: [otlp, prometheus]
    logs:
      receivers: [otlp]
      processors: [batch]
      exporters: [loki]
JavaScript/Node.js Instrumentation
// OpenTelemetry JavaScript instrumentation
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
const { OTLPMetricExporter } = require('@opentelemetry/exporter-metrics-otlp-grpc');
const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics');
const { Resource } = require('@opentelemetry/resources');
const { ATTR_SERVICE_NAME } = require('@opentelemetry/semantic-conventions');

// Configure SDK
const sdk = new NodeSDK({
  resource: new Resource({
    [ATTR_SERVICE_NAME]: 'my-node-service',
    'deployment.environment': 'production',
  }),
  traceExporter: new OTLPTraceExporter(),
  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter(),
    exportIntervalMillis: 10000,
  }),
  instrumentations: [
    getNodeAutoInstrumentations(),
  ],
});
sdk.start();

// ============== Custom Metrics ==============
const { metrics } = require('@opentelemetry/api');
const meter = metrics.getMeter('my-service');

// Counter
const requestCounter = meter.createCounter('http.requests.total', {
  description: 'Total HTTP requests',
});

// Gauge
const activeConnections = meter.createGauge('http.connections.active', {
  description: 'Active connections',
});

// Histogram with explicit bucket boundaries (via the advice option)
const requestDuration = meter.createHistogram('http.request.duration', {
  description: 'Request duration in ms',
  unit: 'ms',
  advice: {
    explicitBucketBoundaries: [10, 50, 100, 250, 500, 1000, 2500, 5000],
  },
});

// UpDownCounter
const queueSize = meter.createUpDownCounter('queue.size', {
  description: 'Queue size',
});

// Example middleware
function metricsMiddleware(req, res, next) {
  const startTime = Date.now();

  res.on('finish', () => {
    // Count the request once the status code is known
    requestCounter.add(1, {
      method: req.method,
      path: req.route?.path || req.path,
      status_code: res.statusCode,
    });

    // Record duration
    const duration = Date.now() - startTime;
    requestDuration.record(duration, {
      method: req.method,
      path: req.route?.path || req.path,
    });
  });

  next();
}

// Custom business metrics
const businessMeter = metrics.getMeter('business');
const revenueCounter = businessMeter.createCounter('revenue.total', {
  description: 'Total revenue',
  unit: 'USD',
});

function recordTransaction(amount, currency, product) {
  revenueCounter.add(amount, {
    currency,
    product,
  });
}
Prometheus Integration
# Prometheus scrape configuration for custom metrics
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Custom application metrics
  - job_name: 'my-application'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['localhost:8080']
        labels:
          service: 'my-app'
          environment: 'production'
    # Metric relabeling (note: OpenTelemetry dots become underscores
    # in Prometheus metric names)
    metric_relabel_configs:
      # Prefix application metrics
      - source_labels: [__name__]
        regex: 'http_(.*)'
        target_label: __name__
        replacement: 'app_http_${1}'
      # Drop internal metrics
      - source_labels: [__name__]
        regex: 'internal_.*'
        action: drop

  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

# Recording rules (a top-level key, not part of scrape_configs)
rule_files:
  - '/etc/prometheus/rules/*.yml'
# Prometheus recording rules for custom metrics
groups:
  - name: application.custom
    interval: 30s
    rules:
      # HTTP request rate
      - record: http:requests:rate5m
        expr: rate(http_requests_total[5m])

      # Error rate
      - record: http:errors:rate5m
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))

      # Latency histogram quantiles (quantiles need rated buckets
      # aggregated by le)
      - record: http:latency:p95
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
      - record: http:latency:p99
        expr: histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))

      # Business metrics
      - record: business:revenue:total
        expr: sum(business_revenue_total)
      - record: business:orders:rate1h
        expr: sum(rate(business_orders_total[1h]))
Grafana Dashboard
{
  "dashboard": {
    "title": "Custom Metrics Overview",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{path}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            "unit": "percent"
          }
        ]
      },
      {
        "title": "Latency p95",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          }
        ]
      },
      {
        "title": "Active Users",
        "type": "gauge",
        "targets": [
          {
            "expr": "business_users_active"
          }
        ]
      },
      {
        "title": "Revenue",
        "type": "graph",
        "targets": [
          {
            "expr": "increase(business_revenue_total[1h])",
            "legendFormat": "{{product}}"
          }
        ]
      }
    ]
  }
}
Best Practices
Metric Naming
Metric Naming Conventions

Structure: <domain>.<category>.<name>

Examples:
├── http.request.duration (good)
├── request_duration_ms (bad: missing domain)
└── requests (bad: too generic)

Labels (dimensions):
├── method: GET, POST, PUT, DELETE
├── status_code: 200, 400, 500
├── path: /api/users, /api/orders
└── environment: prod, staging, dev

Units (UCUM notation):
├── Duration: seconds (s), milliseconds (ms)
├── Bytes: By, kilobytes (kBy), megabytes (MBy)
├── Counts: 1 (dimensionless)
└── Currency: USD, EUR
Cardinality Management
#!/usr/bin/env python3
"""Avoid high cardinality in metrics."""

# BAD: high cardinality -- every user becomes a label value
user_counter.add(1, {"user_id": user.id})  # Millions of unique series!

# GOOD: aggregate over low-cardinality dimensions instead
user_counter.add(1, {"country": user.country, "plan": user.plan})

# GOOD: use histograms for distributions
request_duration.record(duration, {"endpoint": "/api/users"})

# BAD: timestamp as a label -- every sample starts a new series
counter.add(1, {"timestamp": "2026-02-18T10:00:00Z"})

# GOOD: query over time windows instead (PromQL, not Python):
#   rate(counter[5m])
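Raw URL paths are another frequent cardinality trap: `/api/users/12345` and `/api/users/67890` should map to one series, not two. One common approach is to normalize paths to route templates before using them as attributes; a minimal sketch (the patterns and templates are illustrative):

```python
import re

# Map raw request paths to a bounded set of route templates so the
# `path` attribute has a fixed number of possible values.
_ROUTE_PATTERNS = [
    (re.compile(r"^/api/users/\d+$"), "/api/users/{id}"),
    (re.compile(r"^/api/orders/\d+$"), "/api/orders/{id}"),
]

_KNOWN_PATHS = {"/api/users", "/api/orders", "/health"}


def normalize_path(path: str) -> str:
    """Return a route template for a raw path, or a bounded fallback."""
    for pattern, template in _ROUTE_PATTERNS:
        if pattern.match(path):
            return template
    # Fall back to a catch-all rather than the raw path, so unexpected
    # URLs (scanners, typos) cannot create unbounded series
    return path if path in _KNOWN_PATHS else "other"


# Usage at record time:
# request_counter.add(1, {"path": normalize_path("/api/users/12345")})
```

Web frameworks that expose the matched route (such as `req.route.path` in the Express middleware above) make this normalization unnecessary; the regex fallback is for code paths where only the raw URL is available.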
External Resources
- OpenTelemetry Metrics Documentation
- Prometheus Metric Types
- OpenTelemetry Python SDK
- Grafana Dashboards
Related Articles
- Metrics Collection: Prometheus, InfluxDB, Telegraf
- SLO Implementation: Error Budgets
- Distributed Tracing: Jaeger, Zipkin
- Observability Cost Optimization