Monitoring: Prometheus and Grafana for Python Applications

Effective monitoring is crucial for maintaining reliable production systems. Prometheus and Grafana provide powerful tools for collecting metrics, visualizing data, and alerting on issues. This guide covers practical patterns for monitoring Python applications.

Prometheus Basics

Instrumenting with prometheus-client

from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time

# Define metrics
# Requests handled, partitioned by HTTP method, endpoint and response status.
request_count = Counter(
    name='http_requests_total',
    documentation='Total HTTP requests',
    labelnames=['method', 'endpoint', 'status'],
)

# Distribution of request latency in seconds, partitioned by method and endpoint.
request_duration = Histogram(
    name='http_request_duration_seconds',
    documentation='HTTP request duration',
    labelnames=['method', 'endpoint'],
)

# Requests currently in flight; incremented and decremented by the Flask hooks.
active_connections = Gauge(
    name='active_connections',
    documentation='Number of active connections',
)

# Usage in Flask
from flask import Flask, request

app = Flask(__name__)

@app.before_request
def before_request():
    """Mark the request as in flight and remember when it started."""
    active_connections.inc()
    # Stored on the request object so the after_request hook can compute latency.
    request.start_time = time.time()

@app.after_request
def after_request(response):
    """Record request count and latency, then release the in-flight slot.

    The ``endpoint`` label is the matched route template (for example
    ``/users/<id>``) rather than the raw URL path, so that every distinct
    URL does not create its own time series. Requests that matched no
    route are grouped under ``'unmatched'``.

    Args:
        response: The response Flask is about to send.

    Returns:
        The same response object, unmodified.
    """
    rule = request.url_rule
    endpoint = rule.rule if rule is not None else 'unmatched'
    method = request.method

    request_count.labels(
        method=method,
        endpoint=endpoint,
        status=response.status_code,
    ).inc()

    # start_time is only present when before_request() ran for this request.
    # Without it there is no latency to report and no gauge increment to undo,
    # so skip both instead of raising AttributeError or driving the gauge
    # negative.
    started = getattr(request, 'start_time', None)
    if started is not None:
        request_duration.labels(
            method=method,
            endpoint=endpoint,
        ).observe(time.time() - started)
        active_connections.dec()

    return response

@app.route('/metrics')
def metrics():
    """Serve the current metrics in the Prometheus text exposition format.

    Returns:
        A Flask response whose body is ``generate_latest()`` and whose
        Content-Type is the one prometheus_client publishes for that
        format, instead of Flask's default ``text/html``.
    """
    from flask import Response
    from prometheus_client import CONTENT_TYPE_LATEST, generate_latest

    # content_type (not mimetype) is used because the constant already
    # carries its own version/charset parameters.
    return Response(generate_latest(), content_type=CONTENT_TYPE_LATEST)

# Start the metrics server on port 8000, then the Flask application on port 5000
if __name__ == '__main__':
    metrics_port, app_port = 8000, 5000
    start_http_server(metrics_port)
    app.run(port=app_port)

Custom Metrics

from prometheus_client import Counter, Gauge, Histogram, Summary

class ApplicationMetrics:
    """Item-processing metrics: outcome counter, queue gauge, timing and size."""

    def __init__(self):
        # Counter: only ever increases; one series per `status` label value.
        self.processed_items = Counter(
            name='processed_items_total',
            documentation='Total items processed',
            labelnames=['status'],
        )

        # Gauge: a value that may rise or fall.
        self.queue_size = Gauge(
            name='queue_size',
            documentation='Current queue size',
        )

        # Histogram: observations are sorted into the explicit buckets below.
        self.processing_time = Histogram(
            name='processing_time_seconds',
            documentation='Time to process item',
            buckets=(0.1, 0.5, 1.0, 2.0, 5.0),
        )

        # Summary: also tracks observations, here of response sizes.
        self.response_size = Summary(
            name='response_size_bytes',
            documentation='Response size in bytes',
        )

    def record_success(self, duration, size):
        """Count one successful item and observe its duration and size."""
        succeeded = self.processed_items.labels(status='success')
        succeeded.inc()
        self.processing_time.observe(duration)
        self.response_size.observe(size)

    def record_failure(self):
        """Count one failed item."""
        failed = self.processed_items.labels(status='failure')
        failed.inc()

    def update_queue_size(self, size):
        """Set the queue-size gauge to `size`."""
        self.queue_size.set(size)

# Usage
metrics = ApplicationMetrics()
# One successful item: 0.5 seconds of processing, 1024-byte response.
metrics.record_success(0.5, 1024)
# Report ten items currently waiting in the queue.
metrics.update_queue_size(10)

Prometheus Configuration

prometheus.yml Configuration

# prometheus.yml
# Global defaults, two scrape jobs, the Alertmanager to notify and the rule
# file to load.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # The instrumented Python application. Port 8000 is the port handed to
  # start_http_server() in the instrumentation example. This job sets its own
  # 5s interval and timeout instead of the 15s global interval.
  - job_name: 'python-app'
    static_configs:
      - targets: ['localhost:8000']
    scrape_interval: 5s
    scrape_timeout: 5s
  
  # A second target on port 9100; no per-job interval is set here.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']

# Alertmanager instance that receives alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

# Alerting rules are kept in a separate file.
rule_files:
  - 'alert_rules.yml'

Alert Rules

# alert_rules.yml
# Every rule sets a `severity` label so that Alertmanager routes which match
# on severity can pick the alert up.
groups:
  - name: python_app
    interval: 30s
    rules:
      # Fraction of processed items that failed over the last 5 minutes.
      # Dividing failures by all items yields a ratio in [0, 1], which is what
      # the 0.05 threshold and humanizePercentage both expect.
      - alert: HighErrorRate
        expr: |
          sum(rate(processed_items_total{status="failure"}[5m]))
            /
          sum(rate(processed_items_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighQueueSize
        expr: queue_size > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Queue size is high"
          description: "Queue size is {{ $value }}"

      # histogram_quantile needs the per-bucket rates (the `_bucket` series,
      # aggregated by `le`), not the bare metric name.
      # The histogram's largest finite bucket is 5.0, and a quantile that
      # falls into the +Inf bucket is reported as that bound, so the result
      # never exceeds 5: compare with >= rather than >.
      - alert: SlowProcessing
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(processing_time_seconds_bucket[5m]))
          ) >= 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Processing is slow"
          description: "P95 processing time is {{ $value }}s"

Grafana Dashboards

Creating Dashboards Programmatically

from grafana_api.grafana_face import GrafanaFace

class GrafanaDashboardBuilder:
    """Build Grafana dashboards programmatically through the Grafana HTTP API."""

    def __init__(self, url, api_key):
        """Create the API client.

        Args:
            url: Base URL of the Grafana server, e.g. ``http://localhost:3000``
                or ``https://example.com/grafana``. A bare ``host:port`` is
                accepted and treated as ``http``.
            api_key: Grafana API key used for authentication.
        """
        # GrafanaFace assembles the URL itself from protocol/host/port, so a
        # full URL must be split up rather than passed as `host`.
        self.grafana = GrafanaFace(
            auth=api_key,
            **self._connection_params(url)
        )

    @staticmethod
    def _connection_params(url):
        """Split a base URL into GrafanaFace's host/port/protocol/prefix kwargs."""
        from urllib.parse import urlsplit

        # Without a scheme, urlsplit would read "localhost:3000" as
        # scheme="localhost"; a leading "//" makes it parse as a network location.
        parts = urlsplit(url if '://' in url else f'//{url}')
        prefix = parts.path.strip('/')
        return {
            'host': parts.hostname or 'localhost',
            'port': parts.port,
            'protocol': parts.scheme or 'http',
            'url_path_prefix': f'{prefix}/' if prefix else '',
        }

    def create_dashboard(self, title, panels):
        """Create or update a dashboard.

        Args:
            title: Dashboard title.
            panels: List of panel dicts, e.g. from the ``create_*_panel`` helpers.

        Returns:
            Whatever the Grafana client's ``update_dashboard`` call returns.
        """
        dashboard = {
            "dashboard": {
                "title": title,
                "panels": panels,
                "timezone": "browser",
                "refresh": "30s"
            }
        }
        return self.grafana.dashboard.update_dashboard(dashboard)

    def create_graph_panel(self, title, targets, position):
        """Return the definition of a graph panel.

        Args:
            title: Panel title.
            targets: List of query target dicts.
            position: ``gridPos`` dict with ``x``, ``y``, ``w`` and ``h``.
        """
        # NOTE(review): newer Grafana releases favour the "timeseries" panel
        # over "graph" - confirm against the Grafana version in use.
        return {
            "title": title,
            "type": "graph",
            "gridPos": position,
            "targets": targets,
            "yaxes": [
                {"format": "short", "label": "Value"}
            ]
        }

    def create_stat_panel(self, title, target, position):
        """Return the definition of a single-query stat panel.

        Args:
            title: Panel title.
            target: One query target dict; it is wrapped in a list.
            position: ``gridPos`` dict with ``x``, ``y``, ``w`` and ``h``.
        """
        return {
            "title": title,
            "type": "stat",
            "gridPos": position,
            "targets": [target],
            "options": {
                "colorMode": "value",
                "graphMode": "area"
            }
        }

# Usage
builder = GrafanaDashboardBuilder(url='http://localhost:3000', api_key='your-api-key')

# Both panels sit on the same row (y=0): one at x=0, the other at x=12.
request_rate_panel = builder.create_graph_panel(
    "Request Rate",
    [
        {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{ method }} {{ endpoint }}",
        },
    ],
    dict(x=0, y=0, w=12, h=8),
)
connections_panel = builder.create_stat_panel(
    "Active Connections",
    {"expr": "active_connections", "legendFormat": "Connections"},
    dict(x=12, y=0, w=12, h=8),
)

panels = [request_rate_panel, connections_panel]

# builder.create_dashboard("Python App Monitoring", panels)

Advanced Monitoring Patterns

Application Health Checks

from prometheus_client import Gauge
import requests

class HealthMonitor:
    """Run component health checks and publish the results as a gauge.

    Each check writes 1 (healthy) or 0 (unhealthy) to ``app_health_status``
    under its own ``component`` label value.
    """

    def __init__(self):
        self.health_status = Gauge(
            'app_health_status',
            'Application health status',
            ['component']
        )

    def _report(self, component, healthy):
        """Publish one component's status on the gauge and return `healthy`."""
        self.health_status.labels(component=component).set(1 if healthy else 0)
        return healthy

    def check_database(self):
        """Check database connectivity.

        Returns:
            True when the probe succeeds, False when it raises.
        """
        try:
            # Attempt database connection
            # db.session.execute('SELECT 1')
            healthy = True
        except Exception as e:
            print(f"Database check failed: {e}")
            healthy = False
        return self._report('database', healthy)

    def check_external_api(self, url):
        """Check external API availability.

        Args:
            url: URL to request with a 5 second timeout.

        Returns:
            True only when the request completes with HTTP status 200.
        """
        try:
            response = requests.get(url, timeout=5)
            healthy = response.status_code == 200
        except Exception as e:
            # A health probe must report "down", never crash its caller.
            print(f"API check failed: {e}")
            healthy = False
        return self._report('external_api', healthy)

    def check_all(self, api_url='https://api.example.com'):
        """Run every check and report whether all of them passed.

        Args:
            api_url: URL handed to ``check_external_api``.

        Returns:
            True when every component is healthy.
        """
        # Build the full dict first so every check runs (and updates its
        # gauge) even when an earlier one has already failed.
        checks = {
            'database': self.check_database(),
            'external_api': self.check_external_api(api_url)
        }
        return all(checks.values())

# Usage
# Creating the monitor creates the app_health_status gauge.
monitor = HealthMonitor()
# True only when both the database check and the external API check passed.
is_healthy = monitor.check_all()

Performance Monitoring

from prometheus_client import Histogram
import functools
import time

class PerformanceMonitor:
    """Record how long decorated functions take, in a labelled histogram."""

    def __init__(self):
        self.function_duration = Histogram(
            'function_duration_seconds',
            'Function execution time',
            ['function_name']
        )

    def monitor(self, func):
        """Decorator that observes the execution time of `func`.

        The elapsed time is recorded under ``function_name=func.__name__``
        whether the call returns or raises; exceptions propagate unchanged.

        Args:
            func: The callable to wrap.

        Returns:
            A wrapper with the same signature and metadata as `func`.
        """
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic, so a system clock adjustment during
            # the call cannot produce a negative or inflated duration.
            started = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                self.function_duration.labels(
                    function_name=func.__name__
                ).observe(time.perf_counter() - started)

        return wrapper

# Usage
monitor = PerformanceMonitor()


def slow_function():
    time.sleep(0.5)
    return "Done"


# Explicit form of the @monitor.monitor decorator: rebind the name to the
# timed wrapper, then call it once so one duration gets recorded.
slow_function = monitor.monitor(slow_function)
slow_function()

Business Metrics

from prometheus_client import Counter, Gauge

class BusinessMetrics:
    """Track business-relevant metrics: orders, revenue and active users."""

    def __init__(self):
        self.orders_total = Counter(
            'orders_total',
            'Total orders',
            ['status']
        )

        self.revenue_total = Counter(
            'revenue_total',
            'Total revenue',
            ['currency']
        )

        self.active_users = Gauge(
            'active_users',
            'Number of active users'
        )

    def record_order(self, status, amount, currency='USD'):
        """Count one order and add its amount to the revenue counter.

        Args:
            status: Order status, used as the ``status`` label.
            amount: Order value to add to revenue; must not be negative.
            currency: Currency code, used as the ``currency`` label.

        Raises:
            ValueError: If `amount` is negative. Neither counter is touched.
        """
        # Validate before touching any metric: a counter rejects negative
        # increments, and failing after the order was already counted would
        # leave orders and revenue out of step.
        if amount < 0:
            raise ValueError(
                f"amount must be non-negative, got {amount!r}"
            )
        self.orders_total.labels(status=status).inc()
        self.revenue_total.labels(currency=currency).inc(amount)

    def update_active_users(self, count):
        """Set the active-users gauge to `count`."""
        self.active_users.set(count)

# Usage
metrics = BusinessMetrics()
# One completed order worth 99.99 in the default currency (USD).
metrics.record_order('completed', 99.99)
# Report 150 users as currently active.
metrics.update_active_users(150)

Alerting

Alert Manager Configuration

# alertmanager.yml
# Routing tree plus the three receivers it refers to.
global:
  resolve_timeout: 5m

# Top-level route: anything not claimed by a child route goes to 'default'.
# The child routes key off a `severity` label, so alert rules must set one
# for these routes to match.
route:
  receiver: 'default'
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  routes:
    # severity=critical -> 'critical' receiver (Slack and PagerDuty).
    # NOTE(review): `continue: true` presumably lets evaluation carry on to
    # the sibling routes below after this match - confirm that is intended.
    - match:
        severity: critical
      receiver: 'critical'
      continue: true
    # severity=warning -> 'warning' receiver (Slack only).
    - match:
        severity: warning
      receiver: 'warning'

# Placeholders (YOUR_SLACK_WEBHOOK_URL, YOUR_PAGERDUTY_KEY) must be replaced
# with real credentials before use.
receivers:
  - name: 'default'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#alerts'
  
  - name: 'critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#critical-alerts'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
  
  - name: 'warning'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#warnings'

Programmatic Alerting

import requests
import json

class AlertManager:
    """Send alerts to Alertmanager over its HTTP API (v2)."""

    def __init__(self, alertmanager_url):
        """Remember the alerts endpoint.

        Args:
            alertmanager_url: Base URL of Alertmanager, e.g.
                ``http://localhost:9093``. A trailing slash is tolerated.
        """
        # The v1 API has been removed from current Alertmanager releases;
        # /api/v2/alerts accepts the same list-of-alerts payload.
        self.url = f"{alertmanager_url.rstrip('/')}/api/v2/alerts"

    def send_alert(self, labels, annotations, status='firing'):
        """Send one alert to Alertmanager (best effort).

        Failures are printed rather than raised, so a monitoring outage
        never takes down the caller.

        Args:
            labels: Alert labels; should include ``alertname``.
            annotations: Alert annotations such as summary and description.
            status: ``'firing'`` (default) or ``'resolved'``. The API has no
                status field for posted alerts; a resolved alert is expressed
                by setting ``endsAt`` to the current time.
        """
        alert = {
            "labels": labels,
            "annotations": annotations
        }
        if status == 'resolved':
            from datetime import datetime, timezone
            alert["endsAt"] = datetime.now(timezone.utc).isoformat()

        try:
            response = requests.post(
                self.url,
                json=[alert],
                timeout=5
            )
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: sending an alert must never raise.
            print(f"Failed to send alert: {e}")
            return

        # Reported outside the try block so that a missing 'alertname' label
        # is not misreported as a failed send.
        print(f"Alert sent: {labels.get('alertname', '<unnamed>')}")

    def send_critical_alert(self, title, description):
        """Send an alert labelled ``severity=critical``.

        Args:
            title: Used as both the ``alertname`` label and the summary.
            description: Longer description annotation.
        """
        self.send_alert(
            labels={
                'alertname': title,
                'severity': 'critical'
            },
            annotations={
                'summary': title,
                'description': description
            }
        )

# Usage
alert_manager = AlertManager('http://localhost:9093')
# The title doubles as the alert name; the second argument is the description.
alert_manager.send_critical_alert("High Error Rate", "Error rate exceeded 5%")

Common Pitfalls and Best Practices

❌ Bad: Too Many Metrics

# DON'T: Create unlimited metrics
for i in range(1000):
    # Anti-pattern shown on purpose: every pass creates a new metric name.
    per_index_counter = Counter(f'metric_{i}', f'Metric {i}')
    per_index_counter.inc()

✅ Good: Use Labels

# DO: Use labels for dimensions
# One metric; the varying dimensions are carried by its labels.
metric = Counter(
    name='requests_total',
    documentation='Total requests',
    labelnames=['method', 'endpoint'],
)
get_users = metric.labels(method='GET', endpoint='/api/users')
get_users.inc()

❌ Bad: High Cardinality Labels

# DON'T: Use user_id as label (high cardinality)
# Anti-pattern shown on purpose: one series per distinct user_id.
metric = Counter(
    name='requests',
    documentation='Requests',
    labelnames=['user_id'],
)

✅ Good: Low Cardinality Labels

# DO: Use low cardinality labels
# Labels drawn from small, fixed sets of values.
metric = Counter(
    name='requests',
    documentation='Requests',
    labelnames=['user_type', 'endpoint'],
)

❌ Bad: No Alerting

# DON'T: Collect metrics without alerting
# Metrics collected but no one knows about issues

✅ Good: Meaningful Alerts

# DO: Set up meaningful alerts
# Alert on business impact, not just metrics

Summary

Effective monitoring requires:

Instrumentation with prometheus-client
Custom metrics for business and technical insights
Prometheus configuration for scraping and alerting
Grafana dashboards for visualization
Health checks for component status
Performance monitoring for optimization
Business metrics for business insights
Alerting for timely notification

These patterns ensure comprehensive observability of Python applications in production.