Monitoring: Prometheus and Grafana for Python Applications
Effective monitoring is crucial for maintaining reliable production systems. Prometheus and Grafana provide powerful tools for collecting metrics, visualizing data, and alerting on issues. This guide covers practical patterns for monitoring Python applications.
Prometheus Basics
Instrumenting with prometheus-client
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import time

# Define metrics
request_count = Counter(
    'http_requests_total',
    'Total HTTP requests',
    ['method', 'endpoint', 'status']
)
request_duration = Histogram(
    'http_request_duration_seconds',
    'HTTP request duration',
    ['method', 'endpoint']
)
active_connections = Gauge(
    'active_connections',
    'Number of active connections'
)

# Usage in Flask
from flask import Flask, request

app = Flask(__name__)

@app.before_request
def before_request():
    """Stamp the request with its start time and count the open connection."""
    request.start_time = time.time()
    active_connections.inc()

@app.after_request
def after_request(response):
    """Record count/duration metrics once the response is ready."""
    # Guard: before_request can be skipped (e.g. if an earlier hook errors),
    # so don't assume start_time exists — fall back to zero duration rather
    # than raising AttributeError and masking the real response.
    start = getattr(request, 'start_time', None)
    duration = time.time() - start if start is not None else 0.0
    request_count.labels(
        method=request.method,
        endpoint=request.path,
        status=response.status_code
    ).inc()
    request_duration.labels(
        method=request.method,
        endpoint=request.path
    ).observe(duration)
    active_connections.dec()
    # NOTE(review): after_request does not run on unhandled exceptions, so
    # active_connections can drift upward; a teardown_request hook would be
    # the more robust place to decrement.
    return response

@app.route('/metrics')
def metrics():
    """Expose all registered metrics in the Prometheus text format."""
    from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
    # Bug fix: serve the correct Content-Type so Prometheus parses the body
    # as the text exposition format (includes the version parameter).
    return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST}

# Start metrics server
if __name__ == '__main__':
    # Metrics are exposed twice: on the dedicated :8000 server started here
    # and via the /metrics route above — scrape one of them, not both.
    start_http_server(8000)
    app.run(port=5000)
Custom Metrics
from prometheus_client import Counter, Gauge, Histogram, Summary
class ApplicationMetrics:
    """Prometheus instrumentation for the item-processing pipeline."""

    def __init__(self):
        # Counter: monotonically increasing event count, split by outcome.
        self.processed_items = Counter(
            'processed_items_total',
            'Total items processed',
            ['status']
        )
        # Gauge: point-in-time value that can rise and fall.
        self.queue_size = Gauge(
            'queue_size',
            'Current queue size'
        )
        # Histogram: bucketed distribution of per-item latencies.
        self.processing_time = Histogram(
            'processing_time_seconds',
            'Time to process item',
            buckets=(0.1, 0.5, 1.0, 2.0, 5.0)
        )
        # Summary: streaming distribution of payload sizes.
        self.response_size = Summary(
            'response_size_bytes',
            'Response size in bytes'
        )

    def record_success(self, duration, size):
        """Count one successful item and observe its latency and size."""
        self.processed_items.labels('success').inc()
        self.processing_time.observe(duration)
        self.response_size.observe(size)

    def record_failure(self):
        """Count one failed item."""
        self.processed_items.labels('failure').inc()

    def update_queue_size(self, size):
        """Publish the current queue depth."""
        self.queue_size.set(size)

# Usage
metrics = ApplicationMetrics()
metrics.record_success(duration=0.5, size=1024)
metrics.update_queue_size(10)
Prometheus Configuration
prometheus.yml Configuration
# prometheus.yml — global scrape defaults plus per-job overrides.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # The instrumented Python application (start_http_server on :8000).
  - job_name: 'python-app'
    static_configs:
      - targets: ['localhost:8000']
    scrape_interval: 5s
    scrape_timeout: 5s

  # Host-level metrics from node_exporter.
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

rule_files:
  - 'alert_rules.yml'
Alert Rules
# alert_rules.yml — alerting rules evaluated by Prometheus.
groups:
  - name: python_app
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: rate(processed_items_total{status="failure"}[5m]) > 0.05
        for: 5m
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: HighQueueSize
        expr: queue_size > 1000
        for: 2m
        annotations:
          summary: "Queue size is high"
          description: "Queue size is {{ $value }}"

      - alert: SlowProcessing
        # Fix: histogram_quantile() must be fed the per-bucket series
        # (the *_bucket counters, rated over a window and aggregated by
        # "le"), not the bare histogram metric name.
        expr: histogram_quantile(0.95, sum(rate(processing_time_seconds_bucket[5m])) by (le)) > 5
        for: 5m
        annotations:
          summary: "Processing is slow"
          description: "P95 processing time is {{ $value }}s"
Grafana Dashboards
Creating Dashboards Programmatically
from grafana_api.grafana_face import GrafanaFace
class GrafanaDashboardBuilder:
    """Assemble Grafana dashboards via the HTTP API."""

    def __init__(self, url, api_key):
        # Authenticated Grafana API client.
        self.grafana = GrafanaFace(auth=api_key, host=url)

    def create_dashboard(self, title, panels):
        """Create (or update) a dashboard containing *panels*."""
        payload = {
            "dashboard": {
                "title": title,
                "panels": panels,
                "timezone": "browser",
                "refresh": "30s"
            }
        }
        return self.grafana.dashboard.update_dashboard(payload)

    def create_graph_panel(self, title, targets, position):
        """Return a graph-panel definition for the given query targets."""
        panel = {
            "title": title,
            "type": "graph",
            "gridPos": position,
            "targets": targets,
        }
        panel["yaxes"] = [{"format": "short", "label": "Value"}]
        return panel

    def create_stat_panel(self, title, target, position):
        """Return a single-stat panel definition for one query target."""
        display = {
            "colorMode": "value",
            "graphMode": "area"
        }
        return {
            "title": title,
            "type": "stat",
            "gridPos": position,
            "targets": [target],
            "options": display,
        }
# Usage
builder = GrafanaDashboardBuilder(
    url='http://localhost:3000',
    api_key='your-api-key'
)

# Layout: two half-width panels side by side on the top row.
left_half = {"x": 0, "y": 0, "w": 12, "h": 8}
right_half = {"x": 12, "y": 0, "w": 12, "h": 8}

request_rate_target = {
    "expr": "rate(http_requests_total[5m])",
    "legendFormat": "{{ method }} {{ endpoint }}"
}
connections_target = {
    "expr": "active_connections",
    "legendFormat": "Connections"
}

panels = [
    builder.create_graph_panel(
        title="Request Rate",
        targets=[request_rate_target],
        position=left_half
    ),
    builder.create_stat_panel(
        title="Active Connections",
        target=connections_target,
        position=right_half
    ),
]
# builder.create_dashboard("Python App Monitoring", panels)
Advanced Monitoring Patterns
Application Health Checks
from prometheus_client import Gauge
import requests
class HealthMonitor:
    """Publish per-component health as a 0/1 Prometheus gauge."""

    def __init__(self):
        self.health_status = Gauge(
            'app_health_status',
            'Application health status',
            ['component']
        )

    def check_database(self):
        """Probe database connectivity; return True when healthy."""
        try:
            # Attempt database connection
            # db.session.execute('SELECT 1')
            self.health_status.labels(component='database').set(1)
            return True
        except Exception as e:
            print(f"Database check failed: {e}")
            self.health_status.labels(component='database').set(0)
            return False

    def check_external_api(self, url):
        """Probe an external HTTP endpoint; return True on a 200 reply."""
        try:
            reply = requests.get(url, timeout=5)
            healthy = reply.status_code == 200
            self.health_status.labels(component='external_api').set(1 if healthy else 0)
            return healthy
        except Exception as e:
            print(f"API check failed: {e}")
            self.health_status.labels(component='external_api').set(0)
            return False

    def check_all(self):
        """Run every component check; True only when all of them pass."""
        db_ok = self.check_database()
        api_ok = self.check_external_api('https://api.example.com')
        return db_ok and api_ok
# Usage
monitor = HealthMonitor()
# NOTE(review): check_all() performs a live HTTP GET to api.example.com;
# avoid triggering this at import time in production code.
is_healthy = monitor.check_all()
Performance Monitoring
from prometheus_client import Histogram
import functools
import time
class PerformanceMonitor:
    """Record wall-clock execution time of decorated functions."""

    def __init__(self):
        self.function_duration = Histogram(
            'function_duration_seconds',
            'Function execution time',
            ['function_name']
        )

    def monitor(self, func):
        """Decorator: observe the duration of every call to *func*."""
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            started = time.time()
            try:
                return func(*args, **kwargs)
            finally:
                # Runs on success and on exception alike, so failed
                # calls are timed too.
                elapsed = time.time() - started
                series = self.function_duration.labels(function_name=func.__name__)
                series.observe(elapsed)
        return wrapper
# Usage
monitor = PerformanceMonitor()

# Each call to slow_function is timed and recorded under
# function_duration_seconds{function_name="slow_function"}.
@monitor.monitor
def slow_function():
    time.sleep(0.5)
    return "Done"

slow_function()
Business Metrics
from prometheus_client import Counter, Gauge
class BusinessMetrics:
    """Expose order volume, revenue, and active-user metrics."""

    def __init__(self):
        self.orders_total = Counter(
            'orders_total',
            'Total orders',
            ['status']
        )
        self.revenue_total = Counter(
            'revenue_total',
            'Total revenue',
            ['currency']
        )
        self.active_users = Gauge(
            'active_users',
            'Number of active users'
        )

    def record_order(self, status, amount, currency='USD'):
        """Count one order and add its amount to the revenue counter."""
        self.orders_total.labels(status).inc()
        self.revenue_total.labels(currency).inc(amount)

    def update_active_users(self, count):
        """Set the current number of active users."""
        self.active_users.set(count)
# Usage
metrics = BusinessMetrics()
# currency defaults to 'USD' when omitted.
metrics.record_order(status='completed', amount=99.99)
metrics.update_active_users(150)
Alerting
Alert Manager Configuration
# alertmanager.yml — routing tree and notification receivers.
global:
  resolve_timeout: 5m

route:
  receiver: 'default'
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  routes:
    # Critical alerts also continue to the default receiver.
    - match:
        severity: critical
      receiver: 'critical'
      continue: true
    - match:
        severity: warning
      receiver: 'warning'

receivers:
  - name: 'default'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#alerts'

  - name: 'critical'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#critical-alerts'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'

  - name: 'warning'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#warnings'
Programmatic Alerting
import json
from datetime import datetime, timezone

import requests
class AlertManager:
    """Push alerts directly to an Alertmanager instance."""

    def __init__(self, alertmanager_url):
        """alertmanager_url: base URL, e.g. 'http://localhost:9093'."""
        # Fix: use the v2 API — the v1 alerts endpoint was deprecated and
        # has been removed from recent Alertmanager releases.
        self.url = f"{alertmanager_url}/api/v2/alerts"

    def send_alert(self, labels, annotations, status='firing'):
        """Send one alert to Alertmanager.

        labels should include 'alertname'. status is 'firing' or
        'resolved'. Fix: Alertmanager derives an alert's state from its
        startsAt/endsAt timestamps, not from a "status" field (which the
        original payload included and the API ignores) — so 'resolved'
        is translated into an endsAt timestamp instead.
        """
        now = datetime.now(timezone.utc).isoformat()
        alert = {
            "labels": labels,
            "annotations": annotations,
            "startsAt": now,
        }
        if status == 'resolved':
            # An endsAt at/before "now" marks the alert as resolved.
            alert["endsAt"] = now
        try:
            response = requests.post(
                self.url,
                json=[alert],
                timeout=5
            )
            response.raise_for_status()
            print(f"Alert sent: {labels['alertname']}")
        except Exception as e:
            # Best-effort delivery: log and carry on, as before.
            print(f"Failed to send alert: {e}")

    def send_critical_alert(self, title, description):
        """Convenience wrapper: fire a severity=critical alert."""
        self.send_alert(
            labels={
                'alertname': title,
                'severity': 'critical'
            },
            annotations={
                'summary': title,
                'description': description
            }
        )
# Usage
alert_manager = AlertManager('http://localhost:9093')
# NOTE(review): this posts to the local Alertmanager immediately;
# failures are printed, not raised.
alert_manager.send_critical_alert(
    title="High Error Rate",
    description="Error rate exceeded 5%"
)
Common Pitfalls and Best Practices
❌ Bad: Too Many Metrics
# DON'T: Create unlimited metrics
# (Illustrative anti-pattern: each distinct name registers a whole new
# metric family, flooding the registry and Prometheus with series.)
for i in range(1000):
    Counter(f'metric_{i}', f'Metric {i}').inc()
✅ Good: Use Labels
# DO: Use labels for dimensions
# One metric family; each label combination becomes a child series.
metric = Counter('requests_total', 'Total requests', ['method', 'endpoint'])
metric.labels(method='GET', endpoint='/api/users').inc()
❌ Bad: High Cardinality Labels
# DON'T: Use user_id as label (high cardinality)
# (Illustrative anti-pattern: one series per user is unbounded growth.)
metric = Counter('requests', 'Requests', ['user_id'])
✅ Good: Low Cardinality Labels
# DO: Use low cardinality labels
# (Snippet shown in isolation — re-registering the 'requests' name from
# the previous example in one process would raise a duplicate error.)
metric = Counter('requests', 'Requests', ['user_type', 'endpoint'])
❌ Bad: No Alerting
# DON'T: Collect metrics without alerting
# Metrics collected but no one knows about issues
✅ Good: Meaningful Alerts
# DO: Set up meaningful alerts
# Alert on business impact, not just metrics
Summary
Effective monitoring requires:
- Instrumentation with prometheus-client
- Custom metrics for business and technical insights
- Prometheus configuration for scraping and alerting
- Grafana dashboards for visualization
- Health checks for component status
- Performance monitoring for optimization
- Business metrics for business insights
- Alerting for timely notification
These patterns ensure comprehensive observability of Python applications in production.
Comments