API Monitoring & Analytics: Complete Guide
Monitoring and analytics are essential for maintaining reliable, high-performance APIs. This guide covers metrics collection, logging, error tracking, and building observability into your APIs.
Why API Monitoring Matters
- Detect issues before they impact users
- Understand API usage patterns
- Optimize performance
- Plan capacity and scaling
- Meet SLA requirements
- Improve developer experience
Key Metrics
The Four Golden Signals
- Latency - How long requests take
- Traffic - How much demand there is
- Errors - How often requests fail
- Saturation - How close to capacity
Essential API Metrics
// Request metrics
- Total requests (count)
- Requests per second
- Request latency (p50, p95, p99)
- Request size (bytes)
- Response size (bytes)
// Error metrics
- Error rate (percentage)
- Error by status code
- Error by endpoint
// Performance metrics
- Throughput (requests/second)
- Response time
- Time to first byte (TTFB)
- DNS lookup time
- TLS handshake time
Request Logging
Structured Logging
// Good: Structured JSON logging — every entry is one JSON line carrying
// a timestamp, a severity level, the message, and any caller-supplied
// metadata fields spread into the top level.
const logger = (() => {
  // Serialize a single log entry; meta keys land beside `message`.
  const format = (level, message, meta) =>
    JSON.stringify({
      timestamp: new Date().toISOString(),
      level,
      message,
      ...meta
    });

  return {
    info(message, meta = {}) {
      console.log(format('INFO', message, meta));
    },
    error(message, meta = {}) {
      console.error(format('ERROR', message, meta));
    }
  };
})();
// Usage
// A successful request: one JSON line on stdout with request metadata.
logger.info('API request', {
method: 'GET',
path: '/api/users/123',
statusCode: 200,
latencyMs: 45,
userId: 'user_456',
requestId: 'req_abc123'
});
// A failed request: one JSON line on stderr including the error details.
// NOTE(review): `error` is not defined in this snippet — it is assumed
// to be an Error instance from an enclosing try/catch; confirm before
// copying this example verbatim.
logger.error('Request failed', {
method: 'POST',
path: '/api/orders',
statusCode: 500,
error: error.message,
stack: error.stack,
userId: 'user_456'
});
Request/Response Logging Middleware
/**
 * Express middleware that logs each request on arrival and again on
 * completion (status code + latency), tagging both entries with a
 * shared request id so they can be correlated.
 *
 * Honors an incoming `x-request-id` header, otherwise generates an id,
 * and exposes it as `req.requestId` for downstream handlers.
 * NOTE(review): relies on `logger` and `uuid` being in file scope.
 */
const requestLogger = (req, res, next) => {
  const startTime = Date.now();
  const requestId = req.headers['x-request-id'] || uuid();
  req.requestId = requestId;

  // Log request
  logger.info('Incoming request', {
    requestId,
    method: req.method,
    path: req.path,
    query: req.query,
    ip: req.ip,
    userAgent: req.headers['user-agent']
  });

  // Wrap res.send so the completion entry fires when the body is written.
  const originalSend = res.send;
  res.send = function (data) {
    const latency = Date.now() - startTime;
    logger.info('Request completed', {
      requestId,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      latencyMs: latency,
      responseSize: res.get('Content-Length')
    });
    // Bug fix: res.send() returns `res` for chaining; the original
    // wrapper dropped that return value.
    return originalSend.call(this, data);
  };

  next();
};
Performance Metrics Collection
Custom Metrics with Prometheus
// Prometheus instrumentation: collect Node's default process metrics
// plus three custom HTTP metrics (latency histogram, request counter,
// response-size gauge), all labelled by method/route[/status_code].
const promClient = require('prom-client');
const register = new promClient.Registry();
promClient.collectDefaultMetrics({ register });

// Construct a metric and register it in one step.
const makeMetric = (Ctor, options) => {
  const metric = new Ctor(options);
  register.registerMetric(metric);
  return metric;
};

const httpRequestDuration = makeMetric(promClient.Histogram, {
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
});

const httpRequestTotal = makeMetric(promClient.Counter, {
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

const apiResponseSize = makeMetric(promClient.Gauge, {
  name: 'api_response_size_bytes',
  help: 'Response size in bytes',
  labelNames: ['method', 'route']
});
// Middleware: on each response 'finish', record latency and count the
// request; also record the response size into apiResponseSize, which
// was registered above but never updated in the original.
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern (e.g. '/api/users/:id') to keep
    // label cardinality bounded; raw req.path is only a fallback and
    // can explode cardinality on parameterized URLs.
    const route = req.route ? req.route.path : req.path;
    const statusCode = String(res.statusCode);
    httpRequestDuration.labels(req.method, route, statusCode).observe(duration);
    httpRequestTotal.labels(req.method, route, statusCode).inc();
    const size = Number(res.get('Content-Length'));
    if (Number.isFinite(size)) {
      apiResponseSize.labels(req.method, route).set(size);
    }
  });
  next();
});
// Prometheus scrape endpoint: serve the serialized registry contents.
app.get('/metrics', async (req, res) => {
  const body = await register.metrics();
  res.set('Content-Type', register.contentType);
  res.end(body);
});
Metrics Dashboard
# Grafana dashboard example
dashboard:
title: "API Performance"
panels:
- title: "Requests per Second"
type: "graph"
targets:
- expr: "rate(http_requests_total[5m])"
- title: "Latency (p95)"
type: "graph"
targets:
- expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
- title: "Error Rate"
type: "graph"
targets:
- expr: "sum(rate(http_requests_total{status_code=~'5..'}[5m])) / sum(rate(http_requests_total[5m]))"
Error Tracking
Sentry Integration
// Error tracking with Sentry: initialize the SDK from environment
// configuration, then install the request handler (before routes) and
// the error handler.
// NOTE(review): `Sentry.Handlers` is the v7 Express API; newer SDK
// majors changed this surface — confirm the installed SDK version.
const Sentry = require('@sentry/node');

const sentryConfig = {
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.npm_package_version
};
Sentry.init(sentryConfig);

// Request handler
app.use(Sentry.Handlers.requestHandler());
// Error handler
app.use(Sentry.Handlers.errorHandler());
// Manual error capture
// GET /api/users/:id — fetch one user. Missing users produce a 404
// (plus a Sentry warning); unexpected errors are reported to Sentry
// and forwarded to the Express error middleware.
app.get('/api/users/:id', async (req, res, next) => {
  try {
    const user = await getUser(req.params.id);
    if (!user) {
      // Capture with context
      Sentry.captureMessage('User not found', {
        level: 'warning',
        tags: { endpoint: 'getUser', userId: req.params.id }
      });
      // Bug fix: the original fell through to res.json(undefined) and
      // returned 200; respond 404 and stop here instead.
      return res.status(404).json({ error: 'User not found' });
    }
    res.json(user);
  } catch (error) {
    // Capture with extra context
    Sentry.captureException(error, {
      extra: {
        userId: req.params.id,
        requestId: req.requestId
      }
    });
    // Bug fix: `throw` inside an async handler is not caught by
    // Express 4's error middleware — pass the error to next() so the
    // Sentry error handler (and any others) can run.
    next(error);
  }
});
Usage Analytics
API Usage by Client
// Per-API-key usage analytics backed by Redis: hash counters for
// request counts plus a capped list of recent latency samples.
// NOTE(review): the `:daily`/`:monthly` keys carry no date component
// and no TTL, so counts accumulate forever — confirm a rotation/reset
// job exists elsewhere.
const usageAnalytics = {
  // Record one request for this API key. `method` and `statusCode` are
  // accepted for interface stability but are not currently stored.
  async track(key, endpoint, method, statusCode, latency) {
    await redis.hincrby(`usage:${key}:daily`, endpoint, 1);
    await redis.hincrby(`usage:${key}:monthly`, endpoint, 1);
    // Keep only the 1000 most recent latency samples per key/endpoint.
    await redis.lpush(`latency:${key}:${endpoint}`, latency);
    await redis.ltrim(`latency:${key}:${endpoint}`, 0, 999);
  },

  // Summarize usage for one API key over 'daily' or 'monthly'.
  async getReport(key, period = 'daily') {
    const usage = await redis.hgetall(`usage:${key}:${period}`);
    // Bug fix: hgetall returns string values, so `a + b` concatenated
    // strings ("0" + "25" -> "025"); coerce each count to a number.
    const total = Object.values(usage).reduce((sum, n) => sum + Number(n), 0);
    return {
      total,
      byEndpoint: usage,
      period
    };
  }
};
// Middleware: attribute each completed request to its API key (if any)
// for usage analytics.
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', async () => {
    const key = req.headers['x-api-key'];
    if (!key) return;
    try {
      await usageAnalytics.track(
        key,
        req.path,
        req.method,
        res.statusCode,
        Date.now() - start
      );
    } catch (error) {
      // Bug fix: a rejected track() inside the 'finish' listener was an
      // unhandled promise rejection (fatal by default in modern Node);
      // analytics failures must never take down request handling.
      logger.error('Usage tracking failed', { error: error.message });
    }
  });
  next();
});
Usage Dashboard Data
{
"period": "2024-01",
"totalRequests": 1000000,
"uniqueClients": 500,
"topEndpoints": [
{ "path": "/api/users", "requests": 250000 },
{ "path": "/api/products", "requests": 180000 },
{ "path": "/api/orders", "requests": 120000 }
],
"errorRate": 0.5,
"avgLatencyMs": 45,
"p95LatencyMs": 120,
"dataTransferMb": 5000
}
Health Checks
Basic Health Endpoint
// Liveness probe: always 200 with basic process metadata.
app.get('/health', (req, res) => {
  const payload = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    version: process.env.npm_package_version
  };
  res.json(payload);
});
Detailed Health Check
// Readiness probe: run all dependency checks and report 503 when any
// dependency is unhealthy.
app.get('/health', async (req, res) => {
  // Improvement: the three checks are independent, so run them
  // concurrently instead of awaiting each one in sequence.
  const [database, cache, external] = await Promise.all([
    checkDatabase(),
    checkCache(),
    checkExternalServices()
  ]);
  const checks = { database, cache, external };
  const allHealthy = Object.values(checks).every((c) => c.healthy);
  res.status(allHealthy ? 200 : 503).json({
    status: allHealthy ? 'healthy' : 'unhealthy',
    timestamp: new Date().toISOString(),
    checks
  });
});
// Probe database connectivity with a trivial query.
// Returns { healthy: true, latency } with the measured round-trip time
// in ms (the original returned a hard-coded `latency: 5`), or
// { healthy: false, error } when the query throws.
async function checkDatabase() {
  const start = Date.now();
  try {
    await db.query('SELECT 1');
    return { healthy: true, latency: Date.now() - start };
  } catch (error) {
    return { healthy: false, error: error.message };
  }
}
Alerting
Alert Rules
# Prometheus alerting rules
groups:
  - name: api_alerts
    rules:
      # Fires when more than 5% of requests over 5 minutes are 5xx.
      # Fixed: the counter's label is `status_code` (matching the
      # prom-client metric definitions), not `status`.
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          # $value is a 0-1 ratio, not a percent; format it as one.
          description: "Error rate is {{ $value | humanizePercentage }} over last 5 minutes"
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "p95 latency is {{ $value }}s"
      - alert: RateLimitNear
        expr: rate_limit_remaining / rate_limit_total < 0.1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Approaching rate limit"
          description: "Less than 10% of the rate limit budget remains"
Comments