API Monitoring & Analytics: Complete Guide
Monitoring and analytics are essential for maintaining reliable, high-performance APIs. This guide covers metrics collection, logging, error tracking, and building observability into your APIs.
Why API Monitoring Matters
- Detect issues before they impact users
- Understand API usage patterns
- Optimize performance
- Plan capacity and scaling
- Meet SLA requirements
- Improve developer experience
Key Metrics
The Four Golden Signals
- Latency - How long requests take
- Traffic - How much demand there is
- Errors - How often requests fail
- Saturation - How close to capacity
Essential API Metrics
// Request metrics
- Total requests (count)
- Requests per second
- Request latency (p50, p95, p99)
- Request size (bytes)
- Response size (bytes)
// Error metrics
- Error rate (percentage)
- Error by status code
- Error by endpoint
// Performance metrics
- Throughput (requests/second)
- Response time
- Time to first byte (TTFB)
- DNS lookup time
- TLS handshake time
Request Logging
Structured Logging
// Good: Structured JSON logging — every entry is one JSON line carrying
// a timestamp, a severity level, the message, and any caller-supplied
// metadata fields spread into the top level.
const logger = (() => {
  // Serialize a single log entry; meta keys land beside `message`.
  const format = (level, message, meta) =>
    JSON.stringify({
      timestamp: new Date().toISOString(),
      level,
      message,
      ...meta
    });

  return {
    info(message, meta = {}) {
      console.log(format('INFO', message, meta));
    },
    error(message, meta = {}) {
      console.error(format('ERROR', message, meta));
    }
  };
})();
// Usage
// A successful request: one JSON line on stdout with request metadata.
logger.info('API request', {
method: 'GET',
path: '/api/users/123',
statusCode: 200,
latencyMs: 45,
userId: 'user_456',
requestId: 'req_abc123'
});
// A failed request: one JSON line on stderr including the error details.
// NOTE(review): `error` is not defined in this snippet — it is assumed
// to be an Error instance from an enclosing try/catch; confirm before
// copying this example verbatim.
logger.error('Request failed', {
method: 'POST',
path: '/api/orders',
statusCode: 500,
error: error.message,
stack: error.stack,
userId: 'user_456'
});
Request/Response Logging Middleware
/**
 * Express middleware that logs each request on arrival and again on
 * completion (status code + latency), tagging both entries with a
 * shared request id so they can be correlated.
 *
 * Honors an incoming `x-request-id` header, otherwise generates an id,
 * and exposes it as `req.requestId` for downstream handlers.
 * NOTE(review): relies on `logger` and `uuid` being in file scope.
 */
const requestLogger = (req, res, next) => {
  const startTime = Date.now();
  const requestId = req.headers['x-request-id'] || uuid();
  req.requestId = requestId;

  // Log request
  logger.info('Incoming request', {
    requestId,
    method: req.method,
    path: req.path,
    query: req.query,
    ip: req.ip,
    userAgent: req.headers['user-agent']
  });

  // Wrap res.send so the completion entry fires when the body is written.
  const originalSend = res.send;
  res.send = function (data) {
    const latency = Date.now() - startTime;
    logger.info('Request completed', {
      requestId,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      latencyMs: latency,
      responseSize: res.get('Content-Length')
    });
    // Bug fix: res.send() returns `res` for chaining; the original
    // wrapper dropped that return value.
    return originalSend.call(this, data);
  };

  next();
};
Performance Metrics Collection
Custom Metrics with Prometheus
// Prometheus instrumentation: collect Node's default process metrics
// plus three custom HTTP metrics (latency histogram, request counter,
// response-size gauge), all labelled by method/route[/status_code].
const promClient = require('prom-client');
const register = new promClient.Registry();
promClient.collectDefaultMetrics({ register });

// Construct a metric and register it in one step.
const makeMetric = (Ctor, options) => {
  const metric = new Ctor(options);
  register.registerMetric(metric);
  return metric;
};

const httpRequestDuration = makeMetric(promClient.Histogram, {
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
});

const httpRequestTotal = makeMetric(promClient.Counter, {
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

const apiResponseSize = makeMetric(promClient.Gauge, {
  name: 'api_response_size_bytes',
  help: 'Response size in bytes',
  labelNames: ['method', 'route']
});
// Middleware: on each response 'finish', record latency and count the
// request; also record the response size into apiResponseSize, which
// was registered above but never updated in the original.
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    // Prefer the matched route pattern (e.g. '/api/users/:id') to keep
    // label cardinality bounded; raw req.path is only a fallback and
    // can explode cardinality on parameterized URLs.
    const route = req.route ? req.route.path : req.path;
    const statusCode = String(res.statusCode);
    httpRequestDuration.labels(req.method, route, statusCode).observe(duration);
    httpRequestTotal.labels(req.method, route, statusCode).inc();
    const size = Number(res.get('Content-Length'));
    if (Number.isFinite(size)) {
      apiResponseSize.labels(req.method, route).set(size);
    }
  });
  next();
});
// Prometheus scrape endpoint: serve the serialized registry contents.
app.get('/metrics', async (req, res) => {
  const body = await register.metrics();
  res.set('Content-Type', register.contentType);
  res.end(body);
});
Metrics Dashboard
# Grafana dashboard example
dashboard:
title: "API Performance"
panels:
- title: "Requests per Second"
type: "graph"
targets:
- expr: "rate(http_requests_total[5m])"
- title: "Latency (p95)"
type: "graph"
targets:
- expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"
- title: "Error Rate"
type: "graph"
targets:
- expr: "sum(rate(http_requests_total{status_code=~'5..'}[5m])) / sum(rate(http_requests_total[5m]))"
Error Tracking
Sentry Integration
// Error tracking with Sentry: initialize the SDK from environment
// configuration, then install the request handler (before routes) and
// the error handler.
// NOTE(review): `Sentry.Handlers` is the v7 Express API; newer SDK
// majors changed this surface — confirm the installed SDK version.
const Sentry = require('@sentry/node');

const sentryConfig = {
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.npm_package_version
};
Sentry.init(sentryConfig);

// Request handler
app.use(Sentry.Handlers.requestHandler());
// Error handler
app.use(Sentry.Handlers.errorHandler());
// Manual error capture
// GET /api/users/:id — fetch one user. Missing users produce a 404
// (plus a Sentry warning); unexpected errors are reported to Sentry
// and forwarded to the Express error middleware.
app.get('/api/users/:id', async (req, res, next) => {
  try {
    const user = await getUser(req.params.id);
    if (!user) {
      // Capture with context
      Sentry.captureMessage('User not found', {
        level: 'warning',
        tags: { endpoint: 'getUser', userId: req.params.id }
      });
      // Bug fix: the original fell through to res.json(undefined) and
      // returned 200; respond 404 and stop here instead.
      return res.status(404).json({ error: 'User not found' });
    }
    res.json(user);
  } catch (error) {
    // Capture with extra context
    Sentry.captureException(error, {
      extra: {
        userId: req.params.id,
        requestId: req.requestId
      }
    });
    // Bug fix: `throw` inside an async handler is not caught by
    // Express 4's error middleware — pass the error to next() so the
    // Sentry error handler (and any others) can run.
    next(error);
  }
});
Usage Analytics
API Usage by Client
// Per-API-key usage analytics backed by Redis: hash counters for
// request counts plus a capped list of recent latency samples.
// NOTE(review): the `:daily`/`:monthly` keys carry no date component
// and no TTL, so counts accumulate forever — confirm a rotation/reset
// job exists elsewhere.
const usageAnalytics = {
  // Record one request for this API key. `method` and `statusCode` are
  // accepted for interface stability but are not currently stored.
  async track(key, endpoint, method, statusCode, latency) {
    await redis.hincrby(`usage:${key}:daily`, endpoint, 1);
    await redis.hincrby(`usage:${key}:monthly`, endpoint, 1);
    // Keep only the 1000 most recent latency samples per key/endpoint.
    await redis.lpush(`latency:${key}:${endpoint}`, latency);
    await redis.ltrim(`latency:${key}:${endpoint}`, 0, 999);
  },

  // Summarize usage for one API key over 'daily' or 'monthly'.
  async getReport(key, period = 'daily') {
    const usage = await redis.hgetall(`usage:${key}:${period}`);
    // Bug fix: hgetall returns string values, so `a + b` concatenated
    // strings ("0" + "25" -> "025"); coerce each count to a number.
    const total = Object.values(usage).reduce((sum, n) => sum + Number(n), 0);
    return {
      total,
      byEndpoint: usage,
      period
    };
  }
};
// Middleware: attribute each completed request to its API key (if any)
// for usage analytics.
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', async () => {
    const key = req.headers['x-api-key'];
    if (!key) return;
    try {
      await usageAnalytics.track(
        key,
        req.path,
        req.method,
        res.statusCode,
        Date.now() - start
      );
    } catch (error) {
      // Bug fix: a rejected track() inside the 'finish' listener was an
      // unhandled promise rejection (fatal by default in modern Node);
      // analytics failures must never take down request handling.
      logger.error('Usage tracking failed', { error: error.message });
    }
  });
  next();
});
Usage Dashboard Data
{
"period": "2024-01",
"totalRequests": 1000000,
"uniqueClients": 500,
"topEndpoints": [
{ "path": "/api/users", "requests": 250000 },
{ "path": "/api/products", "requests": 180000 },
{ "path": "/api/orders", "requests": 120000 }
],
"errorRate": 0.5,
"avgLatencyMs": 45,
"p95LatencyMs": 120,
"dataTransferMb": 5000
}
Health Checks
Basic Health Endpoint
// Liveness probe: always 200 with basic process metadata.
app.get('/health', (req, res) => {
  const payload = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    version: process.env.npm_package_version
  };
  res.json(payload);
});
Detailed Health Check
// Readiness probe: run all dependency checks and report 503 when any
// dependency is unhealthy.
app.get('/health', async (req, res) => {
  // Improvement: the three checks are independent, so run them
  // concurrently instead of awaiting each one in sequence.
  const [database, cache, external] = await Promise.all([
    checkDatabase(),
    checkCache(),
    checkExternalServices()
  ]);
  const checks = { database, cache, external };
  const allHealthy = Object.values(checks).every((c) => c.healthy);
  res.status(allHealthy ? 200 : 503).json({
    status: allHealthy ? 'healthy' : 'unhealthy',
    timestamp: new Date().toISOString(),
    checks
  });
});
// Probe database connectivity with a trivial query.
// Returns { healthy: true, latency } with the measured round-trip time
// in ms (the original returned a hard-coded `latency: 5`), or
// { healthy: false, error } when the query throws.
async function checkDatabase() {
  const start = Date.now();
  try {
    await db.query('SELECT 1');
    return { healthy: true, latency: Date.now() - start };
  } catch (error) {
    return { healthy: false, error: error.message };
  }
}
Alerting
Alert Rules
# Prometheus alerting rules
groups:
  - name: api_alerts
    rules:
      # Fires when more than 5% of requests over 5 minutes are 5xx.
      # Fixed: the counter's label is `status_code` (matching the
      # prom-client metric definitions), not `status`.
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          # $value is a 0-1 ratio, not a percent; format it as one.
          description: "Error rate is {{ $value | humanizePercentage }} over last 5 minutes"
      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "p95 latency is {{ $value }}s"
      - alert: RateLimitNear
        expr: rate_limit_remaining / rate_limit_total < 0.1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Approaching rate limit"
          description: "Less than 10% of the rate limit budget remains"
Comments