Skip to main content
โšก Calmops

API Monitoring & Analytics: Complete Guide

API Monitoring & Analytics: Complete Guide

Monitoring and analytics are essential for maintaining reliable, high-performance APIs. This guide covers metrics collection, logging, error tracking, and building observability into your APIs.

Why API Monitoring Matters

  • Detect issues before they impact users
  • Understand API usage patterns
  • Optimize performance
  • Plan capacity and scaling
  • Meet SLA requirements
  • Improve developer experience

Key Metrics

The Four Golden Signals (from Google's Site Reliability Engineering book)

  1. Latency - How long requests take
  2. Traffic - How much demand there is
  3. Errors - How often requests fail
  4. Saturation - How close to capacity

Essential API Metrics

// Request metrics
- Total requests (count)
- Requests per second
- Request latency (p50, p95, p99)
- Request size (bytes)
- Response size (bytes)

// Error metrics  
- Error rate (percentage)
- Error by status code
- Error by endpoint

// Performance metrics
- Throughput (requests/second)
- Response time
- Time to first byte (TTFB)
- DNS lookup time
- TLS handshake time

Request Logging

Structured Logging

// Good: Structured JSON logging
// One JSON object per line so log aggregators can index fields directly.

// Build a single structured log line: timestamp + severity + message,
// with any caller-supplied metadata (requestId, latency, ...) merged in
// at the top level.
function formatLogEntry(level, message, meta) {
  return JSON.stringify({
    timestamp: new Date().toISOString(),
    level,
    message,
    ...meta
  });
}

const logger = {
  // Informational events (requests, lifecycle messages).
  info: (message, meta = {}) => {
    console.log(formatLogEntry('INFO', message, meta));
  },

  // Failures; `meta` typically carries error.message / error.stack.
  error: (message, meta = {}) => {
    console.error(formatLogEntry('ERROR', message, meta));
  }
};

// Usage
logger.info('API request', {
  method: 'GET',
  path: '/api/users/123',
  statusCode: 200,
  latencyMs: 45,
  userId: 'user_456',
  requestId: 'req_abc123'
});

// Error logging needs a caught Error in scope — the original snippet
// referenced an undefined `error` variable.
try {
  throw new Error('order creation failed'); // stand-in for a real failure
} catch (error) {
  logger.error('Request failed', {
    method: 'POST',
    path: '/api/orders',
    statusCode: 500,
    error: error.message,
    stack: error.stack,
    userId: 'user_456'
  });
}

Request/Response Logging Middleware

// Express middleware: logs one structured entry when a request arrives and
// one when it completes, correlated by a request id.
const requestLogger = (req, res, next) => {
  const startTime = Date.now();
  // Reuse an upstream correlation id when present, otherwise mint one.
  const requestId = req.headers['x-request-id'] || uuid();

  // Expose the id to downstream handlers (e.g. for error reporting).
  req.requestId = requestId;

  // Log request
  logger.info('Incoming request', {
    requestId,
    method: req.method,
    path: req.path,
    query: req.query,
    ip: req.ip,
    userAgent: req.headers['user-agent']
  });

  // Wrap res.send so completion is logged with final status and latency.
  const originalSend = res.send;
  res.send = function(data) {
    const latency = Date.now() - startTime;

    logger.info('Request completed', {
      requestId,
      method: req.method,
      path: req.path,
      statusCode: res.statusCode,
      latencyMs: latency,
      responseSize: res.get('Content-Length')
    });

    // Preserve Express semantics: res.send returns the response object for
    // chaining — the original wrapper dropped the return value.
    return originalSend.call(this, data);
  };

  next();
};

Performance Metrics Collection

Custom Metrics with Prometheus

const promClient = require('prom-client');

// Dedicated registry so /metrics only exposes what we register here.
const register = new promClient.Registry();

// Node.js process defaults: event-loop lag, heap usage, GC pauses, etc.
promClient.collectDefaultMetrics({ register });

// Custom metrics
// Request latency histogram; buckets span 10 ms to 5 s.
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
});

// Monotonic request counter, labelled like the histogram.
const httpRequestTotal = new promClient.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code']
});

// Last observed response size per method/route.
const apiResponseSize = new promClient.Gauge({
  name: 'api_response_size_bytes',
  help: 'Response size in bytes',
  labelNames: ['method', 'route']
});

for (const metric of [httpRequestDuration, httpRequestTotal, apiResponseSize]) {
  register.registerMetric(metric);
}

// Middleware
// Observe latency and count every request once the response has finished,
// labelled by method, matched route and status code.
app.use((req, res, next) => {
  const startedAt = Date.now();

  res.on('finish', () => {
    const durationSeconds = (Date.now() - startedAt) / 1000;
    // Prefer the matched route pattern (e.g. /users/:id) to keep label
    // cardinality bounded; fall back to the raw path.
    const route = req.route ? req.route.path : req.path;

    httpRequestDuration.labels(req.method, route, res.statusCode).observe(durationSeconds);
    httpRequestTotal.labels(req.method, route, res.statusCode).inc();
  });

  next();
});

// Expose every registered metric in the Prometheus text format.
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
});

Metrics Dashboard

# Grafana dashboard example
# (provisioning-style sketch: each panel queries Prometheus with PromQL)
dashboard:
  title: "API Performance"
  panels:
    # Request throughput, averaged over a 5-minute window.
    - title: "Requests per Second"
      type: "graph"
      targets:
        - expr: "rate(http_requests_total[5m])"

    # 95th-percentile latency estimated from the histogram buckets.
    - title: "Latency (p95)"
      type: "graph"
      targets:
        - expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))"

    # Fraction of requests answered with a 5xx status.
    - title: "Error Rate"
      type: "graph"
      targets:
        - expr: "sum(rate(http_requests_total{status_code=~'5..'}[5m])) / sum(rate(http_requests_total[5m]))"

Error Tracking

Sentry Integration

const Sentry = require('@sentry/node');

// Initialise once at startup; DSN, environment and release come from the
// environment so the same build can ship to multiple stages.
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV,
  release: process.env.npm_package_version
});

// Request handler
// Registered before the routes so Sentry can attach request context to events.
app.use(Sentry.Handlers.requestHandler());

// Error handler
// Registered after the routes so thrown/next(err) errors reach Sentry.
app.use(Sentry.Handlers.errorHandler());

// Manual error capture
app.get('/api/users/:id', async (req, res) => {
  try {
    const user = await getUser(req.params.id);
    if (!user) {
      // Capture with context
      Sentry.captureMessage('User not found', {
        level: 'warning',
        tags: { endpoint: 'getUser', userId: req.params.id }
      });
      // Tell the client too — the original fell through and answered
      // 200 with a null body.
      return res.status(404).json({ error: 'User not found' });
    }
    res.json(user);
  } catch (error) {
    // Capture with extra context, then rethrow so the Sentry/Express
    // error middleware still sees the failure.
    Sentry.captureException(error, {
      extra: {
        userId: req.params.id,
        requestId: req.requestId
      }
    });
    throw error;
  }
});

Usage Analytics

API Usage by Client

const usageAnalytics = {
  // Track usage per API key.
  // Increments daily/monthly per-endpoint counters and keeps a rolling
  // window of the last 1000 latency samples for percentile estimation.
  // NOTE(review): `method` and `statusCode` are currently unused — kept
  // for interface stability; confirm whether they should be recorded.
  async track(key, endpoint, method, statusCode, latency) {
    // The two counters are independent — update them in parallel.
    await Promise.all([
      redis.hincrby(`usage:${key}:daily`, endpoint, 1),
      redis.hincrby(`usage:${key}:monthly`, endpoint, 1)
    ]);

    // Track latency percentiles (push, then trim to the last 1000 samples).
    await redis.lpush(`latency:${key}:${endpoint}`, latency);
    await redis.ltrim(`latency:${key}:${endpoint}`, 0, 999);
  },

  // Get usage report for one API key over a period ('daily' or 'monthly').
  async getReport(key, period = 'daily') {
    const usage = await redis.hgetall(`usage:${key}:${period}`);
    // Redis returns hash values as strings; coerce before summing,
    // otherwise `+` concatenates ("0" + "5" === "05").
    const total = Object.values(usage).reduce((sum, count) => sum + Number(count), 0);

    return {
      total,
      byEndpoint: usage,
      period
    };
  }
};

// Middleware
// Record per-key usage after the response has been flushed, so the
// status code and total latency are final.
app.use(async (req, res, next) => {
  const startedAt = Date.now();

  res.on('finish', async () => {
    const apiKey = req.headers['x-api-key'];
    if (!apiKey) {
      return; // anonymous traffic is not tracked
    }
    await usageAnalytics.track(
      apiKey,
      req.path,
      req.method,
      res.statusCode,
      Date.now() - startedAt
    );
  });

  next();
});

Usage Dashboard Data

{
  "period": "2024-01",
  "totalRequests": 1000000,
  "uniqueClients": 500,
  "topEndpoints": [
    { "path": "/api/users", "requests": 250000 },
    { "path": "/api/products", "requests": 180000 },
    { "path": "/api/orders", "requests": 120000 }
  ],
  "errorRate": 0.5,
  "avgLatencyMs": 45,
  "p95LatencyMs": 120,
  "dataTransferMb": 5000
}

Health Checks

Basic Health Endpoint

// Liveness endpoint: always 200 while the process is up.
app.get('/health', (req, res) => {
  const payload = {
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    version: process.env.npm_package_version
  };
  res.json(payload);
});

Detailed Health Check

// Readiness endpoint: probes each dependency and answers 503 unless all
// of them report healthy.
app.get('/health', async (req, res) => {
  try {
    // The probes are independent — run them in parallel instead of
    // awaiting each one sequentially.
    const [database, cache, external] = await Promise.all([
      checkDatabase(),
      checkCache(),
      checkExternalServices()
    ]);
    const checks = { database, cache, external };

    const allHealthy = Object.values(checks).every(c => c.healthy);

    res.status(allHealthy ? 200 : 503).json({
      status: allHealthy ? 'healthy' : 'unhealthy',
      timestamp: new Date().toISOString(),
      checks
    });
  } catch (error) {
    // A probe that throws (instead of returning { healthy: false }) must
    // not leave the request hanging with an unhandled rejection.
    res.status(503).json({
      status: 'unhealthy',
      timestamp: new Date().toISOString(),
      error: error.message
    });
  }
});

// Probe database connectivity with a trivial query.
// Returns { healthy, latency } on success (latency in ms, actually
// measured rather than the original hard-coded placeholder) or
// { healthy: false, error } on failure.
async function checkDatabase() {
  const startedAt = Date.now();
  try {
    await db.query('SELECT 1');
    return { healthy: true, latency: Date.now() - startedAt };
  } catch (error) {
    return { healthy: false, error: error.message };
  }
}

Alerting

Alert Rules

# Prometheus alerting rules
groups:
  - name: api_alerts
    rules:
      - alert: HighErrorRate
        # The label must match what the app exports ("status_code", per the
        # prom-client labelNames), not "status".
        expr: sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          # $value is a 0-1 ratio; humanizePercentage renders it as a percent.
          description: "Error rate is {{ $value | humanizePercentage }} over last 5 minutes"

      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"
          description: "p95 latency is {{ $value }}s"

      - alert: RateLimitNear
        # Fires when fewer than 10% of the rate-limit quota remains.
        expr: rate_limit_remaining / rate_limit_total < 0.1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Approaching rate limit"

External Resources


Comments