Monitoring and Logging

Monitoring and logging are critical for maintaining production applications. This article covers observability best practices.

Introduction

Monitoring and logging provide:

System visibility
Performance tracking
Error detection
Debugging capabilities
Alerting

Understanding monitoring helps you:

Track application health
Detect issues early
Debug problems
Optimize performance
Maintain reliability

Logging

Logging Frameworks

// ✅ Good: Winston logger setup
const winston = require('winston');

// Every entry is emitted as JSON with a timestamp; the errors() formatter is
// asked to include stack traces.
const structuredFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json()
);

// Two file destinations: one restricted to the 'error' level, one without a
// level of its own.
const fileTransports = [
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  new winston.transports.File({ filename: 'combined.log' })
];

const logger = winston.createLogger({
  // LOG_LEVEL overrides the default; an unset or empty value falls back to 'info'.
  level: process.env.LOG_LEVEL || 'info',
  format: structuredFormat,
  defaultMeta: { service: 'my-app' },
  transports: fileTransports
});

// Anywhere other than production, also write to the console in the simple format.
const isProduction = process.env.NODE_ENV === 'production';
if (!isProduction) {
  const consoleTransport = new winston.transports.Console({
    format: winston.format.simple()
  });
  logger.add(consoleTransport);
}

// ✅ Good: Log different levels
// Pass the error's message and stack as plain fields. An Error nested inside
// the metadata object has no enumerable properties, so a JSON formatter would
// serialize `{ error: err }` as `"error": {}` and the details would be lost.
logger.error('Error message', { error: err.message, stack: err.stack });
logger.warn('Warning message');
logger.info('Info message');
logger.debug('Debug message');

// ✅ Good: Structured logging
// Gather the login context as named fields, then attach it to the message as
// metadata instead of interpolating it into the text.
const loginContext = {
  userId: user.id,
  email: user.email,
  timestamp: new Date(),
  ip: req.ip
};

logger.info('User login', loginContext);

Log Aggregation

// ✅ Good: Send logs to ELK Stack
const winston = require('winston');
// Use the transport class the package exports by name. The previous
// `new winston.transports.elasticsearch(...)` relied on a lowercase property
// that the package does not register, so constructing it would throw.
const { ElasticsearchTransport } = require('winston-elasticsearch');

const logger = winston.createLogger({
  transports: [
    // Ships entries at 'info' and above to the local Elasticsearch node,
    // using 'logs' as the index name.
    new ElasticsearchTransport({
      level: 'info',
      clientOpts: { node: 'http://localhost:9200' },
      index: 'logs'
    })
  ]
});

// ✅ Good: Send logs to Datadog
const winston = require('winston');
const WinstonDatadog = require('winston-datadog');

// The API key comes from the environment; the remaining fields are fixed
// identifiers for this application.
// NOTE(review): these option names (apiKey, hostname, service, ddsource) look
// like the ones documented by the `datadog-winston` package — confirm that the
// required package name matches the transport actually installed.
const datadogOptions = {
  apiKey: process.env.DATADOG_API_KEY,
  hostname: 'my-app',
  service: 'my-service',
  ddsource: 'nodejs'
};

const logger = winston.createLogger({
  transports: [new WinstonDatadog(datadogOptions)]
});

// ✅ Good: Send logs to CloudWatch
const winston = require('winston');
const WinstonCloudWatch = require('winston-cloudwatch');

// Destination log group and stream, plus the AWS region to send to.
const cloudWatchOptions = {
  logGroupName: '/aws/lambda/my-app',
  logStreamName: 'production',
  awsRegion: 'us-east-1'
};

const logger = winston.createLogger({
  transports: [new WinstonCloudWatch(cloudWatchOptions)]
});

Metrics and Monitoring

Prometheus Metrics

// ✅ Good: Install Prometheus client
// npm install prom-client

const prometheus = require('prom-client');

// Create metrics

// Histogram of request durations, labelled by method, route and status code.
const requestDurationDefinition = {
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code']
};
const httpRequestDuration = new prometheus.Histogram(requestDurationDefinition);

// Gauge for the number of active connections; it carries no labels.
const activeConnectionsDefinition = {
  name: 'active_connections',
  help: 'Number of active connections'
};
const activeConnections = new prometheus.Gauge(activeConnectionsDefinition);

// ✅ Good: Middleware to track metrics
// Records the duration of every request in the `httpRequestDuration`
// histogram, labelled by HTTP method, matched route pattern and status code.
app.use((req, res, next) => {
  // startTimer() returns a function that, when called, observes the elapsed
  // time in seconds with the labels it is given.
  const stopTimer = httpRequestDuration.startTimer();

  res.on('finish', () => {
    // Label with the matched route *pattern* (e.g. /users/:id), never the raw
    // URL. Falling back to req.path for unmatched requests would create one
    // time series per distinct path requested and grow the label set without
    // bound, so those requests share a single 'unmatched' label instead.
    const route = String(req.route?.path ?? 'unmatched');

    stopTimer({
      method: req.method,
      route,
      status_code: res.statusCode
    });
  });

  next();
});

// ✅ Good: Expose metrics endpoint
// Serves every metric in the default registry for Prometheus to scrape.
app.get('/metrics', async (req, res) => {
  try {
    // register.metrics() returns a Promise in current prom-client releases, so
    // it has to be awaited before it is written to the response. Awaiting is
    // harmless if an older release returns a plain string.
    const body = await prometheus.register.metrics();

    res.set('Content-Type', prometheus.register.contentType);
    res.end(body);
  } catch (err) {
    // A failing collector should produce a failed scrape, not a hung request.
    res.status(500).end(String(err?.message ?? err));
  }
});

// ✅ Good: Custom metrics
const usersCreatedDefinition = {
  name: 'users_created_total',
  help: 'Total number of users created'
};
const userCounter = new prometheus.Counter(usersCreatedDefinition);

// Handles POST /users: bumps the counter by one, then confirms the creation.
function handleCreateUser(req, res) {
  // Create user
  userCounter.inc();

  const confirmation = { message: 'User created' };
  res.json(confirmation);
}

app.post('/users', handleCreateUser);

Application Performance Monitoring

// ✅ Good: New Relic APM
// Requiring the agent module is the whole setup shown here; the `newrelic`
// binding is not used again in this snippet.
// NOTE(review): New Relic's Node.js agent is normally required before any
// other module — confirm the placement in the real entry file.
const newrelic = require('newrelic');

// ✅ Good: Datadog APM
// init() is called on the required module and its return value is kept as `tracer`.
// NOTE(review): dd-trace likewise expects to be initialised before the modules
// it instruments are loaded — confirm the placement in the real entry file.
const tracer = require('dd-trace').init();

// ✅ Good: Custom performance tracking
// In-memory counters that start at zero and are updated once per finished
// response. `avgResponseTime` is the arithmetic mean, in milliseconds, of all
// responses counted so far.
const performanceMetrics = {
  requests: 0,
  errors: 0,
  avgResponseTime: 0
};

app.use((req, res, next) => {
  const start = Date.now();

  res.on('finish', () => {
    const duration = Date.now() - start;
    performanceMetrics.requests++;

    // Incremental mean: newMean = oldMean + (sample - oldMean) / count.
    // The previous `(avg + duration) / 2` was not an average at all: it gave
    // the latest request half of the total weight, so one slow response could
    // push the value past the alert threshold.
    performanceMetrics.avgResponseTime +=
      (duration - performanceMetrics.avgResponseTime) / performanceMetrics.requests;

    // Any 4xx or 5xx response counts as an error.
    if (res.statusCode >= 400) {
      performanceMetrics.errors++;
    }
  });

  next();
});

// Exposes the current counters as JSON.
app.get('/performance', (req, res) => {
  res.json(performanceMetrics);
});

Alerting

Alert Configuration

// ✅ Good: Alert on high error rate
/**
 * Sends an alert when more than 5% of the requests counted in
 * `performanceMetrics` ended as errors. Does nothing before the first request.
 * The alert is fire-and-forget; a delivery failure is logged, never thrown.
 */
const checkErrorRate = () => {
  const { errors, requests } = performanceMetrics;

  // No traffic yet: 0 / 0 would be NaN, so there is no rate to evaluate.
  if (requests === 0) {
    return;
  }

  const errorRate = errors / requests;

  if (errorRate > 0.05) { // 5% error rate
    // sendAlert is async and nobody awaits this check, so the rejection has to
    // be handled here; otherwise a failed delivery surfaces as an unhandled
    // promise rejection in the process being monitored.
    sendAlert('High error rate detected', { errorRate, errors, requests })
      .catch((err) => {
        console.error('Failed to send high error rate alert', err);
      });
  }
};

// ✅ Good: Alert on slow response time
/**
 * Sends an alert when `performanceMetrics.avgResponseTime` exceeds 1000 ms.
 * The alert is fire-and-forget; a delivery failure is logged, never thrown.
 */
const checkResponseTime = () => {
  const { avgResponseTime } = performanceMetrics;

  if (avgResponseTime > 1000) { // 1 second
    // sendAlert is async and nobody awaits this check, so handle the rejection
    // here instead of leaving it as an unhandled promise rejection.
    sendAlert('Slow response time detected', { avgResponseTime })
      .catch((err) => {
        console.error('Failed to send slow response time alert', err);
      });
  }
};

// ✅ Good: Alert on high memory usage
/**
 * Sends an alert when heapUsed is more than 90% of heapTotal, both taken from
 * process.memoryUsage(). The alert is fire-and-forget; a delivery failure is
 * logged, never thrown.
 *
 * NOTE(review): heapTotal is the heap currently allocated by the engine rather
 * than its maximum, so this ratio may sit high on a healthy process — consider
 * comparing against the configured heap limit instead.
 */
const checkMemory = () => {
  const { heapUsed, heapTotal } = process.memoryUsage();
  const heapUsedPercent = (heapUsed / heapTotal) * 100;

  if (heapUsedPercent > 90) {
    // sendAlert is async and nobody awaits this check, so handle the rejection
    // here instead of leaving it as an unhandled promise rejection.
    sendAlert('High memory usage', { heapUsedPercent, heapUsed, heapTotal })
      .catch((err) => {
        console.error('Failed to send high memory usage alert', err);
      });
  }
};

// ✅ Good: Run checks periodically
const CHECK_INTERVAL_MS = 60000; // Every minute

// Runs the three checks in a fixed order: error rate, response time, memory.
function runPeriodicChecks() {
  checkErrorRate();
  checkResponseTime();
  checkMemory();
}

setInterval(runPeriodicChecks, CHECK_INTERVAL_MS);

// ✅ Good: Send alerts
/**
 * Delivers an alert to Slack (webhook URL from SLACK_WEBHOOK) and to PagerDuty
 * (routing key from PAGERDUTY_KEY).
 *
 * The two deliveries are independent: a failure of one channel does not stop
 * the other. Failures, including non-2xx HTTP responses, are logged with
 * console.error and are not rethrown, so callers may fire and forget.
 *
 * @param {string} title - Short summary, used as the Slack text and the
 *   PagerDuty summary.
 * @param {object} data - Supporting details, pretty-printed into the Slack
 *   attachment and attached to the PagerDuty event as custom_details.
 * @returns {Promise<void>} Resolves once both deliveries have settled.
 */
async function sendAlert(title, data) {
  // POST a JSON document and treat any non-2xx response as a failure; fetch
  // itself only rejects on network-level errors.
  const postJson = async (url, payload) => {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
  };

  const channels = [
    {
      name: 'Slack',
      url: process.env.SLACK_WEBHOOK,
      payload: {
        text: title,
        attachments: [{
          text: JSON.stringify(data, null, 2)
        }]
      }
    },
    {
      name: 'PagerDuty',
      url: 'https://events.pagerduty.com/v2/enqueue',
      payload: {
        routing_key: process.env.PAGERDUTY_KEY,
        event_action: 'trigger',
        payload: {
          summary: title,
          severity: 'critical',
          source: 'my-app',
          // Previously the details were dropped for PagerDuty entirely.
          custom_details: data
        }
      }
    }
  ];

  // allSettled rather than sequential awaits: a Slack outage must never stop
  // the page from going out.
  const outcomes = await Promise.allSettled(
    channels.map(({ url, payload }) => postJson(url, payload))
  );

  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'rejected') {
      console.error(`Failed to send alert to ${channels[index].name}`, outcome.reason);
    }
  });
}

Distributed Tracing

Tracing Setup

// ✅ Good: Jaeger tracing
const { initTracer } = require('jaeger-client');

// Service identity, sampling and reporting settings handed to initTracer.
const tracerConfig = {
  serviceName: 'my-app',
  // 'const' sampler with param 1 — presumably samples every trace; confirm
  // against the Jaeger client documentation before using in production.
  sampler: {
    type: 'const',
    param: 1
  },
  // Spans are logged and reported to an agent at localhost:6831.
  reporter: {
    logSpans: true,
    agentHost: 'localhost',
    agentPort: 6831
  }
};

// The tracer logs through the console.
const tracerOptions = { logger: console };

const tracer = initTracer(tracerConfig, tracerOptions);

// ✅ Good: Create spans
// Handles GET /api/users/:id inside a 'get-user' span. The span is tagged with
// the user's id on success, tagged and logged as an error on failure, and
// finished in either case.
function handleGetUser(req, res) {
  const requestSpan = tracer.startSpan('get-user');

  try {
    const foundUser = getUser(req.params.id);
    requestSpan.setTag('user.id', foundUser.id);
    res.json(foundUser);
  } catch (failure) {
    requestSpan.setTag('error', true);
    requestSpan.log({ event: 'error', message: failure.message });

    const errorBody = { error: 'Server error' };
    res.status(500).json(errorBody);
  } finally {
    requestSpan.finish();
  }
}

app.get('/api/users/:id', handleGetUser);

// ✅ Good: Trace async operations
/**
 * Loads a user and that user's posts inside a 'fetch-user-data' span.
 *
 * @param {*} userId - Identifier passed to User.findById and, as `userId`, to
 *   Post.find.
 * @returns {Promise<{user: *, posts: *}>} The results of the two lookups.
 * @throws Rethrows whatever either lookup fails with, after marking the span
 *   as an error. The span is finished in every case.
 */
async function fetchUserData(userId) {
  const span = tracer.startSpan('fetch-user-data');
  // Tag up front so that failed lookups are also attributable to a user.
  span.setTag('user.id', userId);

  try {
    // The two lookups do not depend on each other, so run them concurrently
    // instead of paying for both round trips back to back.
    const [user, posts] = await Promise.all([
      User.findById(userId),
      Post.find({ userId })
    ]);

    span.setTag('posts.count', posts.length);

    return { user, posts };
  } catch (err) {
    // Record the failure on the span; without this a failed trace looks the
    // same as a successful one.
    span.setTag('error', true);
    span.log({ event: 'error', message: err.message });
    throw err;
  } finally {
    span.finish();
  }
}

Health Checks

Health Check Endpoints

// ✅ Good: Liveness probe
// Answers unconditionally with a status and the current time; no dependency
// is consulted.
function reportLiveness(req, res) {
  const livenessBody = {
    status: 'alive',
    timestamp: new Date()
  };

  res.json(livenessBody);
}

app.get('/health/live', reportLiveness);

// ✅ Good: Readiness probe
// Pings the database and then the cache. If both respond, the service reports
// itself ready; if either ping fails, it answers 503 with the failure message.
async function reportReadiness(req, res) {
  try {
    // Check database
    await db.ping();

    // Check cache
    await redis.ping();

    const readyBody = {
      status: 'ready',
      database: 'ok',
      cache: 'ok'
    };
    res.json(readyBody);
  } catch (failure) {
    const notReadyBody = {
      status: 'not-ready',
      error: failure.message
    };
    res.status(503).json(notReadyBody);
  }
}

app.get('/health/ready', reportReadiness);

// ✅ Good: Startup probe
// Answers 503 with status 'starting' while `isInitialized` is falsy, and
// status 'started' once it is truthy.
app.get('/health/startup', async (req, res) => {
  try {
    // Check if application is fully initialized
    if (isInitialized) {
      res.json({ status: 'started' });
      return;
    }

    return res.status(503).json({ status: 'starting' });
  } catch (failure) {
    res.status(503).json({ error: failure.message });
  }
});

Best Practices

Log at appropriate levels:

// ✅ Good: Appropriate log levels
// Each message is logged at the level that matches its severity: a failure at
// error, a deprecation at warn, a routine event at info, query detail at debug.
logger.error('Database connection failed');
logger.warn('Deprecated API used');
logger.info('User logged in');
logger.debug('Query executed');

// ❌ Bad: Wrong levels
// The severities are swapped: a routine login is logged at error level and a
// connection failure at info level.
logger.error('User logged in');
logger.info('Database connection failed');

Include context in logs:

// ✅ Good: Include context
// The second argument records who made the request (userId), which request it
// was (requestId, path) and what went wrong (the error message).
// `req.user?.id` yields undefined instead of throwing when req.user is null or
// undefined.
logger.error('Request failed', {
  userId: req.user?.id,
  requestId: req.id,
  path: req.path,
  error: err.message
});

// ❌ Bad: No context
// Only the message is logged; nothing identifies the request or the cause.
logger.error('Request failed');

Monitor key metrics:

// ✅ Good: Monitor key metrics
// - Response time
// - Error rate
// - Memory usage
// - CPU usage
// - Database connections

// ❌ Bad: No monitoring

Summary

Monitoring and logging are essential. Key takeaways:

Implement structured logging
Aggregate logs
Track metrics
Set up alerts
Use distributed tracing
Implement health checks
Monitor key metrics
Maintain observability

Next Steps

Learn about Performance Testing
Explore Testing & QA
Study Integration Testing
Practice monitoring
Build observable systems