Monitoring and Logging
Monitoring and logging are critical for maintaining production applications. This article covers observability best practices.
Introduction
Monitoring and logging provide:
- System visibility
- Performance tracking
- Error detection
- Debugging capabilities
- Alerting
Understanding monitoring helps you:
- Track application health
- Detect issues early
- Debug problems
- Optimize performance
- Maintain reliability
Logging
Logging Frameworks
// Good: Winston logger setup
const winston = require('winston');

// Shared format: timestamp, error formatting with stacks enabled, JSON output.
const structuredFormat = winston.format.combine(
  winston.format.timestamp(),
  winston.format.errors({ stack: true }),
  winston.format.json()
);

// error.log is limited to the 'error' level; combined.log sets no level of its own.
const fileTransports = [
  new winston.transports.File({ filename: 'error.log', level: 'error' }),
  new winston.transports.File({ filename: 'combined.log' })
];

const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: structuredFormat,
  defaultMeta: { service: 'my-app' },
  transports: fileTransports
});

// Outside production, additionally log to the console using the "simple" format.
const isProduction = process.env.NODE_ENV === 'production';
if (!isProduction) {
  const consoleTransport = new winston.transports.Console({
    format: winston.format.simple()
  });
  logger.add(consoleTransport);
}
// Good: Log different levels
// Pass the message and stack explicitly: an Error's own `message` and `stack`
// properties are non-enumerable, so JSON-serializing `{ error: err }` would
// emit `"error": {}` and lose the details.
logger.error('Error message', { error: err.message, stack: err.stack });
logger.warn('Warning message');
logger.info('Info message');
logger.debug('Debug message');
// Good: Structured logging
// Gather the login context first, then attach it as metadata to one info entry.
const loginContext = {
  userId: user.id,
  email: user.email,
  timestamp: new Date(),
  ip: req.ip
};
logger.info('User login', loginContext);
Log Aggregation
// Good: Send logs to ELK Stack
const winston = require('winston');
// winston-elasticsearch exports the transport class; there is no lowercase
// `winston.transports.elasticsearch` constructor to instantiate.
const { ElasticsearchTransport } = require('winston-elasticsearch');

const logger = winston.createLogger({
  transports: [
    new ElasticsearchTransport({
      level: 'info',
      // Overridable per environment; falls back to the local default node.
      clientOpts: { node: process.env.ELASTICSEARCH_URL || 'http://localhost:9200' },
      index: 'logs'
    })
  ]
});
// Good: Send logs to Datadog
const winston = require('winston');
const WinstonDatadog = require('winston-datadog');

// The API key is read from the environment instead of being hard-coded.
const datadogTransport = new WinstonDatadog({
  apiKey: process.env.DATADOG_API_KEY,
  hostname: 'my-app',
  service: 'my-service',
  ddsource: 'nodejs'
});

const logger = winston.createLogger({ transports: [datadogTransport] });
// Good: Send logs to CloudWatch
const winston = require('winston');
const WinstonCloudWatch = require('winston-cloudwatch');

// Target log group, stream, and AWS region for the transport.
const cloudWatchTransport = new WinstonCloudWatch({
  logGroupName: '/aws/lambda/my-app',
  logStreamName: 'production',
  awsRegion: 'us-east-1'
});

const logger = winston.createLogger({ transports: [cloudWatchTransport] });
Metrics and Monitoring
Prometheus Metrics
// Good: Install Prometheus client
// npm install prom-client
const prometheus = require('prom-client');

// Create metrics
// Histogram of request durations, labelled by method, route, and status code.
const httpRequestDurationOptions = {
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code']
};
const httpRequestDuration = new prometheus.Histogram(httpRequestDurationOptions);

// Gauge for the number of active connections.
const activeConnectionsOptions = {
  name: 'active_connections',
  help: 'Number of active connections'
};
const activeConnections = new prometheus.Gauge(activeConnectionsOptions);
// Good: Middleware to track metrics
/**
 * Express middleware that observes each request's duration (in seconds) on
 * the httpRequestDuration histogram once the response emits 'finish'.
 */
function trackRequestDuration(req, res, next) {
  const startedAt = Date.now();

  const recordDuration = () => {
    const elapsedSeconds = (Date.now() - startedAt) / 1000;
    // Use the matched route pattern when there is one, else the raw path.
    const routeLabel = req.route?.path || req.path;
    httpRequestDuration
      .labels(req.method, routeLabel, res.statusCode)
      .observe(elapsedSeconds);
  };

  res.on('finish', recordDuration);
  next();
}

app.use(trackRequestDuration);
// Good: Expose metrics endpoint
// register.metrics() returns a Promise in current prom-client releases, so it
// must be awaited before being written to the response (awaiting is also
// harmless if an older release returns a plain string).
app.get('/metrics', async (req, res) => {
  try {
    const body = await prometheus.register.metrics();
    res.set('Content-Type', prometheus.register.contentType);
    res.end(body);
  } catch (err) {
    // Report collection failures instead of leaving the request unanswered.
    res.status(500).end('Failed to collect metrics');
  }
});
// Good: Custom metrics
const userCounterOptions = {
  name: 'users_created_total',
  help: 'Total number of users created'
};
const userCounter = new prometheus.Counter(userCounterOptions);

/** Handles POST /users: increments the counter and replies with a confirmation. */
function handleCreateUser(req, res) {
  // Create user
  userCounter.inc();
  res.json({ message: 'User created' });
}

app.post('/users', handleCreateUser);
Application Performance Monitoring
// Good: New Relic APM
const newrelic = require('newrelic');
// Good: Datadog APM
const tracer = require('dd-trace').init();
// Good: Custom performance tracking
// In-memory counters; avgResponseTime is in milliseconds.
const performanceMetrics = {
  requests: 0,
  errors: 0,
  avgResponseTime: 0
};

// Update the counters once each response has finished.
app.use((req, res, next) => {
  const start = Date.now();
  res.on('finish', () => {
    const duration = Date.now() - start;
    performanceMetrics.requests++;
    // Incremental mean over all requests. Averaging the previous value with
    // the new sample ((avg + duration) / 2) is not a mean: the first request
    // would report half its duration and older samples decay by half each time.
    performanceMetrics.avgResponseTime +=
      (duration - performanceMetrics.avgResponseTime) / performanceMetrics.requests;
    // Any 4xx or 5xx response counts as an error.
    if (res.statusCode >= 400) {
      performanceMetrics.errors++;
    }
  });
  next();
});

// Expose the current counters as JSON.
app.get('/performance', (req, res) => {
  res.json(performanceMetrics);
});
Alerting
Alert Configuration
// Good: Alert on high error rate
/**
 * Sends an alert when more than 5% of the requests recorded in
 * performanceMetrics were errors. Does nothing before the first request.
 */
const checkErrorRate = () => {
  const { errors, requests } = performanceMetrics;
  // Without traffic the ratio would be 0 / 0 (NaN); there is nothing to evaluate.
  if (requests === 0) {
    return;
  }
  const errorRate = errors / requests;
  if (errorRate > 0.05) { // 5% error rate
    // sendAlert is async and this check is fire-and-forget: handle a failed
    // delivery here so it cannot surface as an unhandled rejection.
    Promise.resolve(sendAlert('High error rate detected', {
      errorRate,
      errors,
      requests
    })).catch((err) => {
      console.error('Failed to send error-rate alert', err);
    });
  }
};
// Good: Alert on slow response time
/**
 * Sends an alert when performanceMetrics.avgResponseTime exceeds 1000 ms.
 */
const checkResponseTime = () => {
  const { avgResponseTime } = performanceMetrics;
  if (avgResponseTime > 1000) { // 1 second
    // sendAlert is async and this check is fire-and-forget: handle a failed
    // delivery here so it cannot surface as an unhandled rejection.
    Promise.resolve(sendAlert('Slow response time detected', {
      avgResponseTime
    })).catch((err) => {
      console.error('Failed to send response-time alert', err);
    });
  }
};
// Good: Alert on high memory usage
/**
 * Sends an alert when heapUsed exceeds 90% of heapTotal, as reported by
 * process.memoryUsage().
 */
const checkMemory = () => {
  const { heapUsed, heapTotal } = process.memoryUsage();
  // NOTE(review): heapTotal is the heap V8 has currently allocated, not its
  // configured limit, so this ratio may run high on a healthy process —
  // confirm whether the heap size limit would be a better denominator.
  const heapUsedPercent = (heapUsed / heapTotal) * 100;
  if (heapUsedPercent > 90) {
    // sendAlert is async and this check is fire-and-forget: handle a failed
    // delivery here so it cannot surface as an unhandled rejection.
    Promise.resolve(sendAlert('High memory usage', {
      heapUsedPercent,
      heapUsed,
      heapTotal
    })).catch((err) => {
      console.error('Failed to send memory alert', err);
    });
  }
};
// Good: Run checks periodically
const CHECK_INTERVAL_MS = 60000; // Every minute

/** Runs the error-rate, response-time, and memory checks, in that order. */
function runPeriodicChecks() {
  checkErrorRate();
  checkResponseTime();
  checkMemory();
}

setInterval(runPeriodicChecks, CHECK_INTERVAL_MS);
// Good: Send alerts
/**
 * Sends an alert to Slack (SLACK_WEBHOOK) and PagerDuty (PAGERDUTY_KEY).
 *
 * Each channel is attempted independently, so a Slack outage cannot prevent
 * the PagerDuty page. Failures, including non-2xx responses, are logged with
 * console.error rather than thrown, which keeps fire-and-forget callers from
 * triggering unhandled rejections.
 *
 * @param {string} title - Short alert summary.
 * @param {object} data - Extra context; pretty-printed for Slack and attached
 *   to the PagerDuty event as custom_details.
 * @returns {Promise<boolean>} true when every channel accepted the alert.
 */
async function sendAlert(title, data) {
  // POST a JSON payload and treat any non-2xx response as a failure.
  const postJson = async (url, payload) => {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    });
    if (!response.ok) {
      // The URL is deliberately omitted: the Slack webhook URL is a secret.
      throw new Error(`HTTP ${response.status}`);
    }
  };

  const channels = [
    {
      name: 'Slack',
      send: () => postJson(process.env.SLACK_WEBHOOK, {
        text: title,
        attachments: [{
          text: JSON.stringify(data, null, 2)
        }]
      })
    },
    {
      name: 'PagerDuty',
      send: () => postJson('https://events.pagerduty.com/v2/enqueue', {
        routing_key: process.env.PAGERDUTY_KEY,
        event_action: 'trigger',
        payload: {
          summary: title,
          severity: 'critical',
          source: 'my-app',
          custom_details: data
        }
      })
    }
  ];

  const outcomes = await Promise.allSettled(channels.map(({ send }) => send()));

  let delivered = true;
  outcomes.forEach((outcome, index) => {
    if (outcome.status === 'rejected') {
      delivered = false;
      console.error(`Failed to send alert to ${channels[index].name}:`, outcome.reason);
    }
  });
  return delivered;
}
Distributed Tracing
Tracing Setup
// Good: Jaeger tracing
const { initTracer } = require('jaeger-client');

// Tracer configuration: service name, sampler, and where spans are reported.
const tracerConfig = {
  serviceName: 'my-app',
  // 'const' sampler with param 1 — presumably samples every trace; confirm
  // against the jaeger-client documentation.
  sampler: {
    type: 'const',
    param: 1
  },
  reporter: {
    logSpans: true,
    agentHost: 'localhost',
    agentPort: 6831
  }
};

// The tracer writes its own log output to the console.
const tracerOptions = { logger: console };

const tracer = initTracer(tracerConfig, tracerOptions);
// Good: Create spans
/**
 * Handles GET /api/users/:id inside a 'get-user' span. On success the span is
 * tagged with the user's id; on failure it is tagged as an error and a 500 is
 * returned. The span is finished in both cases.
 */
function getUserHandler(req, res) {
  const userSpan = tracer.startSpan('get-user');
  try {
    const foundUser = getUser(req.params.id);
    userSpan.setTag('user.id', foundUser.id);
    res.json(foundUser);
  } catch (failure) {
    userSpan.setTag('error', true);
    userSpan.log({ event: 'error', message: failure.message });
    res.status(500).json({ error: 'Server error' });
  } finally {
    userSpan.finish();
  }
}

app.get('/api/users/:id', getUserHandler);
// Good: Trace async operations
/**
 * Loads a user and that user's posts inside a 'fetch-user-data' span.
 *
 * @param {*} userId - Identifier passed to User.findById and Post.find.
 * @returns {Promise<{user: *, posts: Array}>} The user and their posts.
 * @throws Rethrows any lookup failure after recording it on the span.
 */
async function fetchUserData(userId) {
  const span = tracer.startSpan('fetch-user-data');
  // Tag up front so the span identifies the user even when a lookup fails.
  span.setTag('user.id', userId);
  try {
    // The two lookups do not depend on each other, so run them concurrently.
    const [user, posts] = await Promise.all([
      User.findById(userId),
      Post.find({ userId })
    ]);
    span.setTag('posts.count', posts.length);
    return { user, posts };
  } catch (err) {
    // Record the failure on the span, as the route handlers do, then let
    // the caller handle it.
    span.setTag('error', true);
    span.log({ event: 'error', message: err.message });
    throw err;
  } finally {
    span.finish();
  }
}
Health Checks
Health Check Endpoints
// Good: Liveness probe
/** Replies with a fixed 'alive' status and the current time; checks nothing else. */
function livenessProbe(req, res) {
  const payload = {
    status: 'alive',
    timestamp: new Date()
  };
  res.json(payload);
}

app.get('/health/live', livenessProbe);
// Good: Readiness probe
// Pings the database and the cache concurrently and reports the state of each
// one, so a 503 shows which dependency is down. The success body is unchanged;
// the failure body keeps `status` and `error` and adds per-dependency fields.
app.get('/health/ready', async (req, res) => {
  try {
    // Async wrappers turn a synchronous throw from ping() into a rejection.
    const checks = {
      database: async () => db.ping(),
      cache: async () => redis.ping()
    };
    const names = Object.keys(checks);
    const outcomes = await Promise.allSettled(names.map((name) => checks[name]()));

    const report = Object.fromEntries(
      names.map((name, index) => [
        name,
        outcomes[index].status === 'fulfilled' ? 'ok' : 'failed'
      ])
    );

    // The first failure in declaration order (database, then cache) supplies
    // the error message.
    const firstFailure = outcomes.find((outcome) => outcome.status === 'rejected');
    if (!firstFailure) {
      res.json({ status: 'ready', ...report });
      return;
    }
    res.status(503).json({
      status: 'not-ready',
      ...report,
      error: firstFailure.reason?.message
    });
  } catch (err) {
    res.status(503).json({
      status: 'not-ready',
      error: err.message
    });
  }
});
// Good: Startup probe
/**
 * Replies 200 'started' when isInitialized is truthy and 503 'starting'
 * otherwise; any thrown error is reported as a 503 with its message.
 */
async function startupProbe(req, res) {
  try {
    // Check if application is fully initialized
    if (isInitialized) {
      res.json({ status: 'started' });
      return;
    }
    res.status(503).json({ status: 'starting' });
  } catch (err) {
    res.status(503).json({ error: err.message });
  }
}

app.get('/health/startup', startupProbe);
Best Practices
- Log at appropriate levels:
// Good: Appropriate log levels
logger.error('Database connection failed');
logger.warn('Deprecated API used');
logger.info('User logged in');
logger.debug('Query executed');
// Bad: Wrong levels
logger.error('User logged in');
logger.info('Database connection failed');
- Include context in logs:
// Good: Include context
logger.error('Request failed', {
  userId: req.user?.id,
  requestId: req.id,
  path: req.path,
  error: err.message
});
// Bad: No context
logger.error('Request failed');
- Monitor key metrics:
// Good: Monitor key metrics
// - Response time
// - Error rate
// - Memory usage
// - CPU usage
// - Database connections
// Bad: No monitoring
Summary
Monitoring and logging are essential. Key takeaways:
- Implement structured logging
- Aggregate logs
- Track metrics
- Set up alerts
- Use distributed tracing
- Implement health checks
- Monitor key metrics
- Maintain observability
Related Resources
Next Steps
- Learn about Performance Testing
- Explore Testing & QA
- Study Integration Testing
- Practice monitoring
- Build observable systems
Comments