Introduction
Metrics are the backbone of observability. From system resource usage to business KPIs, collecting and analyzing time-series data enables alerting, dashboards, and capacity planning.
Key Statistics:
- Prometheus: used in roughly 70% of Kubernetes monitoring stacks
- InfluxDB: 1M metrics/second ingestion
- Time-series DB market: $4B by 2025
Prometheus Architecture
┌─────────────────────────────────────────────────────────────────┐
│ Prometheus Architecture │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Targets (Scrape Targets) │ │
│ │ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ │ │
│ │ │ Node │ │ App │ │ K8s │ │ App │ │ │
│ │ │Exporter│ │Metrics │ │ cAdvisor│ │SDK │ │ │
│ │ └────────┘ └────────┘ └────────┘ └────────┘ │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Prometheus Server │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │ TSDB │ │Scrape │ │ Rules │ │ │
│ │ │(Storage)│ │ Engine │ │(Alerts)│ │ │
│ │ └─────────┘ └─────────┘ └─────────┘ │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ Query & Visualization │ │
│ │ ┌────────┐ ┌────────┐ ┌────────┐ │ │
│ │ │PromQL │ │Grafana │ │Alerts │ │ │
│ │ └────────┘ └────────┘ └────────┘ │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
Prometheus Configuration
# prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often recording/alerting rules are evaluated

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

rule_files:
  - '/etc/prometheus/rules/*.yml'

scrape_configs:
  # Prometheus scraping its own /metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host-level metrics via node_exporter.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
      - role: endpoints
    relabel_configs:
      # Keep only the https endpoint of the `kubernetes` service in `default`.
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https

  - job_name: 'kubernetes-nodes'
    kubernetes_sd_configs:
      - role: node
    relabel_configs:
      # Copy all Kubernetes node labels onto the scraped metrics.
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Quoted "true": an unquoted true parses as a YAML boolean, not the
      # regex string needed to match the prometheus.io/scrape annotation.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: "true"
      # Honor a custom metrics path from the prometheus.io/path annotation.
      - action: replace
        source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        target_label: __metrics_path__
        regex: (.+)
Metric Types
# Counter: monotonically increasing; resets only on process restart.
# Example: http_requests_total
- name: http_requests_total
  type: counter
  help: Total HTTP requests
  metrics:
    - labels:
        method: GET
        # Prometheus label values are strings — quote to avoid integer typing.
        status: "200"
      value: 15234

# Gauge: can go up or down.
# Example: memory_usage_bytes
- name: memory_usage_bytes
  type: gauge
  help: Current memory usage
  metrics:
    - labels:
        pod: api-0
      value: 536870912  # 512 MiB

# Histogram: observations counted into cumulative buckets.
# Example: http_request_duration_seconds
- name: http_request_duration_seconds
  type: histogram
  help: HTTP request duration
  buckets: [0.1, 0.25, 0.5, 1.0, 2.5, 5.0]
  metrics:
    - labels:
        method: GET
        # `le` is a label, hence a string — quote so it is not parsed as a float.
        le: "0.1"
      value: 12345

# Summary: client-side precomputed quantiles.
# Example: rpc_duration_seconds
- name: rpc_duration_seconds
  type: summary
  quantiles:
    - quantile: 0.5
      value: 0.05
    - quantile: 0.9
      value: 0.12
InfluxDB
Python Client
#!/usr/bin/env python3
"""InfluxDB client."""
from influxdb_client import InfluxDBClient, Point
from influxdb_client.client.write_api import SYNCHRONOUS
class MetricsCollector:
    """Write and query metrics against an InfluxDB 2.x server.

    Uses a SYNCHRONOUS write API so each write is flushed immediately
    rather than batched.
    """

    def __init__(self, url, token, org):
        """Connect to InfluxDB.

        Args:
            url: Base URL of the InfluxDB server.
            token: API token used for authentication.
            org: Organization name all writes/queries run against.
        """
        # Remember the org so write/bucket calls use it instead of a
        # hard-coded "my-org" that could disagree with the constructor arg.
        self.org = org
        self.client = InfluxDBClient(url=url, token=token, org=org)
        self.write_api = self.client.write_api(write_options=SYNCHRONOUS)
        self.query_api = self.client.query_api()

    def write_point(self, measurement, tags, fields, timestamp=None):
        """Write a single point to the `metrics` bucket.

        Args:
            measurement: Measurement name.
            tags: Mapping of tag key -> tag value; None values are skipped.
            fields: Mapping of field key -> field value.
            timestamp: Optional point timestamp.
        """
        point = Point(measurement)
        # Write every provided tag and field, not just host/service/value;
        # skip None tags, which would otherwise be serialized as "None".
        for key, value in tags.items():
            if value is not None:
                point = point.tag(key, value)
        for key, value in fields.items():
            point = point.field(key, value)
        if timestamp:
            point = point.time(timestamp)
        self.write_api.write(bucket="metrics", org=self.org, record=point)

    def write_line_protocol(self, line_protocol):
        """Write pre-formatted line protocol directly to the `metrics` bucket."""
        self.write_api.write(bucket="metrics", org=self.org, record=line_protocol)

    def query(self, query):
        """Run a Flux query and return the result as a DataFrame."""
        return self.query_api.query_data_frame(query)

    def create_bucket(self, name, retention_days=30):
        """Create a bucket with an expiry-based retention rule.

        Args:
            name: Bucket name.
            retention_days: Days of data to retain (default 30).

        Returns:
            The created Bucket object.
        """
        # Use the client's buckets_api() accessor — the class is BucketsApi
        # (plural); the original `BucketApi` import does not exist in
        # influxdb_client.
        buckets_api = self.client.buckets_api()
        return buckets_api.create_bucket(
            bucket_name=name,
            org=self.org,
            retention_rules=[
                # "type": "expire" is required by the retention-rule schema.
                {"type": "expire", "everySeconds": retention_days * 86400}
            ],
        )
Telegraf
# telegraf.conf
[agent]
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "10s"
  flush_jitter = "0s"

# Inputs
[[inputs.cpu]]
  percpu = false
  totalcpu = true
  collect_cpu_time = false

[[inputs.mem]]

[[inputs.disk]]
  ignore_fs = ["tmpfs", "devtmpfs", "devfs"]

[[inputs.processes]]

[[inputs.net]]
  interfaces = ["eth0"]

[[inputs.kubernetes]]
  url = "https://kubernetes.default.svc"
  bearer_token = "/var/run/secrets/kubernetes.io/serviceaccount/token"
  # NOTE(review): disables TLS verification for the API server; prefer
  # tls_ca = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" instead.
  insecure_skip_verify = true

[[inputs.prometheus]]
  urls = ["http://prometheus:9090/metrics"]

[[inputs.mysql]]
  # NOTE(review): avoid committing a plaintext password; inject it via an
  # environment variable or Telegraf's secret-store support.
  servers = ["user:password@tcp(localhost:3306)/"]

# Outputs
[[outputs.influxdb]]
  urls = ["http://influxdb:8086"]
  database = "telegraf"
  retention_policy = "autogen"

[[outputs.elasticsearch]]
  urls = ["http://elasticsearch:9200"]
  index_name = "telegraf"

# Processors
[[processors.rename]]
  # The rename processor takes `replace` sub-tables with a
  # measurement/tag/field selector and a `dest`;
  # `[[processors.rename.rename]]` with `old =` is not valid config.
  [[processors.rename.replace]]
    tag = "host"
    dest = "hostname"
Comments