Introduction
ClickHouse excels in analytical scenarios where query speed on massive datasets is critical. From web analytics to IoT monitoring, ClickHouse powers real-time decision-making at companies handling billions of events daily.
This article explores practical ClickHouse use cases, implementation patterns, and best practices for production deployments.
Web Analytics
Page View Tracking
-- Create page views table
-- Sort key (country, timestamp, user_id) clusters rows for the country +
-- date-range filters used by the queries below; monthly partitions enable
-- cheap partition pruning and bulk drops.
CREATE TABLE page_views (
user_id UInt64,
session_id UUID,
page_url String,
referrer_url String,
country LowCardinality(String), -- LowCardinality: dictionary-encodes repetitive values
device_type LowCardinality(String),
browser LowCardinality(String),
timestamp DateTime,
load_time Float32, -- units not stated here -- presumably seconds, confirm with the producer
is_bounce UInt8 -- 0/1 flag (summed below to count bounces)
) ENGINE = MergeTree()
ORDER BY (country, timestamp, user_id)
PARTITION BY toYYYYMM(timestamp);
-- Query examples
-- Daily traffic per country: volume, approximate distinct visitors, mean load time
SELECT
    toDate(timestamp) AS date,
    country,
    count() AS page_views,
    uniq(user_id) AS unique_users,   -- uniq() is approximate; use uniqExact() when exactness matters
    avg(load_time) AS avg_load_time
FROM page_views
WHERE timestamp >= '2026-01-01'
GROUP BY
    date,
    country
ORDER BY
    date,
    page_views DESC;
-- Bounce rate analysis: pages with the highest share of bounced visits
SELECT
    page_url,
    count() AS total_visits,
    sum(is_bounce) AS bounces,
    -- ClickHouse lets later expressions reuse SELECT aliases
    round(bounces / total_visits * 100, 2) AS bounce_rate
FROM page_views
WHERE timestamp >= '2026-01-01'
GROUP BY page_url
HAVING total_visits > 100   -- skip low-traffic pages with noisy rates
ORDER BY bounce_rate DESC
LIMIT 20;
User Session Analysis
-- Session funnel analysis.
-- Each UNION ALL branch labels the sessions that hit one funnel page; the
-- outer query counts sessions per step and derives a step-over-step rate.
SELECT
    funnel_step,
    count() AS users,
    -- nullIf() prevents a divide-by-zero on the first step, where
    -- lagInFrame() has no preceding row and returns 0 (the original
    -- produced inf for step1); an explicit frame makes the lag span
    -- the whole ordered set deterministically.
    round(
        count() / nullIf(
            lagInFrame(count()) OVER (
                ORDER BY funnel_step
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            ),
            0
        ) * 100,
        1
    ) AS conversion_rate
FROM (
    SELECT
        session_id,
        groupArray(page_url) AS pages,
        'step1' AS funnel_step
    FROM page_views
    WHERE timestamp >= '2026-01-01'
    GROUP BY session_id
    HAVING has(pages, '/signup')
    UNION ALL
    SELECT
        session_id,
        groupArray(page_url) AS pages,
        'step2' AS funnel_step
    FROM page_views
    WHERE timestamp >= '2026-01-01'
    GROUP BY session_id
    HAVING has(pages, '/pricing')
    UNION ALL
    SELECT
        session_id,
        groupArray(page_url) AS pages,
        'step3' AS funnel_step
    FROM page_views
    WHERE timestamp >= '2026-01-01'
    GROUP BY session_id
    HAVING has(pages, '/thank-you')
)
GROUP BY funnel_step;
-- NOTE(review): steps are independent page-hit sets, not ordered visits;
-- for an order-aware funnel consider ClickHouse's windowFunnel().
Application Monitoring
Performance Monitoring
-- Create metrics table
-- Sort key (service, endpoint, timestamp) matches the per-service/endpoint
-- time-range scans below; TTL expires rows 30 days after `timestamp`
-- automatically during merges.
CREATE TABLE app_metrics (
service String,
endpoint String,
status UInt16, -- HTTP status code (>= 500 treated as an error below)
response_time Float32, -- units not stated -- presumably seconds, confirm with the emitter
timestamp DateTime,
host LowCardinality(String), -- LowCardinality: few distinct hosts/environments expected
environment LowCardinality(String)
) ENGINE = MergeTree()
ORDER BY (service, endpoint, timestamp)
PARTITION BY toYYYYMM(timestamp)
TTL timestamp + INTERVAL 30 DAY;
-- Performance alerts: endpoints breaching error or latency thresholds
-- over the last hour.
SELECT
    service,
    endpoint,
    quantile(0.95)(response_time) AS p95,
    quantile(0.99)(response_time) AS p99,
    max(response_time) AS max_response,
    count() AS total_requests,
    sum(status >= 500) AS errors   -- boolean condition sums as 0/1 per row
FROM app_metrics
WHERE timestamp >= now() - INTERVAL 1 HOUR
GROUP BY
    service,
    endpoint
HAVING
    errors > 10
    OR p99 > 5
ORDER BY errors DESC;
Error Tracking
-- Error aggregation: per-minute error volume by service and error type
SELECT
    toStartOfMinute(timestamp) AS minute,
    service,
    error_type,
    count() AS error_count
FROM app_logs
WHERE timestamp >= now() - INTERVAL 1 HOUR
  AND level = 'ERROR'
GROUP BY
    minute,
    service,
    error_type
ORDER BY
    minute,
    error_count DESC;
IoT and Sensors
Sensor Data
-- Create sensor data table
-- Sort key (sensor_id, timestamp) serves per-sensor time-series reads;
-- daily partitions (toYYYYMMDD) suit high-volume ingest -- watch total
-- partition count if retention grows long.
CREATE TABLE sensor_data (
sensor_id String,
location String,
temperature Nullable(Float32), -- Nullable: a reading may be absent for a given sample
humidity Nullable(Float32),
pressure Nullable(Float32),
battery Float32,
timestamp DateTime,
metadata String -- free-form; format unspecified -- presumably JSON, confirm with producers
) ENGINE = MergeTree()
ORDER BY (sensor_id, timestamp)
PARTITION BY toYYYYMMDD(timestamp);
-- Real-time aggregation: last hour's temperature and battery stats per sensor
SELECT
    sensor_id,
    location,
    avg(temperature) AS avg_temp,
    max(temperature) AS max_temp,
    min(temperature) AS min_temp,
    count() AS readings,
    avg(battery) AS avg_battery
FROM sensor_data
WHERE timestamp >= now() - INTERVAL 1 HOUR
GROUP BY
    sensor_id,
    location
ORDER BY
    location,
    sensor_id;
-- Anomaly detection: flag readings more than 3 sigma from a rolling
-- per-sensor mean over the trailing 100 readings.
-- Fixes: the outer query previously called the aggregate
-- stddevPop(temperature) directly (invalid without GROUP BY) instead of
-- using the windowed stddev_temp computed in the subquery, and could
-- divide by zero when a sensor's readings are constant.
SELECT
    sensor_id,
    timestamp,
    temperature,
    avg_temp,
    abs(temperature - avg_temp) / stddev_temp AS z_score
FROM (
    SELECT
        sensor_id,
        timestamp,
        temperature,
        avg(temperature) OVER w AS avg_temp,
        stddevPop(temperature) OVER w AS stddev_temp
    FROM sensor_data
    WHERE timestamp >= now() - INTERVAL 1 DAY
    -- single named window instead of two duplicated frame definitions
    WINDOW w AS (
        PARTITION BY sensor_id
        ORDER BY timestamp
        ROWS BETWEEN 100 PRECEDING AND CURRENT ROW
    )
)
WHERE stddev_temp > 0   -- constant signal has no meaningful z-score
  AND abs(temperature - avg_temp) / stddev_temp > 3;
Business Intelligence
Revenue Analytics
-- Revenue dashboard: daily revenue KPIs by product category and country
SELECT
    toDate(timestamp) AS date,
    product_category,
    country,
    count() AS transactions,
    sum(amount) AS revenue,
    avg(amount) AS avg_order_value,
    sum(amount) / count(DISTINCT user_id) AS revenue_per_user
FROM transactions
WHERE timestamp >= '2026-01-01'
GROUP BY
    date,
    product_category,
    country
ORDER BY
    date DESC,
    revenue DESC;
-- Cohort analysis: active users and revenue by acquisition cohort.
-- Fixes: toMonth() returned only the month number (1-12), merging
-- January 2025 with January 2026 into one cohort -- toStartOfMonth()
-- keeps the year; positional GROUP BY/ORDER BY replaced with names.
WITH users AS (
    -- each user's first purchase date defines their cohort
    SELECT
        user_id,
        min(toDate(timestamp)) AS cohort_date
    FROM transactions
    GROUP BY user_id
)
SELECT
    toStartOfMonth(cohort_date) AS cohort_month,
    dateDiff('month', cohort_date, toDate(timestamp)) AS month_number,
    count(DISTINCT t.user_id) AS active_users,
    sum(t.amount) AS revenue
FROM transactions AS t
INNER JOIN users AS u ON t.user_id = u.user_id
GROUP BY
    cohort_month,
    month_number
ORDER BY
    cohort_month,
    month_number;
Real-time KPIs
import clickhouse_connect


class RealtimeKPI:
    """Real-time KPI calculations backed by ClickHouse.

    Queries the `transactions` and `events` tables, which must already
    exist on the target server.
    """

    def __init__(self, config):
        """Open a ClickHouse client.

        Args:
            config: keyword arguments forwarded to
                clickhouse_connect.get_client() (host, port, ...).
        """
        self.client = clickhouse_connect.get_client(**config)

    def _scalar(self, sql):
        """Run a single-value query and return the first cell (or None)."""
        rows = self.client.query(sql).result_rows
        return rows[0][0] if rows else None

    def get_kpis(self):
        """Calculate key KPIs: today's revenue/orders and 7-day active users.

        Returns:
            dict with keys revenue_today, orders_today, active_users_7d.
        """
        revenue = self._scalar(
            "SELECT sum(amount) FROM transactions WHERE toDate(timestamp) = today()"
        )
        orders = self._scalar(
            "SELECT count() FROM transactions WHERE toDate(timestamp) = today()"
        )
        users = self._scalar(
            "SELECT uniq(user_id) FROM events WHERE timestamp >= now() - INTERVAL 7 DAY"
        )
        # sum() over zero rows yields NULL -> None; normalize to 0 for callers.
        return {
            'revenue_today': revenue or 0,
            'orders_today': orders or 0,
            'active_users_7d': users or 0,
        }

    def get_trends(self, days=30):
        """Get daily event counts and revenue for the last `days` days.

        Args:
            days: lookback window in days (coerced to int).

        Returns:
            pandas.DataFrame with columns date, events, revenue.
        """
        # Server-side parameter binding ({days:UInt32}) replaces the
        # original f-string interpolation, which was an SQL-injection
        # vector if `days` ever came from user input. query_df() is the
        # clickhouse-connect API for DataFrame results; the original
        # `.result_set.to_pandas()` is not part of that API.
        return self.client.query_df(
            """
            SELECT
                toDate(timestamp) AS date,
                count() AS events,
                sum(amount) AS revenue
            FROM transactions
            WHERE timestamp >= today() - INTERVAL {days:UInt32} DAY
            GROUP BY date
            ORDER BY date
            """,
            parameters={'days': int(days)},
        )


# Usage
kpi = RealtimeKPI({'host': 'localhost'})
print(kpi.get_kpis())
Log Management
Application Logs
-- Create logs table
-- Sort key (service, timestamp, level) fits the service + time-range +
-- level filters in the search below. Note the TTL expires rows after
-- 7 days even though partitions are monthly.
CREATE TABLE logs (
timestamp DateTime,
level String, -- e.g. 'ERROR', 'FATAL' per the queries below
service String,
message String,
trace_id String,
user_id Nullable(UInt64), -- NULL for log lines not tied to a user
metadata String
) ENGINE = MergeTree()
ORDER BY (service, timestamp, level)
PARTITION BY toYYYYMM(timestamp)
TTL timestamp + INTERVAL 7 DAY;
-- Log search: most recent severe entries for one service in the last hour
SELECT
    timestamp,
    level,
    service,
    message
FROM logs
WHERE timestamp >= now() - INTERVAL 1 HOUR
  AND service = 'api'
  AND level IN ('ERROR', 'FATAL')
ORDER BY timestamp DESC
LIMIT 100;
-- Error patterns: bucket messages by the text captured after 'Error: '
SELECT
    extract(message, 'Error: (.+)') AS error_pattern,
    count() AS occurrences
FROM logs
WHERE timestamp >= now() - INTERVAL 1 DAY
  AND level = 'ERROR'
GROUP BY error_pattern
ORDER BY occurrences DESC
LIMIT 20;
Time Series Analysis
Stock/Financial Data
-- Create financial data table
-- Sort key (symbol, timestamp) supports the per-symbol time-series scans
-- used by the indicator queries below.
CREATE TABLE stock_prices (
symbol String,
timestamp DateTime,
open Float32, -- NOTE(review): Float32 is lossy for prices; consider Decimal(18, 4) if exact values matter
high Float32,
low Float32,
close Float32,
volume UInt64
) ENGINE = MergeTree()
ORDER BY (symbol, timestamp);
-- Technical indicators: 20-row simple moving average of the close price
-- and the current price's gap to it.
SELECT
    symbol,
    timestamp,
    close,
    avg(close) OVER sma_frame AS sma_20,
    close - avg(close) OVER sma_frame AS price_diff
FROM stock_prices
WHERE symbol = 'AAPL'
-- one named window instead of two duplicated frame definitions
WINDOW sma_frame AS (
    PARTITION BY symbol
    ORDER BY timestamp
    ROWS BETWEEN 19 PRECEDING AND CURRENT ROW
)
ORDER BY timestamp;
-- Volatility calculation: daily close-price dispersion and trading range
SELECT
    symbol,
    toStartOfDay(timestamp) AS date,
    stddevPop(close) AS volatility,
    max(high) - min(low) AS daily_range
FROM stock_prices
WHERE timestamp >= '2026-01-01'
GROUP BY
    symbol,
    date
ORDER BY volatility DESC
LIMIT 10;
Ad Tech / Real-Time Bidding
Bid Analysis
-- Bid stream analysis
-- NOTE(review): the sort key leads with timestamp, so campaign/country
-- filters always scan the full time range; a lower-cardinality leading
-- column may serve the queries below better -- confirm access patterns.
CREATE TABLE bids (
timestamp DateTime,
auction_id UUID,
bidder_id UInt64,
impression_id UUID,
campaign_id UInt64,
bid_amount Float32,
won UInt8, -- 0/1 flag; summed below to count wins
win_amount Nullable(Float32), -- presumably NULL for lost bids -- confirm with ingest pipeline
country LowCardinality(String)
) ENGINE = MergeTree()
ORDER BY (timestamp, auction_id)
PARTITION BY toYYYYMMDD(timestamp);
-- Win rate by campaign over the last 24 hours.
SELECT
    campaign_id,
    count() AS bids,
    sum(won) AS wins,
    round(sum(won) / count() * 100, 2) AS win_rate,
    -- nullIf() avoids nan when a campaign won nothing: HAVING bids > 100
    -- does not guarantee wins > 0.
    sum(win_amount) / nullIf(sum(won), 0) AS avg_win_price
FROM bids
WHERE timestamp >= now() - INTERVAL 1 DAY
GROUP BY campaign_id
HAVING bids > 100   -- only campaigns with enough volume to be meaningful
ORDER BY win_rate DESC;
-- Revenue optimization: revenue, impressions, and effective CPM per country
SELECT
    country,
    sum(win_amount) AS revenue,
    count() AS impressions,
    round(sum(win_amount) / count() * 1000, 2) AS cpm   -- revenue per 1000 won impressions
FROM bids
WHERE won = 1
  AND timestamp >= now() - INTERVAL 7 DAY
GROUP BY country
ORDER BY revenue DESC;
Best Practices Summary
When to Use ClickHouse
| Use Case | Recommendation |
|---|---|
| Web analytics | ✅ Excellent |
| Application monitoring | ✅ Excellent |
| IoT/sensors | ✅ Excellent |
| Business intelligence | ✅ Excellent |
| Real-time dashboards | ✅ Excellent |
| Transactional (OLTP) | ❌ Not suitable |
| Full-text search | ⚠️ Limited |
| Graph data | ❌ Not suitable |
Performance Tips
-- Partition by time
PARTITION BY toYYYYMM(timestamp)
-- Good primary key
ORDER BY (event_type, timestamp, user_id)
-- Batch inserts (1000+ rows)
INSERT INTO table VALUES (1,'a'),(2,'b'),...;
-- Filter on partition key
WHERE timestamp >= '2026-01-01'
Resources
Conclusion
ClickHouse excels in analytical scenarios requiring real-time queries on massive datasets. From web analytics to IoT monitoring, ClickHouse provides the performance needed for modern data-driven applications.
With this comprehensive guide, you now have the knowledge to implement ClickHouse effectively in various production scenarios.
Comments