Introduction
ClickHouse is an open-source column-oriented database management system that enables real-time analytical data processing. Known for its exceptional query performance on massive datasets, ClickHouse powers analytics at major companies like Cloudflare, Spotify, and eBay.
In 2026, ClickHouse continues to lead the OLAP space with new features including vector similarity search, improved AI integrations, and cloud-native deployments. This comprehensive guide covers ClickHouse fundamentals for building high-performance analytical systems.
What is ClickHouse?
ClickHouse is a column-oriented DBMS (Database Management System) designed for Online Analytical Processing (OLAP). It excels at processing complex queries on petabyte-scale data with sub-second latency.
Key Characteristics
- Columnar Storage: Data stored by columns, not rows
- Vectorized Execution: Processes data in batches using SIMD
- Distributed Architecture: Scale horizontally across multiple nodes
- Compression: Highly efficient data compression
- Real-time Ingestion: Supports high-throughput data loading
- SQL Support: Familiar SQL dialect with extensions
- MPP Architecture: Massively Parallel Processing
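The columnar layout pays off when a query touches only a few columns of a wide table. A minimal sketch (the table and its columns are illustrative, not from this guide):

```sql
-- Only the two referenced columns are read from disk; the other
-- columns of the (hypothetical) wide table are never touched.
SELECT user_id, sum(revenue) AS total_revenue
FROM page_hits        -- illustrative wide table with dozens of columns
GROUP BY user_id;
```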
ClickHouse vs Other Databases
| Aspect | ClickHouse | PostgreSQL | Elasticsearch |
|---|---|---|---|
| Type | Columnar OLAP | Row-based OLTP | Search engine |
| Query Speed | Milliseconds | Seconds | Seconds |
| Data Size | PB scale | TB scale | PB scale |
| Joins | Limited | Full | Limited |
| SQL Support | Extensive | Full | Limited |
ClickHouse Data Types
Numeric Types
-- Integer types (each column needs a name)
CREATE TABLE integer_types (
    tiny UInt8,      -- 0 to 255
    small UInt16,    -- 0 to 65535
    medium UInt32,   -- 0 to 4294967295
    normal Int32,    -- -2147483648 to 2147483647
    big Int64,       -- large integers
    huge Int128,     -- very large integers
    huge2 Int256
);
-- Floating-point types
CREATE TABLE measurements (
id UInt64,
temperature Float32, -- Single precision
pressure Float64 -- Double precision
);
-- Note: Float operations can have precision issues
SELECT 1 - 0.9; -- Returns 0.09999999999999998
-- Use Decimal for precision
CREATE TABLE financials (
amount Decimal(10, 2), -- 10 digits, 2 decimal places
rate Decimal(5, 4)
);
String Types
-- String types (ClickHouse has no length-parameterized String)
CREATE TABLE text_data (
    variable String,        -- variable length, arbitrary bytes
    fixed FixedString(10)   -- exactly 10 bytes, null-padded
);
-- Special strings
CREATE TABLE logs (
message String,
payload String
);
Date and Time Types
-- Date types
CREATE TABLE date_events (
    event_date Date,                 -- 'YYYY-MM-DD'
    event_datetime DateTime,         -- 'YYYY-MM-DD HH:MM:SS'
    event_datetime64 DateTime64(3),  -- sub-second (millisecond) precision
    event_date32 Date32              -- extended date range
);
-- Insert date values (one value per column, in declaration order)
INSERT INTO date_events VALUES ('2026-03-05', '2026-03-05 14:30:00', toDateTime64('2026-03-05 14:30:00.123', 3), '2026-03-05');
Array Types
-- Array type
CREATE TABLE arrays (
id UInt64,
numbers Array(Int32),
strings Array(String),
nested Array(Array(String))
);
INSERT INTO arrays VALUES
(1, [1, 2, 3], ['a', 'b'], [['x', 'y']]);
-- Array functions
SELECT arrayCount(x -> x > 0, [1, -2, 3]); -- 2
SELECT arraySum([1, 2, 3]); -- 6
SELECT arrayFilter(x -> x > 0, [1, -2, 3]); -- [1, 3]
Tuple and Nested Types
-- Tuple type
CREATE TABLE tuples (
id UInt64,
point Tuple(x Float64, y Float64),
person Tuple(name String, age UInt8)
);
INSERT INTO tuples VALUES
(1, (10.5, 20.5), ('John', 30));
-- Access tuple elements by position or, for named tuples, by name
SELECT person.1 FROM tuples; -- John
SELECT point.y FROM tuples;  -- 20.5
-- Nested type (stored as parallel arrays of subcolumns)
CREATE TABLE action_logs (
    id UInt64,
    user_name String,
    actions Nested(
        action String,
        timestamp DateTime
    )
);
-- Insert one array per subcolumn (actions.action, actions.timestamp)
INSERT INTO action_logs VALUES (1, 'john', ['login', 'purchase'], ['2026-01-01 10:00:00', '2026-01-01 10:05:00']);
Nullable and LowCardinality
-- Nullable type
CREATE TABLE with_nulls (
id UInt64,
name String,
optional_value Nullable(String)
);
INSERT INTO with_nulls VALUES (1, 'John', NULL);
INSERT INTO with_nulls VALUES (2, 'Jane', 'value');
-- LowCardinality (optimized string storage)
CREATE TABLE optimized (
id UInt64,
category LowCardinality(String),
status LowCardinality(String)
);
ClickHouse Installation
Docker Installation
# Single node ClickHouse
docker run -d \
--name clickhouse-server \
--ulimit nofile=262144 \
-p 8123:8123 \
-p 9000:9000 \
clickhouse/clickhouse-server
# With volume for persistence
docker run -d \
--name clickhouse-server \
-v clickhouse_data:/var/lib/clickhouse \
-p 8123:8123 \
clickhouse/clickhouse-server
Linux Installation
# Ubuntu/Debian (note: apt-key is deprecated on newer releases;
# the official docs describe a keyring-based setup)
sudo apt-get install -y apt-transport-https ca-certificates dirmngr
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
sudo apt-get update
sudo apt-get install -y clickhouse-server clickhouse-client
# Start service
sudo service clickhouse-server start
# Or use clickhouse-client
clickhouse-client
Python Integration
pip install clickhouse-connect
import clickhouse_connect
# Connect to ClickHouse
client = clickhouse_connect.get_client(
host='localhost',
port=8123,
username='default',
password=''
)
# Create table
client.command("""
CREATE TABLE IF NOT EXISTS events (
id UInt64,
name String,
timestamp DateTime,
value Float64
) ENGINE = MergeTree()
ORDER BY (name, timestamp)
""")
# Insert data (pass Python datetime objects for DateTime columns)
from datetime import datetime
client.insert('events', [
    [1, 'event1', datetime(2026, 3, 5, 10, 0), 100.5],
    [2, 'event2', datetime(2026, 3, 5, 11, 0), 200.5]
], column_names=['id', 'name', 'timestamp', 'value'])
# Query data
result = client.query("""
SELECT name, count() as cnt, avg(value) as avg_value
FROM events
GROUP BY name
ORDER BY cnt DESC
""")
print(result.result_rows)
# Close connection
client.close()
Basic SQL Operations
Creating Tables
-- Basic MergeTree table
CREATE TABLE events (
id UInt64,
event_name String,
user_id UInt32,
timestamp DateTime,
value Float64
) ENGINE = MergeTree()
ORDER BY (event_name, timestamp)
PARTITION BY toYYYYMM(timestamp)
TTL timestamp + INTERVAL 3 MONTH;
-- Table with sampling (the sample expression must be part of the primary key)
CREATE TABLE sampled_events (
    id UInt64,
    event_name String,
    user_id UInt32,
    timestamp DateTime
) ENGINE = MergeTree()
ORDER BY (event_name, timestamp, intHash32(user_id))
SAMPLE BY intHash32(user_id);
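With sampling configured, queries can read a fraction of the rows and scale the result back up; a rough sketch:

```sql
-- Approximate unique users from roughly 10% of the rows
SELECT uniq(user_id) * 10 AS estimated_users
FROM sampled_events
SAMPLE 0.1;
```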
-- Replicated table (requires ZooKeeper or ClickHouse Keeper;
-- ORDER BY goes after the engine clause, not inside it)
CREATE TABLE replicated_events (
    id UInt64,
    name String,
    value Float64
) ENGINE = ReplicatedMergeTree(
    '/clickhouse/tables/{shard}/events',
    '{replica}'
)
ORDER BY id;
INSERT Operations
-- Insert single row
INSERT INTO events (id, event_name, user_id, timestamp, value)
VALUES (1, 'click', 100, '2026-03-05 10:00:00', 10.5);
-- Insert multiple rows
INSERT INTO events VALUES
(2, 'view', 101, '2026-03-05 10:05:00', 5.5),
(3, 'purchase', 102, '2026-03-05 10:10:00', 99.99);
-- Insert from SELECT (bulk-generate test data)
INSERT INTO events (id, event_name, user_id, timestamp, value)
SELECT
    number as id,
    'event' as event_name,
    number % 10000 as user_id,
    now() as timestamp,
    rand() % 100 as value
FROM numbers(1000000);
SELECT Queries
-- Basic SELECT
SELECT * FROM events LIMIT 10;
-- Aggregation
SELECT
event_name,
count() as event_count,
uniq(user_id) as unique_users,
avg(value) as avg_value,
sum(value) as total_value
FROM events
WHERE timestamp >= '2026-01-01'
GROUP BY event_name
ORDER BY event_count DESC
LIMIT 10;
-- Time series aggregation
SELECT
toStartOfHour(timestamp) as hour,
event_name,
count() as events
FROM events
WHERE timestamp >= now() - INTERVAL 7 DAY
GROUP BY hour, event_name
ORDER BY hour;
JOIN Operations
-- Create tables for join example
CREATE TABLE users (id UInt32, name String, country String) ENGINE = TinyLog;
CREATE TABLE orders (id UInt64, user_id UInt32, amount Float64) ENGINE = TinyLog;
INSERT INTO users VALUES (1, 'John', 'US'), (2, 'Jane', 'UK'), (3, 'Bob', 'US');
INSERT INTO orders VALUES (1, 1, 100), (2, 1, 200), (3, 2, 150);
-- LEFT JOIN
SELECT u.name, o.amount
FROM users u
LEFT JOIN orders o ON u.id = o.user_id;
-- GLOBAL JOIN (broadcasts to all nodes)
SELECT u.name, o.amount
FROM users u
GLOBAL LEFT JOIN orders o ON u.id = o.user_id;
Table Engines
MergeTree Family
-- Standard MergeTree
CREATE TABLE base_table (
    id UInt64,
    name String,
    created_at DateTime
) ENGINE = MergeTree()
ORDER BY id
SETTINGS index_granularity = 8192;
-- ReplacingMergeTree (deduplication)
CREATE TABLE events_replaced (
id UInt64,
version UInt8,
data String
) ENGINE = ReplacingMergeTree(version)
ORDER BY id;
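Deduplication in ReplacingMergeTree happens only at merge time, so duplicates can remain visible for a while; the FINAL modifier collapses them at query time, at a performance cost:

```sql
-- Without FINAL, multiple versions of the same id may still appear
SELECT * FROM events_replaced FINAL WHERE id = 42;
```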
-- SummingMergeTree (auto-aggregation)
CREATE TABLE metrics (
counter String,
value Int64,
created_at DateTime
) ENGINE = SummingMergeTree()
ORDER BY (counter, toDate(created_at));
-- AggregatingMergeTree
CREATE TABLE aggregations (
counter String,
value AggregateFunction(sum, Int64)
) ENGINE = AggregatingMergeTree()
ORDER BY counter;
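AggregateFunction columns are written with -State combinators and read back with -Merge; a minimal sketch using the `aggregations` and `metrics` tables above:

```sql
-- Write partial aggregate states
INSERT INTO aggregations
SELECT counter, sumState(value)
FROM metrics
GROUP BY counter;

-- Combine the states at query time
SELECT counter, sumMerge(value) AS total
FROM aggregations
GROUP BY counter;
```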
Special Engines
-- Log family (for small tables)
CREATE TABLE log_table (
id UInt64,
data String
) ENGINE = Log;
-- Memory (in-memory, for small data)
CREATE TABLE memory_table (
id UInt64,
data String
) ENGINE = Memory;
-- Distributed (routes queries across a cluster; data lives in local tables)
CREATE TABLE distributed_events (
    id UInt64,
    event_name String
) ENGINE = Distributed(
    cluster_name,   -- cluster defined in server config
    database,
    base_table,     -- local table on each shard
    sipHash64(id)   -- sharding key
);
Integration Engines
-- MySQL engine
CREATE TABLE mysql_table (
id Int32,
name String
) ENGINE = MySQL('localhost:3306', 'database', 'table', 'user', 'password');
-- Kafka engine
CREATE TABLE kafka_events (
event_name String,
value String
) ENGINE = Kafka()
SETTINGS
kafka_broker_list = 'localhost:9092',
kafka_topic_name = 'events',
kafka_group_name = 'consumer_group',
kafka_format = 'JSONEachRow';
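A Kafka engine table is only a consumer; data is normally persisted by pairing it with a MergeTree table and a materialized view that moves rows as they arrive (table and view names below are illustrative):

```sql
-- Durable storage for consumed messages
CREATE TABLE kafka_events_store (
    event_name String,
    value String
) ENGINE = MergeTree()
ORDER BY event_name;

-- Continuously copies rows from the Kafka consumer into storage
CREATE MATERIALIZED VIEW kafka_events_mv TO kafka_events_store AS
SELECT event_name, value FROM kafka_events;
```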
-- File engine (reads/writes a file in the table's data directory)
CREATE TABLE file_events (
    event String
) ENGINE = File(TabSeparated);
Advanced Queries
Window Functions
-- Running totals (numbers(10) exposes a single column named `number`)
SELECT
    number,
    sum(number) OVER (ORDER BY number) as running_total
FROM numbers(10);
-- 7-day moving average over daily aggregates
SELECT
    day,
    daily_avg,
    avg(daily_avg) OVER (ORDER BY day ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) as moving_avg_7d
FROM (
    SELECT toStartOfDay(created_at) as day, avg(value) as daily_avg
    FROM metrics
    GROUP BY day
);
-- Rank functions
SELECT
    counter,
    value,
    rank() OVER (ORDER BY value DESC) as rnk,
    dense_rank() OVER (ORDER BY value DESC) as dense_rnk,
    row_number() OVER (ORDER BY value DESC) as rn
FROM metrics;
GROUP BY Extensions
-- GROUP BY WITH TOTALS
SELECT
event_name,
count() as cnt
FROM events
GROUP BY event_name WITH TOTALS;
-- GROUPING SETS
SELECT
event_name,
country,
count() as cnt
FROM events
GROUP BY
GROUPING SETS (
(event_name, country),
(event_name),
()
);
-- WITH ROLLUP
SELECT
event_name,
toDate(timestamp) as date,
count()
FROM events
GROUP BY event_name, date WITH ROLLUP;
-- WITH CUBE
SELECT
event_name,
country,
count()
FROM events
GROUP BY event_name, country WITH CUBE;
Array and Tuple Operations
-- Array operations
SELECT
arrayJoin([1, 2, 3]) as num;
-- Group by an array value
SELECT ids, count() as cnt
FROM (SELECT [1, 2] as ids)
GROUP BY ids;
-- Array as aggregation result
SELECT
event_name,
groupArray(user_id) as users
FROM events
GROUP BY event_name;
-- Nested array processing
SELECT
arrayJoin(nested_column.key) as key
FROM nested_table;
Best Practices
Table Design
- The primary key is for sorting and index granularity, not unique identification
- ORDER BY the columns that are most commonly filtered and sorted
- Partition time-based data (e.g. PARTITION BY toYYYYMM(timestamp)) for TTL and efficient pruning
- Use appropriate data types: prefer UInt over Int when values are never negative
- Use LowCardinality for categorical strings
- Index granularity: the 8192 default suits large scans; lower it for tables with many point queries
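Put together, a table following these guidelines might look like this (the schema is illustrative):

```sql
CREATE TABLE user_events (
    event_type LowCardinality(String),  -- categorical string
    user_id UInt64,                     -- unsigned, values never negative
    ts DateTime,
    value Float64
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(ts)               -- time-based partitioning
ORDER BY (event_type, user_id, ts)      -- matches common filters
TTL ts + INTERVAL 6 MONTH
SETTINGS index_granularity = 8192;
```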
Query Optimization
-- Use WHERE on partition key
-- Good
SELECT * FROM events WHERE timestamp >= '2026-01-01'
-- Bad (full scan)
SELECT * FROM events WHERE value > 100
-- Use projections for common query patterns (projection name and definition are illustrative)
ALTER TABLE events ADD PROJECTION by_name (
    SELECT event_name, count() GROUP BY event_name
);
ALTER TABLE events MATERIALIZE PROJECTION by_name;
-- Limit columns in SELECT
-- Good
SELECT event_name, count() FROM events GROUP BY event_name
-- Bad
SELECT * FROM events
Common Pitfalls
Pitfall 1: Too Many Parts
-- Too many small inserts create many parts; batch rows into fewer, larger inserts
INSERT INTO events VALUES
    (1, 'click', 100, '2026-03-05 10:00:00', 1.0),
    (2, 'view', 101, '2026-03-05 10:01:00', 2.0);
-- Force a merge (expensive; use sparingly, not as routine maintenance)
OPTIMIZE TABLE events FINAL;
Pitfall 2: Wrong Data Types
-- Don't use Float for money
-- Bad
CREATE TABLE bad (amount Float64);
-- Good
CREATE TABLE good (amount Decimal(10, 2));
Pitfall 3: Missing Partition Key
-- Always partition time-based data
CREATE TABLE good (
timestamp DateTime
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(timestamp) -- Always partition by time
ORDER BY timestamp;
Conclusion
ClickHouse provides exceptional performance for analytical workloads through its columnar storage and vectorized execution. Understanding its unique architecture and table engines is key to building high-performance analytics systems.
In the next article, we’ll explore ClickHouse operations including configuration, replication, and production deployment.