
Data Warehouse Optimization: Snowflake, BigQuery, Redshift

Introduction

Data warehouses power critical business decisions. With petabytes of data and thousands of queries a day, optimization isn't optional; it's essential for both performance and cost. This guide covers optimization techniques for the three major cloud data warehouses: Snowflake, BigQuery, and Redshift.

Key statistics (ballpark figures, commonly reported in practice):

  • Query optimization: 10-100x performance improvements
  • Proper clustering: 60-80% reduction in query costs
  • Materialized views: up to 90% faster dashboards
  • Roughly 70% of queries can be optimized with simple techniques

Snowflake Optimization

Clustering Strategy

-- Create table with clustering key
CREATE TABLE analytics.events (
    event_id STRING,
    event_type STRING,
    user_id STRING,
    event_timestamp TIMESTAMP,
    payload VARIANT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
CLUSTER BY (event_timestamp, event_type);

-- Analyze cluster quality
SELECT SYSTEM$CLUSTERING_DEPTH('analytics.events', '(event_timestamp, event_type)');

-- Maintain clustering (manual ALTER TABLE ... RECLUSTER is deprecated;
-- Automatic Clustering reclusters in the background and can be paused/resumed)
ALTER TABLE analytics.events SUSPEND RECLUSTER;
ALTER TABLE analytics.events RESUME RECLUSTER;

-- Monitor Automatic Clustering activity and credit consumption
SELECT 
    TABLE_NAME,
    CREDITS_USED,
    NUM_ROWS_RECLUSTERED,
    NUM_BYTES_RECLUSTERED
FROM TABLE(INFORMATION_SCHEMA.AUTOMATIC_CLUSTERING_HISTORY(
    DATE_RANGE_START => DATEADD('day', -7, CURRENT_TIMESTAMP())))
ORDER BY CREDITS_USED DESC;
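
For a broader view than the single depth number, SYSTEM$CLUSTERING_INFORMATION returns a JSON summary of clustering quality (average depth, micro-partition overlap histogram) for any candidate column set:

-- JSON clustering summary; useful when evaluating a new clustering key
SELECT SYSTEM$CLUSTERING_INFORMATION('analytics.events', '(event_timestamp, event_type)');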

Query Optimization

-- Good: Use WHERE clause to limit data scanned
SELECT *
FROM events
WHERE event_timestamp >= '2026-01-01'
  AND event_timestamp < '2026-02-01'
  AND event_type = 'purchase';

-- Bad: no filter on the leading clustering column (event_timestamp),
-- so most micro-partitions must be scanned
SELECT *
FROM events
WHERE event_type = 'purchase';

-- Use appropriate data types
-- Good
CREATE TABLE orders (
    order_id STRING,  -- For UUIDs
    amount DECIMAL(10,2),
    quantity INTEGER
);

-- Bad - Using wrong types
CREATE TABLE orders (
    order_id VARCHAR(1000),
    amount VARCHAR(50),
    quantity VARCHAR(10)
);

-- Pruning: predicates on the clustering key drive micro-partition pruning;
-- the textual order of predicates in WHERE does not affect the plan
EXPLAIN 
SELECT u.name, e.event_type, e.event_timestamp
FROM users u
JOIN events e ON u.user_id = e.user_id
WHERE e.event_timestamp >= '2026-01-01'  -- Prunes via the clustering key
  AND u.status = 'active';  -- Applied after pruning
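
To confirm pruning is actually happening, compare partitions scanned against partitions total in the query history. A sketch against the ACCOUNT_USAGE view (which lags real time by up to a few hours); the 0.8 threshold is an arbitrary example:

-- Recent queries that scanned most of their micro-partitions prune poorly
SELECT query_id, partitions_scanned, partitions_total, total_elapsed_time
FROM SNOWFLAKE.ACCOUNT_USAGE.QUERY_HISTORY
WHERE partitions_total > 0
  AND partitions_scanned / partitions_total > 0.8
ORDER BY start_time DESC
LIMIT 20;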

Materialized Views

-- Create materialized view for common aggregations
CREATE MATERIALIZED VIEW analytics.daily_revenue_mv
AS
SELECT 
    DATE_TRUNC('day', event_timestamp) AS day,
    COUNT(*) AS transaction_count,
    SUM( payload:amount::DECIMAL(10,2) ) AS total_revenue
FROM events
WHERE event_type = 'purchase'
GROUP BY DATE_TRUNC('day', event_timestamp);

-- No manual refresh needed: Snowflake maintains materialized views
-- automatically in the background; maintenance can be suspended and resumed
ALTER MATERIALIZED VIEW analytics.daily_revenue_mv SUSPEND;
ALTER MATERIALIZED VIEW analytics.daily_revenue_mv RESUME;

-- Query using materialized view
SELECT * FROM analytics.daily_revenue_mv
WHERE day >= DATE_TRUNC('day', CURRENT_DATE - 7);
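
Because background maintenance consumes credits, it is worth watching the refresh history; a sketch using the INFORMATION_SCHEMA table function, with columns as documented for its output:

-- Credits consumed by materialized view maintenance over the past week
SELECT TABLE_NAME, CREDITS_USED, START_TIME, END_TIME
FROM TABLE(INFORMATION_SCHEMA.MATERIALIZED_VIEW_REFRESH_HISTORY(
    DATE_RANGE_START => DATEADD('day', -7, CURRENT_TIMESTAMP())));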

Cost Optimization

-- Use result caching (automatic in Snowflake)
-- Cache hit indicator in query profile

-- Use approximate aggregate functions for large datasets
SELECT 
    APPROX_COUNT_DISTINCT(user_id) AS unique_users,
    APPROX_PERCENTILE(payload:amount::DECIMAL(10,2), 0.95) AS p95_amount
FROM events;

-- Use Time Travel deliberately
-- Good: query a specific point in time
SELECT *
FROM events
AT(TIMESTAMP => '2026-01-15 10:00:00'::TIMESTAMP);

-- Keep in mind: raising DATA_RETENTION_TIME_IN_DAYS on large, frequently
-- changing tables increases storage costs; extend it only where needed

-- Use search optimization for frequently queried columns
ALTER TABLE events ADD SEARCH OPTIMIZATION;
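
Compute usually dominates Snowflake spend, so warehouse settings matter as much as query shape. A minimal sketch, assuming a warehouse named analytics_wh and an illustrative 100-credit monthly quota:

-- Suspend an idle warehouse after 60 seconds; resume automatically on demand
ALTER WAREHOUSE analytics_wh SET AUTO_SUSPEND = 60 AUTO_RESUME = TRUE;

-- Cap monthly credit burn with a resource monitor
CREATE RESOURCE MONITOR monthly_quota WITH CREDIT_QUOTA = 100
  TRIGGERS ON 90 PERCENT DO NOTIFY
           ON 100 PERCENT DO SUSPEND;
ALTER WAREHOUSE analytics_wh SET RESOURCE_MONITOR = monthly_quota;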

BigQuery Optimization

Partitioning

-- Create partitioned table
CREATE TABLE analytics.events (
    event_id STRING,
    event_type STRING,
    user_id STRING,
    event_timestamp TIMESTAMP,
    payload STRING
)
PARTITION BY DATE(event_timestamp)
OPTIONS(
    partition_expiration_days = 365,
    description = "Events table partitioned by date"
);

-- Query only required partitions (use a half-open range; an inclusive
-- BETWEEN on timestamps would cut off most of the last day)
SELECT *
FROM analytics.events
WHERE event_timestamp >= '2026-01-01'
  AND event_timestamp < '2026-02-01';

-- Verify pruning: compare bytes processed (query validator or
-- execution details) with and without the partition filter
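
To make pruning mandatory rather than optional, require a partition filter on the table; queries that omit one are rejected outright:

-- Reject queries that do not filter on the partitioning column
ALTER TABLE analytics.events
SET OPTIONS (require_partition_filter = TRUE);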

Clustering

-- Create clustered table
CREATE TABLE analytics.events_clustered (
    event_id STRING,
    event_type STRING,
    user_id STRING,
    event_timestamp TIMESTAMP,
    payload STRING
)
PARTITION BY DATE(event_timestamp)
CLUSTER BY event_type, user_id
OPTIONS(description = "Clustered by event_type and user_id");

-- Use clustering columns in WHERE clause
SELECT 
    event_type,
    COUNT(*) as event_count
FROM analytics.events_clustered
WHERE event_type IN ('purchase', 'login', 'view')
  AND event_timestamp >= '2026-01-01'
GROUP BY event_type;
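
Per-partition sizes are visible in the dataset's INFORMATION_SCHEMA.PARTITIONS view, which helps spot skewed or oversized partitions:

-- Row counts and bytes per partition of the clustered table
SELECT partition_id, total_rows, total_logical_bytes
FROM analytics.INFORMATION_SCHEMA.PARTITIONS
WHERE table_name = 'events_clustered'
ORDER BY partition_id DESC;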

Query Optimization

-- Avoid SELECT *: BigQuery bills by columns read, so name only what you need
SELECT user_id, event_type, event_timestamp
FROM analytics.events;

-- Use LIMIT appropriately (it caps rows returned, not bytes scanned,
-- so it does not reduce on-demand cost by itself)
SELECT *
FROM analytics.events
LIMIT 1000;

-- Use window functions efficiently
-- Good
SELECT 
    user_id,
    event_type,
    event_timestamp,
    LAG(event_timestamp) OVER (PARTITION BY user_id ORDER BY event_timestamp) as prev_event
FROM analytics.events;

-- Use ARRAY_AGG with ORDER BY and LIMIT to bound per-group work
SELECT 
    user_id,
    ARRAY_AGG(event_type ORDER BY event_timestamp DESC LIMIT 10) as recent_events
FROM analytics.events
GROUP BY user_id;

-- Denormalize for performance
-- Good: Nested/Repeated fields
CREATE TABLE analytics.user_events (
    user_id STRING,
    events ARRAY<STRUCT<
        event_id STRING,
        event_type STRING,
        event_timestamp TIMESTAMP
    >>
);
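
Querying the nested layout uses UNNEST to flatten the repeated field back into rows:

-- e aliases one STRUCT per array element
SELECT user_id, e.event_type, e.event_timestamp
FROM analytics.user_events, UNNEST(events) AS e
WHERE e.event_type = 'purchase';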

-- Join optimization: BigQuery takes no join hints; it broadcasts small
-- tables automatically. List the largest table first in the FROM clause.
SELECT u.user_id, e.event_type, e.event_timestamp
FROM analytics.events e
JOIN small_users u ON u.user_id = e.user_id;

Cost Controls

-- Estimate query cost before running
-- In console: Query validator shows bytes processed

-- Set query parameters
-- In query settings: Priority: INTERACTIVE/BATCH

-- Use slots efficiently
-- Monitor slot and byte consumption over the past week
SELECT 
    job_id,
    total_slot_ms,
    total_bytes_processed,
    total_bytes_billed,
    cache_hit
FROM `region-us.INFORMATION_SCHEMA.JOBS`
WHERE creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY) AND CURRENT_TIMESTAMP();

-- Cap per-query spend: set maximum_bytes_billed in the job settings
-- so runaway queries fail fast instead of billing in full

Redshift Optimization

Distribution Strategy

-- Create table with explicit distribution and sort keys
-- (declare DISTKEY/SORTKEY once: at column level or table level, not both)
CREATE TABLE analytics.events (
    event_id VARCHAR(32),
    event_type VARCHAR(50),
    user_id VARCHAR(32),
    event_timestamp TIMESTAMP,
    payload VARCHAR(500)
)
DISTKEY(user_id)
SORTKEY(event_timestamp);

-- Analyze distribution and sort key skew
SELECT 
    "table",
    size,
    skew_rows,
    skew_sortkey1,
    unsorted
FROM svv_table_info
WHERE schema = 'analytics';

-- Use AUTO distribution for new tables; Redshift picks and adjusts the style
CREATE TABLE analytics.events_auto (
    event_id VARCHAR(32),
    event_type VARCHAR(50),
    user_id VARCHAR(32),
    event_timestamp TIMESTAMP
)
DISTSTYLE AUTO;
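
Existing tables can be migrated without a rebuild; Redshift supports altering the distribution style in place:

-- Switch an existing table to KEY distribution on user_id
ALTER TABLE analytics.events ALTER DISTSTYLE KEY DISTKEY user_id;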

Compression

-- Analyze compression
ANALYZE COMPRESSION analytics.events;

-- Create table with compression
CREATE TABLE analytics.events_compressed (
    event_id VARCHAR(32) ENCODE ZSTD,
    event_type VARCHAR(50) ENCODE ZSTD,
    user_id VARCHAR(32) ENCODE ZSTD,
    event_timestamp TIMESTAMP ENCODE AZ64,
    payload VARCHAR(500) ENCODE ZSTD
)
DISTKEY(user_id)
SORTKEY(event_timestamp);

-- Check current column encodings
-- (pg_table_def only lists schemas on the search_path)
SELECT 
    tablename,
    "column",
    type,
    encoding
FROM pg_table_def
WHERE schemaname = 'analytics';
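
Encodings can also be changed in place on existing columns, which is far cheaper than recreating the table:

-- Re-encode a single column without a table rebuild
ALTER TABLE analytics.events ALTER COLUMN payload ENCODE ZSTD;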

Query Optimization

-- Use WLM for query priorities
-- Configure WLM in parameter group

-- Create materialized view
CREATE MATERIALIZED VIEW analytics.mv_daily_sales AS
SELECT 
    DATE_TRUNC('day', event_timestamp) AS day,
    event_type,
    COUNT(*) AS count
FROM analytics.events
WHERE event_type = 'purchase'
GROUP BY 1, 2;

-- Refresh materialized view
REFRESH MATERIALIZED VIEW analytics.mv_daily_sales;
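
Redshift can also keep a view current on its own when it is created with AUTO REFRESH (incremental where the query shape allows); a sketch using a hypothetical variant of the view above:

-- Variant that refreshes automatically as base-table data changes
CREATE MATERIALIZED VIEW analytics.mv_daily_sales_auto
AUTO REFRESH YES
AS
SELECT 
    DATE_TRUNC('day', event_timestamp) AS day,
    event_type,
    COUNT(*) AS count
FROM analytics.events
WHERE event_type = 'purchase'
GROUP BY 1, 2;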

-- Use approximate functions (APPROXIMATE works with COUNT(DISTINCT ...)
-- and PERCENTILE_DISC; amount is assumed to be a numeric column)
SELECT 
    APPROXIMATE COUNT(DISTINCT user_id),
    APPROXIMATE PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY amount)
FROM analytics.events;
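
Sort-key performance degrades as unsorted rows accumulate, so routine maintenance belongs in any Redshift checklist:

-- Re-sort rows into sort key order and refresh planner statistics
VACUUM SORT ONLY analytics.events;
ANALYZE analytics.events;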

Comparison

Feature        Snowflake        BigQuery         Redshift
Storage        Automatic        Automatic        Manual
Scaling        Instant          Automatic        Manual
Partitioning   Automatic        Manual           Key-based (sort keys)
Clustering     Automatic        Manual           Manual
Caching        Query results    Query results    Result caching
Concurrency    Automatic        Slot-based       WLM
