Introduction
Data warehouses power critical business decisions. With petabytes of data and thousands of queries daily, optimization isn't optional—it's essential for performance and cost. This guide covers optimizing the three major cloud data warehouses.
Key Statistics:
- Average query optimization: 10-100x performance improvement
- Proper clustering reduces query costs by 60-80%
- Materialized views improve dashboard performance by 90%
- 70% of queries can be optimized with simple techniques
Snowflake Optimization
Clustering Strategy
-- Create table with clustering key.
-- Choose clustering columns that appear in most range/equality filters;
-- here time first (range scans), then event_type (equality).
CREATE TABLE analytics.events (
    event_id STRING,
    event_type STRING,
    user_id STRING,
    event_timestamp TIMESTAMP,
    payload VARIANT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
CLUSTER BY (event_timestamp, event_type);

-- Analyze cluster quality (lower average depth = better clustered)
SELECT SYSTEM$CLUSTERING_DEPTH('analytics.events', '(event_timestamp, event_type)');

-- Manual `ALTER TABLE ... RECLUSTER` is deprecated. Defining CLUSTER BY
-- enables Automatic Clustering; suspend/resume it explicitly if needed:
ALTER TABLE analytics.events RESUME RECLUSTER;

-- Monitor Automatic Clustering activity (credits consumed, rows/bytes moved).
-- INFORMATION_SCHEMA.TABLE_STORAGE_METRICS has no clustering columns;
-- use the AUTOMATIC_CLUSTERING_HISTORY table function instead.
SELECT
    table_name,
    credits_used,
    num_rows_reclustered,
    num_bytes_reclustered
FROM TABLE(INFORMATION_SCHEMA.AUTOMATIC_CLUSTERING_HISTORY(
    DATE_RANGE_START => DATEADD('day', -7, CURRENT_TIMESTAMP()),
    TABLE_NAME => 'ANALYTICS.EVENTS'
));
Query Optimization
-- Good: Use WHERE clause to limit data scanned
-- (a half-open range on the leading clustering column enables
-- micro-partition pruning; note >= / < avoids BETWEEN's closed upper bound)
SELECT *
FROM events
WHERE event_timestamp >= '2026-01-01'
AND event_timestamp < '2026-02-01'
AND event_type = 'purchase';
-- Bad: Filter after scanning all data
-- (no range on event_timestamp, so far more micro-partitions must be read)
SELECT *
FROM events
WHERE event_type = 'purchase';
-- Use appropriate data types
-- Good
CREATE TABLE orders (
order_id STRING, -- For UUIDs
amount DECIMAL(10,2),
quantity INTEGER
);
-- Bad - Using wrong types
-- (string-typed numbers defeat range pruning and force casts in every query)
CREATE TABLE orders (
order_id VARCHAR(1000),
amount VARCHAR(50),
quantity VARCHAR(10)
);
-- Pruning with selective filters.
-- NOTE(review): the order of predicates in WHERE does not affect the
-- optimizer; what matters is that a selective range on the clustering
-- key is present at all. Verify pruning in the EXPLAIN output.
EXPLAIN
SELECT u.name, e.event_type, e.event_timestamp
FROM users u
JOIN events e ON u.user_id = e.user_id
WHERE e.event_timestamp >= '2026-01-01' -- Range on clustering key enables pruning
AND u.status = 'active'; -- Additional row filter
Materialized Views
-- Create materialized view for common aggregations.
-- payload is VARIANT, so the amount is extracted and cast explicitly.
CREATE MATERIALIZED VIEW analytics.daily_revenue_mv
AS
SELECT
    DATE_TRUNC('day', event_timestamp) AS day,
    COUNT(*) AS transaction_count,
    SUM(payload:amount::DECIMAL(10,2)) AS total_revenue
FROM events
WHERE event_type = 'purchase'
GROUP BY DATE_TRUNC('day', event_timestamp);

-- Snowflake maintains materialized views automatically in the background;
-- there is no REFRESH ON SCHEDULE clause. Maintenance can only be
-- suspended/resumed:
ALTER MATERIALIZED VIEW analytics.daily_revenue_mv RESUME;

-- Query using materialized view
SELECT * FROM analytics.daily_revenue_mv
WHERE day >= DATE_TRUNC('day', CURRENT_DATE - 7);
Cost Optimization
-- Result caching is automatic in Snowflake: a repeated identical query is
-- served from the result cache (see the cache-hit indicator in the query
-- profile) and consumes no compute.

-- Use approximate aggregate functions for large datasets.
-- amount lives inside the VARIANT payload on events, so extract + cast it.
SELECT
    APPROX_COUNT_DISTINCT(user_id) AS unique_users,
    APPROX_PERCENTILE(payload:amount::DECIMAL(10,2), 0.95) AS p95_amount
FROM events;

-- Use time travel efficiently
-- Good: query a specific point in time (cast the literal to TIMESTAMP)
SELECT *
FROM events
AT(TIMESTAMP => '2026-01-15 10:00:00'::TIMESTAMP);
-- Bad: relying on long time-travel retention on large tables inflates
-- storage cost — only extend retention when genuinely necessary.

-- Use search optimization for frequently queried point-lookup columns
ALTER TABLE events ADD SEARCH OPTIMIZATION;
BigQuery Optimization
Partitioning
-- Create partitioned table (daily partitions, expiring after one year)
CREATE TABLE analytics.events (
    event_id STRING,
    event_type STRING,
    user_id STRING,
    event_timestamp TIMESTAMP,
    payload STRING
)
PARTITION BY DATE(event_timestamp)
OPTIONS(
    partition_expiration_days = 365,
    description = "Events table partitioned by date"
);

-- Query only required partitions.
-- Use a half-open range: BETWEEN is closed on both ends, and
-- BETWEEN '2026-01-01' AND '2026-01-31' compares against a midnight
-- TIMESTAMP upper bound — silently dropping almost all of Jan 31.
SELECT *
FROM analytics.events
WHERE event_timestamp >= '2026-01-01'
  AND event_timestamp < '2026-02-01';

-- Partition pruning - check in query plan
-- Look for "Stage: Input" with partition filter
Clustering
-- Create clustered table.
-- Clustering column order matters in BigQuery: put the most frequently
-- filtered column first (here event_type), then user_id.
CREATE TABLE analytics.events_clustered (
event_id STRING,
event_type STRING,
user_id STRING,
event_timestamp TIMESTAMP,
payload STRING
)
PARTITION BY DATE(event_timestamp)
CLUSTER BY event_type, user_id
OPTIONS(description = "Clustered by event_type and user_id");
-- Use clustering columns in WHERE clause
-- (filters on leading clustering columns let BigQuery skip storage blocks;
-- the partition filter on event_timestamp limits which partitions are read)
SELECT
event_type,
COUNT(*) as event_count
FROM analytics.events_clustered
WHERE event_type IN ('purchase', 'login', 'view')
AND event_timestamp >= '2026-01-01'
GROUP BY event_type;
Query Optimization
-- Avoid SELECT *: BigQuery is columnar and bills by columns scanned,
-- so listing only the needed columns directly reduces cost.
SELECT user_id, event_type, event_timestamp
FROM analytics.events;

-- Use LIMIT appropriately.
-- NOTE: LIMIT caps rows *returned*, not bytes scanned/billed — pair it
-- with partition/cluster filters to actually reduce cost.
SELECT *
FROM analytics.events
LIMIT 1000;

-- Use window functions efficiently
-- Good: LAG over a self-join for previous-event lookups
SELECT
    user_id,
    event_type,
    event_timestamp,
    LAG(event_timestamp) OVER (PARTITION BY user_id ORDER BY event_timestamp) AS prev_event
FROM analytics.events;

-- Use ARRAY_AGG with LIMIT to bound per-group array size
SELECT
    user_id,
    ARRAY_AGG(event_type LIMIT 10) AS recent_events
FROM analytics.events
GROUP BY user_id;

-- Denormalize for performance
-- Good: Nested/Repeated fields avoid a join entirely
CREATE TABLE analytics.user_events (
    user_id STRING,
    events ARRAY<STRUCT<
        event_id STRING,
        event_type STRING,
        event_timestamp TIMESTAMP
    >>
);

-- Join optimization.
-- BigQuery standard SQL has no optimizer hints (the original /*+ HASH_JOIN */
-- comment had no effect); the engine automatically broadcasts the smaller
-- join input. Just filter both sides as early as possible.
SELECT u.*, e.*
FROM small_users u
JOIN events e ON u.user_id = e.user_id;
Cost Controls
-- Estimate query cost before running
-- In console: Query validator shows bytes processed
-- (a dry run via the API/CLI reports the same estimate without executing)
-- Set query parameters
-- In query settings: Priority: INTERACTIVE/BATCH
-- Use slots efficiently
-- Monitor slot usage
-- (JOBS view in the region-qualified INFORMATION_SCHEMA shows per-job
-- bytes processed vs. billed and whether the result cache was hit)
SELECT
job_id,
total_bytes_processed,
total_bytes_billed,
cache_hit
FROM `region-us.INFORMATION_SCHEMA.JOBS`
WHERE creation_time BETWEEN TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY) AND CURRENT_TIMESTAMP();
-- Use resumable operations
-- Default for most operations
Redshift Optimization
Distribution Strategy
-- Create table with distribution and sort keys.
-- Specify DISTKEY/SORTKEY exactly once — either at column level or at
-- table level, never both. (The original declared event_id SORTKEY and
-- user_id distkey at column level AND table-level DISTKEY/SORTKEY on
-- different columns, which conflicts.)
CREATE TABLE analytics.events (
    event_id VARCHAR(32),
    event_type VARCHAR(50),
    user_id VARCHAR(32),
    event_timestamp TIMESTAMP,
    payload VARCHAR(500)
)
DISTKEY(user_id)
SORTKEY(event_timestamp);

-- Analyze distribution and skew.
-- SVV_TABLE_INFO exposes "table"/"schema" (not table_name) and
-- skew_rows / skew_sortkey1 columns.
SELECT
    "table",
    size,
    skew_rows,
    skew_sortkey1
FROM svv_table_info
WHERE "schema" = 'analytics';

-- Use AUTO distribution for new tables
-- (Redshift chooses and adapts the distribution style over time)
CREATE TABLE analytics.events_auto (
    event_id VARCHAR(32),
    event_type VARCHAR(50),
    user_id VARCHAR(32),
    event_timestamp TIMESTAMP
)
DISTSTYLE AUTO;
Compression
-- Analyze compression: samples the table and recommends encodings
ANALYZE COMPRESSION analytics.events;

-- Create table with explicit column encodings
-- (ZSTD for strings, AZ64 for timestamps/numerics)
CREATE TABLE analytics.events_compressed (
    event_id VARCHAR(32) ENCODE ZSTD,
    event_type VARCHAR(50) ENCODE ZSTD,
    user_id VARCHAR(32) ENCODE ZSTD,
    event_timestamp TIMESTAMP ENCODE AZ64,
    payload VARCHAR(500) ENCODE ZSTD
)
DISTKEY(user_id)
SORTKEY(event_timestamp);

-- Check the encoding currently applied to each column.
-- (The original queried a nonexistent stv_columns view with a misspelled
-- unscompressed_bytes column; PG_TABLE_DEF is the supported catalog view.
-- It only lists tables on the search_path, so include the schema first.)
SET search_path TO analytics;
SELECT
    tablename,
    "column",
    type,
    encoding
FROM pg_table_def
WHERE schemaname = 'analytics';
Query Optimization
-- Use WLM for query priorities
-- (configure queues and concurrency in the cluster parameter group)

-- Create materialized view
-- (group by the expressions, not positional references, for readability)
CREATE MATERIALIZED VIEW analytics.mv_daily_sales AS
SELECT
    DATE_TRUNC('day', event_timestamp) AS day,
    event_type,
    COUNT(*) AS count
FROM analytics.events
WHERE event_type = 'purchase'
GROUP BY DATE_TRUNC('day', event_timestamp), event_type;

-- Refresh materialized view (Redshift MVs require explicit refresh)
REFRESH MATERIALIZED VIEW analytics.mv_daily_sales;

-- Use approximate functions.
-- Redshift supports APPROXIMATE with COUNT(DISTINCT ...) and with
-- PERCENTILE_DISC — not PERCENTILE_CONT. Run them as separate queries,
-- since approximate percentiles have restrictions when combined with
-- other aggregates.
SELECT APPROXIMATE COUNT(DISTINCT user_id) AS approx_unique_users
FROM analytics.events;

SELECT APPROXIMATE PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY amount) AS p95_amount
FROM analytics.events;
Comparison
| Feature | Snowflake | BigQuery | Redshift |
|---|---|---|---|
| Storage | Automatic | Automatic | Manual |
| Scaling | Instant | Automatic | Manual |
| Partitioning | Automatic | Manual | Key-based |
| Clustering | Automatic | Manual | Manual |
| Caching | Query Results | Query Results | Result Caching |
| Concurrency | Automatic | Slot-based | WLM |
Comments