Introduction
While DuckDB is designed for simplicity, optimizing its performance requires understanding its configuration options and operational characteristics. As an embedded analytical database, DuckDB runs in-process with your application, making proper configuration crucial for production workloads.
This guide covers DuckDB operations including configuration, performance tuning, memory management, and production deployment patterns.
Configuration Overview
Settings Categories
DuckDB provides several configuration categories:
-- List every configuration option with its current value, description, and type.
SELECT * FROM duckdb_settings();
-- Read a single setting's current value.
SELECT current_setting('threads');
-- Set for the current session only.
SET threads = 8;
-- Set for all sessions attached to this database instance.
SET GLOBAL threads = 8;
Important Settings
-- Degree of intra-query parallelism (default: number of CPU cores).
SET threads = 8; -- Default: number of CPU cores
-- Hard cap on buffer-managed memory (default: 80% of available RAM).
SET memory_limit = '8GB'; -- Default: 80% of available RAM
-- Where operators spill to disk when they exceed memory_limit.
SET temp_directory = '/tmp/duckdb';
-- WAL size at which an automatic checkpoint is triggered.
SET checkpoint_threshold = '16GB';
Memory Management
Configuring Memory
import duckdb
# Connect with memory limit
con = duckdb.connect('analytics.db', config={
'memory_limit': '8GB',
'threads': 8
})
# Check memory usage
result = con.execute("""
SELECT
current_setting('memory_limit') as limit,
(SELECT SUM(memory_usage) FROM duckdb_memory()) as used
""").fetchall()
print(result)
# Monitor memory during query
con.execute("SET enable_profiling = 'json'")
con.execute("SELECT * FROM huge_table")
Memory-Mapped I/O
-- NOTE(review): neither 'enable_mmap' nor 'mmap_size' appears among
-- DuckDB's documented configuration options (check duckdb_settings());
-- verify these against the target DuckDB version before publishing.
SET enable_mmap = true;
SET mmap_size = '4GB';
Query Optimization
Execution Plans
-- Show the planned operators without executing the query.
EXPLAIN SELECT * FROM employees WHERE department = 'Engineering';
-- Execute and report per-operator timings and row counts.
EXPLAIN ANALYZE SELECT department, AVG(salary) FROM employees GROUP BY department;
-- Emit machine-readable profiling output for subsequent statements.
SET enable_profiling = 'json';
SELECT * FROM employees;
-- NOTE(review): duckdb_profiling() is not a documented table function;
-- profiling output is normally read via EXPLAIN ANALYZE or the
-- profiling_output setting -- verify.
SELECT * FROM duckdb_profiling();
Indexes
-- ART indexes mainly help highly selective point lookups; DuckDB's scans
-- are already fast, so index only columns that are filtered frequently.
CREATE INDEX idx_employees_department ON employees(department);
CREATE INDEX idx_employees_salary ON employees(salary);
-- Confirm in the plan that the predicate actually uses the index.
EXPLAIN SELECT * FROM employees WHERE department = 'Sales';
-- Drop an index whose write overhead no longer pays off.
DROP INDEX idx_employees_department;
Query Hints
-- NOTE(review): DuckDB has no join-order hints; the optimizer chooses
-- the join order regardless of how the joins are written.
SELECT * FROM employees e
JOIN departments d ON e.dept_id = d.id
JOIN projects p ON e.project_id = p.id;
-- Single-threaded execution simplifies debugging and profiling.
SET threads = 1;
-- NOTE(review): 'USING SCAN' is not valid DuckDB syntax; there is no
-- documented way to force a specific scan -- verify before publishing.
SELECT * FROM employees USING SCAN employees_idx;
Parallelism
Thread Configuration
-- Cap worker threads for this session.
SET threads = 8;
-- Confirm the effective thread count.
SELECT current_setting('threads');
-- NOTE(review): duckdb_threads() is not a documented table function --
-- verify against the target DuckDB version.
SELECT * FROM duckdb_threads();
Parallel I/O
-- NOTE(review): 'parallel_csv' is not a documented DuckDB setting; the
-- CSV reader parallelizes by default -- verify.
SET parallel_csv = true;
-- NOTE(review): 'batch_size' is likewise undocumented; DuckDB's vector
-- size is fixed at build time -- verify.
SET batch_size = 1024;
Storage and Checkpointing
Database Files
-- NOTE(review): duckdb_storage_info() is not a documented function;
-- PRAGMA database_size is the documented way to inspect file size -- verify.
SELECT * FROM duckdb_storage_info();
-- Flush the WAL into the main database file, then reclaim free space.
CHECKPOINT;
VACUUM;
-- NOTE(review): OPTIMIZE TABLE is not DuckDB syntax -- verify or remove.
OPTIMIZE TABLE employees;
WAL Mode
-- NOTE(review): DuckDB always uses a write-ahead log; 'wal_mode' is not
-- a documented setting -- verify.
SET wal_mode = true;
-- NOTE(review): the documented knob is checkpoint_threshold (alias
-- wal_autocheckpoint), not 'wal_size_limit' -- verify.
SET wal_size_limit = '4GB';
Backup and Recovery
Database Backup
# Backup: simply copy the database file.
# NOTE(review): only safe when no process has the file open for writing;
# checkpoint (and ideally close) the database first for a consistent copy.
cp analytics.db analytics_backup.db
# Backup with compression.
# NOTE(review): the DuckDB CLI has no sqlite-style ".backup" dot command;
# EXPORT DATABASE is the documented approach -- verify this pipeline.
duckdb -cmd ".backup analytics.db" | gzip > backup_$(date +%Y%m%d).db.gz
Python Backup Script
import shutil
import gzip
from datetime import datetime
import os
class DuckDBBackup:
    """Create and restore gzip-compressed copies of a DuckDB database file.

    NOTE(review): this works on the raw file; for a consistent snapshot
    the database should be checkpointed (and ideally closed) before
    backing up -- confirm against the writer's lifecycle.
    """

    def __init__(self, db_path):
        # Path of the DuckDB database file to back up / restore over.
        self.db_path = db_path

    def create_backup(self, backup_dir='/tmp/backups'):
        """Write a gzip-compressed backup and return the archive path.

        Improvement over copy-then-compress-then-delete: the database file
        is streamed directly into the gzip archive, so no intermediate
        uncompressed copy (or its transient disk space) is needed.
        """
        os.makedirs(backup_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Same final name shape as before: duckdb_<ts>.db.gz
        compressed = os.path.join(backup_dir, f'duckdb_{timestamp}.db.gz')
        with open(self.db_path, 'rb') as f_in:
            with gzip.open(compressed, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        return compressed

    def restore_backup(self, backup_file, target_path=None):
        """Restore a backup to target_path (defaults to the original path).

        Accepts both gzip-compressed (.gz) archives and plain copies.
        Returns the path the database was restored to (new, backward-
        compatible: the original returned None).
        """
        if target_path is None:
            target_path = self.db_path
        if backup_file.endswith('.gz'):
            # Decompress the archive straight onto the target path.
            with gzip.open(backup_file, 'rb') as f_in:
                with open(target_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        else:
            # Plain file: preserve metadata with copy2.
            shutil.copy2(backup_file, target_path)
        return target_path
# Usage: back up analytics.db into /tmp/backups as a .gz archive.
backup = DuckDBBackup('analytics.db')
backup.create_backup()
Export/Import
-- Export specific tables
EXPORT DATABASE 'backup_directory' (
FORMAT CSV,
DELIMITER ',',
HEADER true
);
-- Import database
IMPORT DATABASE 'backup_directory';
Performance Monitoring
Query Profiling
-- Render profiling as a query tree after each statement.
SET enable_profiling = 'query_tree';
-- Run a query to profile. Fixed: the original used SELECT * with
-- GROUP BY, which is invalid SQL -- every selected column must be
-- grouped or aggregated.
SELECT category, COUNT(*) AS row_count FROM large_table GROUP BY category;
-- NOTE(review): duckdb_profiling() is not a documented table function;
-- profiling output is printed to the console or written to the file
-- named by the profiling_output setting -- verify.
SELECT * FROM duckdb_profiling();
-- JSON output suits programmatic consumption.
SET enable_profiling = 'json';
SELECT * FROM orders WHERE date > '2025-01-01';
Statistics
-- Column names, types, and nullability for a table.
PRAGMA table_info('employees');
-- Indexes defined on a table.
PRAGMA index_list('employees');
-- NOTE(review): 'duckdb_statistics()' does not appear in the DuckDB
-- documentation (duckdb_tables()/duckdb_columns() do) -- verify.
SELECT * FROM duckdb_statistics();
Production Patterns
Connection Pooling
import duckdb
import threading
from queue import Queue
class DuckDBPool:
    """A fixed-size pool of DuckDB connections backed by a Queue."""

    def __init__(self, db_path, pool_size=5):
        self.db_path = db_path
        self.pool = Queue(maxsize=pool_size)
        # Open every connection eagerly so callers never pay connect cost.
        for _ in range(pool_size):
            self.pool.put(duckdb.connect(db_path))

    def get_connection(self, timeout=30):
        """Borrow a connection; blocks for up to `timeout` seconds."""
        return self.pool.get(timeout=timeout)

    def return_connection(self, conn):
        """Hand a borrowed connection back to the pool."""
        self.pool.put(conn)

    def close_all(self):
        """Drain the pool, closing every idle connection."""
        while not self.pool.empty():
            self.pool.get().close()
# Usage: borrow a connection, query, and return it to the pool.
pool = DuckDBPool('analytics.db')
conn = pool.get_connection()
result = conn.execute("SELECT * FROM users").df()
# NOTE(review): the return should sit in a try/finally so an exception
# in the query does not leak the connection.
pool.return_connection(conn)
Background Processing
import duckdb
import threading
import queue
class DuckDBWorker:
    """Background query processor.

    A dedicated thread owns a single DuckDB connection and serially
    executes queries submitted via execute().
    """

    def __init__(self, db_path):
        self.db_path = db_path
        self.query_queue = queue.Queue()   # pending (query, args, callback) tuples
        self.result_queue = queue.Queue()  # results, or Exception objects on failure
        self.running = True                # loop flag polled by the worker thread
        self.thread = threading.Thread(target=self._process_queries)
        self.thread.start()

    def _process_queries(self):
        """Worker loop: drain the query queue until stop() clears running."""
        # Connection is created inside the thread so only this thread
        # ever touches it.
        conn = duckdb.connect(self.db_path)
        while self.running:
            try:
                # 1s timeout so the loop re-checks self.running regularly.
                query, args, callback = self.query_queue.get(timeout=1)
                try:
                    result = conn.execute(query, args or [])
                    if callback:
                        # Successful results go to the callback only;
                        # errors below still land on result_queue, so a
                        # caller must not mix the two styles.
                        callback(result.fetchall())
                    else:
                        self.result_queue.put(result.fetchall())
                except Exception as e:
                    # Errors are delivered as values: get_result() hands
                    # back the Exception object instead of raising it.
                    self.result_queue.put(e)
            except queue.Empty:
                continue
        conn.close()

    def execute(self, query, args=None, callback=None):
        """Submit query for background execution (non-blocking)."""
        self.query_queue.put((query, args, callback))

    def get_result(self, timeout=30):
        """Block up to `timeout` seconds for the next available result."""
        return self.result_queue.get(timeout=timeout)

    def stop(self):
        """Stop the worker; join may take up to ~1s (queue poll timeout)."""
        self.running = False
        self.thread.join()
# Usage: submit a query, then block for its result.
worker = DuckDBWorker('analytics.db')
worker.execute("SELECT COUNT(*) FROM events")
result = worker.get_result()  # may be an Exception instance on failure
worker.stop()
Application Integration
Flask Integration
from flask import Flask, request, jsonify
import duckdb

app = Flask(__name__)

def get_db():
    # read_only=True lets multiple processes share the database file;
    # a fresh short-lived connection per request avoids cross-request state.
    return duckdb.connect('analytics.db', read_only=True)

@app.route('/api/analytics')
def analytics():
    """Return 30-day summary stats over the events table as JSON."""
    con = get_db()
    try:
        result = con.execute("""
            SELECT
                COUNT(*) as total_events,
                COUNT(DISTINCT user_id) as unique_users,
                AVG(duration) as avg_duration
            FROM events
            WHERE created_at > CURRENT_DATE - INTERVAL '30 days'
        """).fetchone()
        return jsonify({
            'total_events': result[0],
            'unique_users': result[1],
            'avg_duration': result[2]
        })
    finally:
        # Always release the connection, even if the query fails.
        con.close()

@app.route('/api/users/<int:user_id>/activity')
def user_activity(user_id):
    """Return the user's 100 most recent activity rows as JSON records."""
    con = get_db()
    try:
        # Parameterized query: user_id is bound, never interpolated.
        df = con.execute("""
            SELECT date, action, duration
            FROM user_activity
            WHERE user_id = ?
            ORDER BY date DESC
            LIMIT 100
        """, [user_id]).df()
        return jsonify(df.to_dict(orient='records'))
    finally:
        con.close()
Streamlit Integration
import streamlit as st
import duckdb

# Cache one shared connection across script reruns.
@st.cache_resource
def get_connection():
    return duckdb.connect('analytics.db')

conn = get_connection()

# Page title
st.title("Analytics Dashboard")

# KPI metrics laid out in three columns.
col1, col2, col3 = st.columns(3)
with col1:
    total = conn.execute("SELECT COUNT(*) FROM orders").fetchone()[0]
    st.metric("Total Orders", f"{total:,}")
with col2:
    # NOTE(review): SUM/AVG return NULL (None) on an empty table, which
    # would crash the format specifier below -- guard or COALESCE.
    revenue = conn.execute("SELECT SUM(amount) FROM orders").fetchone()[0]
    st.metric("Total Revenue", f"${revenue:,.2f}")
with col3:
    avg_order = conn.execute("SELECT AVG(amount) FROM orders").fetchone()[0]
    st.metric("Avg Order Value", f"${avg_order:,.2f}")

# Monthly revenue line chart.
st.subheader("Orders by Month")
df = conn.execute("""
    SELECT
        STRFTIME('%Y-%m', order_date) as month,
        COUNT(*) as orders,
        SUM(amount) as revenue
    FROM orders
    GROUP BY month
    ORDER BY month
""").df()
st.line_chart(df.set_index('month')['revenue'])

# Latest 100 orders as an interactive table.
st.subheader("Recent Orders")
recent = conn.execute("""
    SELECT order_id, customer, amount, order_date
    FROM orders
    ORDER BY order_date DESC
    LIMIT 100
""").df()
st.dataframe(recent)
Best Practices
Configuration Checklist
# Production DuckDB configuration baseline.
config = {
    'memory_limit': '8GB',   # Adjust based on available RAM
    'threads': 8,            # Number of CPU cores
    'checkpoint_threshold': '16GB',  # WAL size that triggers a checkpoint
    'enable_profiling': 'query_tree',  # For debugging; disable in production
    # NOTE(review): 'wal_mode' is not a documented DuckDB option (the WAL
    # is always on) -- verify before shipping this config.
    'wal_mode': True,  # Enable WAL for durability
}
Operational Guidelines
-- Checkpoint periodically to bound WAL size and recovery time.
CHECKPOINT;
-- NOTE(review): duckdb_storage_info() is not documented; the documented
-- way to monitor file size is PRAGMA database_size -- verify.
SELECT * FROM duckdb_storage_info();
-- Temporary spill files are cleaned up automatically by DuckDB.
Common Pitfalls
Pitfall 1: Insufficient Memory
# Bad: the default limit (80% of RAM) may be wrong for a shared host.
con = duckdb.connect('data.db')
# Good: size the memory limit explicitly for the machine and workload.
con = duckdb.connect('data.db', config={'memory_limit': '16GB'})
Pitfall 2: Not Using Parallelism
-- Threads default to the CPU core count; verify the effective value.
SELECT current_setting('threads');
-- Raise for heavy workloads (diminishing returns past physical cores).
SET threads = 16;
Pitfall 3: Large Results in Memory
# Bad: fetchall() materializes every row as Python objects.
results = con.execute("SELECT * FROM huge_table").fetchall()
# NOTE(review): .df() ALSO materializes the full result in memory (as a
# DataFrame, just more compactly); for truly huge results prefer streaming
# via fetch_record_batch()/fetch_df_chunk() -- confirm before publishing.
df = con.execute("SELECT * FROM huge_table").df()
Resources
Conclusion
DuckDB’s operational simplicity makes it an excellent choice for production analytical workloads. By understanding its configuration options and applying proper memory and parallelism settings, you can achieve strong, predictable performance.
In the next article, we’ll explore DuckDB’s internal architecture, including vectorized execution, columnar storage, and query processing.
Comments