Introduction
While DuckDB is designed for simplicity, optimizing its performance requires understanding its configuration options and operational characteristics. As an embedded analytical database, DuckDB runs in-process with your application, making proper configuration crucial for production workloads.
This guide covers DuckDB operations including configuration, performance tuning, memory management, and production deployment patterns.
Configuration Overview
Settings Categories
DuckDB provides several configuration categories:
-- List every configuration option with its current value, description, and type.
SELECT * FROM duckdb_settings();
-- Read a single setting's current value.
SELECT current_setting('threads');
-- Set for the current session only.
SET threads = 8;
-- Set for all sessions attached to this database instance.
SET GLOBAL threads = 8;
Important Settings
-- Degree of intra-query parallelism (default: number of CPU cores).
SET threads = 8; -- Default: number of CPU cores
-- Hard cap on buffer-managed memory (default: 80% of available RAM).
SET memory_limit = '8GB'; -- Default: 80% of available RAM
-- Where operators spill to disk when they exceed memory_limit.
SET temp_directory = '/tmp/duckdb';
-- WAL size at which an automatic checkpoint is triggered.
SET checkpoint_threshold = '16GB';
Memory Management
Configuring Memory
import duckdb
# Connect with memory limit
con = duckdb.connect('analytics.db', config={
'memory_limit': '8GB',
'threads': 8
})
# Check memory usage
result = con.execute("""
SELECT
current_setting('memory_limit') as limit,
(SELECT SUM(memory_usage) FROM duckdb_memory()) as used
""").fetchall()
print(result)
# Monitor memory during query
con.execute("SET enable_profiling = 'json'")
con.execute("SELECT * FROM huge_table")
Memory-Mapped I/O
-- NOTE(review): neither 'enable_mmap' nor 'mmap_size' appears among
-- DuckDB's documented configuration options (check duckdb_settings());
-- verify these against the target DuckDB version before publishing.
SET enable_mmap = true;
SET mmap_size = '4GB';
Query Optimization
Execution Plans
-- Show the planned operators without executing the query.
EXPLAIN SELECT * FROM employees WHERE department = 'Engineering';
-- Execute and report per-operator timings and row counts.
EXPLAIN ANALYZE SELECT department, AVG(salary) FROM employees GROUP BY department;
-- Emit machine-readable profiling output for subsequent statements.
SET enable_profiling = 'json';
SELECT * FROM employees;
-- NOTE(review): duckdb_profiling() is not a documented table function;
-- profiling output is normally read via EXPLAIN ANALYZE or the
-- profiling_output setting -- verify.
SELECT * FROM duckdb_profiling();
Indexes
-- ART indexes mainly help highly selective point lookups; DuckDB's scans
-- are already fast, so index only columns that are filtered frequently.
CREATE INDEX idx_employees_department ON employees(department);
CREATE INDEX idx_employees_salary ON employees(salary);
-- Confirm in the plan that the predicate actually uses the index.
EXPLAIN SELECT * FROM employees WHERE department = 'Sales';
-- Drop an index whose write overhead no longer pays off.
DROP INDEX idx_employees_department;
Query Hints
-- NOTE(review): DuckDB has no join-order hints; the optimizer chooses
-- the join order regardless of how the joins are written.
SELECT * FROM employees e
JOIN departments d ON e.dept_id = d.id
JOIN projects p ON e.project_id = p.id;
-- Single-threaded execution simplifies debugging and profiling.
SET threads = 1;
-- NOTE(review): 'USING SCAN' is not valid DuckDB syntax; there is no
-- documented way to force a specific scan -- verify before publishing.
SELECT * FROM employees USING SCAN employees_idx;
Parallelism
Thread Configuration
-- Cap worker threads for this session.
SET threads = 8;
-- Confirm the effective thread count.
SELECT current_setting('threads');
-- NOTE(review): duckdb_threads() is not a documented table function --
-- verify against the target DuckDB version.
SELECT * FROM duckdb_threads();
Parallel I/O
-- NOTE(review): 'parallel_csv' is not a documented DuckDB setting; the
-- CSV reader parallelizes by default -- verify.
SET parallel_csv = true;
-- NOTE(review): 'batch_size' is likewise undocumented; DuckDB's vector
-- size is fixed at build time -- verify.
SET batch_size = 1024;
Storage and Checkpointing
Database Files
-- NOTE(review): duckdb_storage_info() is not a documented function;
-- PRAGMA database_size is the documented way to inspect file size -- verify.
SELECT * FROM duckdb_storage_info();
-- Flush the WAL into the main database file, then reclaim free space.
CHECKPOINT;
VACUUM;
-- NOTE(review): OPTIMIZE TABLE is not DuckDB syntax -- verify or remove.
OPTIMIZE TABLE employees;
WAL Mode
-- NOTE(review): DuckDB always uses a write-ahead log; 'wal_mode' is not
-- a documented setting -- verify.
SET wal_mode = true;
-- NOTE(review): the documented knob is checkpoint_threshold (alias
-- wal_autocheckpoint), not 'wal_size_limit' -- verify.
SET wal_size_limit = '4GB';
Backup and Recovery
Database Backup
# Backup: simply copy the database file.
# NOTE(review): only safe when no process has the file open for writing;
# checkpoint (and ideally close) the database first for a consistent copy.
cp analytics.db analytics_backup.db
# Backup with compression.
# NOTE(review): the DuckDB CLI has no sqlite-style ".backup" dot command;
# EXPORT DATABASE is the documented approach -- verify this pipeline.
duckdb -cmd ".backup analytics.db" | gzip > backup_$(date +%Y%m%d).db.gz
Python Backup Script
import shutil
import gzip
from datetime import datetime
import os
class DuckDBBackup:
    """Create and restore gzip-compressed copies of a DuckDB database file.

    NOTE(review): this works on the raw file; for a consistent snapshot
    the database should be checkpointed (and ideally closed) before
    backing up -- confirm against the writer's lifecycle.
    """

    def __init__(self, db_path):
        # Path of the DuckDB database file to back up / restore over.
        self.db_path = db_path

    def create_backup(self, backup_dir='/tmp/backups'):
        """Write a gzip-compressed backup and return the archive path.

        Improvement over copy-then-compress-then-delete: the database file
        is streamed directly into the gzip archive, so no intermediate
        uncompressed copy (or its transient disk space) is needed.
        """
        os.makedirs(backup_dir, exist_ok=True)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # Same final name shape as before: duckdb_<ts>.db.gz
        compressed = os.path.join(backup_dir, f'duckdb_{timestamp}.db.gz')
        with open(self.db_path, 'rb') as f_in:
            with gzip.open(compressed, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        return compressed

    def restore_backup(self, backup_file, target_path=None):
        """Restore a backup to target_path (defaults to the original path).

        Accepts both gzip-compressed (.gz) archives and plain copies.
        Returns the path the database was restored to (new, backward-
        compatible: the original returned None).
        """
        if target_path is None:
            target_path = self.db_path
        if backup_file.endswith('.gz'):
            # Decompress the archive straight onto the target path.
            with gzip.open(backup_file, 'rb') as f_in:
                with open(target_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        else:
            # Plain file: preserve metadata with copy2.
            shutil.copy2(backup_file, target_path)
        return target_path
# Usage: back up analytics.db into /tmp/backups as a .gz archive.
backup = DuckDBBackup('analytics.db')
backup.create_backup()
Export/Import
-- Export specific tables
EXPORT DATABASE 'backup_directory' (
FORMAT CSV,
DELIMITER ',',
HEADER true
);
-- Import database
IMPORT DATABASE 'backup_directory';
Performance Monitoring
Query Profiling
-- Render profiling as a query tree after each statement.
SET enable_profiling = 'query_tree';
-- Run a query to profile. Fixed: the original used SELECT * with
-- GROUP BY, which is invalid SQL -- every selected column must be
-- grouped or aggregated.
SELECT category, COUNT(*) AS row_count FROM large_table GROUP BY category;
-- NOTE(review): duckdb_profiling() is not a documented table function;
-- profiling output is printed to the console or written to the file
-- named by the profiling_output setting -- verify.
SELECT * FROM duckdb_profiling();
-- JSON output suits programmatic consumption.
SET enable_profiling = 'json';
SELECT * FROM orders WHERE date > '2025-01-01';
Statistics
-- Column names, types, and nullability for a table.
PRAGMA table_info('employees');
-- Indexes defined on a table.
PRAGMA index_list('employees');
-- NOTE(review): 'duckdb_statistics()' does not appear in the DuckDB
-- documentation (duckdb_tables()/duckdb_columns() do) -- verify.
SELECT * FROM duckdb_statistics();
Production Patterns
Connection Pooling
import duckdb
import threading
from queue import Queue
class DuckDBPool:
    """A fixed-size pool of DuckDB connections backed by a Queue."""

    def __init__(self, db_path, pool_size=5):
        self.db_path = db_path
        self.pool = Queue(maxsize=pool_size)
        # Open every connection eagerly so callers never pay connect cost.
        for _ in range(pool_size):
            self.pool.put(duckdb.connect(db_path))

    def get_connection(self, timeout=30):
        """Borrow a connection; blocks for up to `timeout` seconds."""
        return self.pool.get(timeout=timeout)

    def return_connection(self, conn):
        """Hand a borrowed connection back to the pool."""
        self.pool.put(conn)

    def close_all(self):
        """Drain the pool, closing every idle connection."""
        while not self.pool.empty():
            self.pool.get().close()
# Usage: borrow a connection, query, and return it to the pool.
pool = DuckDBPool('analytics.db')
conn = pool.get_connection()
result = conn.execute("SELECT * FROM users").df()
# NOTE(review): the return should sit in a try/finally so an exception
# in the query does not leak the connection.
pool.return_connection(conn)
Background Processing
import duckdb
import threading
import queue
class DuckDBWorker:
    """Background query processor.

    A dedicated thread owns a single DuckDB connection and serially
    executes queries submitted via execute().
    """

    def __init__(self, db_path):
        self.db_path = db_path
        self.query_queue = queue.Queue()   # pending (query, args, callback) tuples
        self.result_queue = queue.Queue()  # results, or Exception objects on failure
        self.running = True                # loop flag polled by the worker thread
        self.thread = threading.Thread(target=self._process_queries)
        self.thread.start()

    def _process_queries(self):
        """Worker loop: drain the query queue until stop() clears running."""
        # Connection is created inside the thread so only this thread
        # ever touches it.
        conn = duckdb.connect(self.db_path)
        while self.running:
            try:
                # 1s timeout so the loop re-checks self.running regularly.
                query, args, callback = self.query_queue.get(timeout=1)
                try:
                    result = conn.execute(query, args or [])
                    if callback:
                        # Successful results go to the callback only;
                        # errors below still land on result_queue, so a
                        # caller must not mix the two styles.
                        callback(result.fetchall())
                    else:
                        self.result_queue.put(result.fetchall())
                except Exception as e:
                    # Errors are delivered as values: get_result() hands
                    # back the Exception object instead of raising it.
                    self.result_queue.put(e)
            except queue.Empty:
                continue
        conn.close()

    def execute(self, query, args=None, callback=None):
        """Submit query for background execution (non-blocking)."""
        self.query_queue.put((query, args, callback))

    def get_result(self, timeout=30):
        """Block up to `timeout` seconds for the next available result."""
        return self.result_queue.get(timeout=timeout)

    def stop(self):
        """Stop the worker; join may take up to ~1s (queue poll timeout)."""
        self.running = False
        self.thread.join()
# Usage: submit a query, then block for its result.
worker = DuckDBWorker('analytics.db')
worker.execute("SELECT COUNT(*) FROM events")
result = worker.get_result()  # may be an Exception instance on failure
worker.stop()
Application Integration
Flask Integration
from flask import Flask, request, jsonify
import duckdb

app = Flask(__name__)

def get_db():
    # read_only=True lets multiple processes share the database file;
    # a fresh short-lived connection per request avoids cross-request state.
    return duckdb.connect('analytics.db', read_only=True)

@app.route('/api/analytics')
def analytics():
    """Return 30-day summary stats over the events table as JSON."""
    con = get_db()
    try:
        result = con.execute("""
            SELECT
                COUNT(*) as total_events,
                COUNT(DISTINCT user_id) as unique_users,
                AVG(duration) as avg_duration
            FROM events
            WHERE created_at > CURRENT_DATE - INTERVAL '30 days'
        """).fetchone()
        return jsonify({
            'total_events': result[0],
            'unique_users': result[1],
            'avg_duration': result[2]
        })
    finally:
        # Always release the connection, even if the query fails.
        con.close()

@app.route('/api/users/<int:user_id>/activity')
def user_activity(user_id):
    """Return the user's 100 most recent activity rows as JSON records."""
    con = get_db()
    try:
        # Parameterized query: user_id is bound, never interpolated.
        df = con.execute("""
            SELECT date, action, duration
            FROM user_activity
            WHERE user_id = ?
            ORDER BY date DESC
            LIMIT 100
        """, [user_id]).df()
        return jsonify(df.to_dict(orient='records'))
    finally:
        con.close()
Streamlit Integration
import streamlit as st
import duckdb

# Cache one shared connection across script reruns.
@st.cache_resource
def get_connection():
    return duckdb.connect('analytics.db')

conn = get_connection()

# Page title
st.title("Analytics Dashboard")

# KPI metrics laid out in three columns.
col1, col2, col3 = st.columns(3)
with col1:
    total = conn.execute("SELECT COUNT(*) FROM orders").fetchone()[0]
    st.metric("Total Orders", f"{total:,}")
with col2:
    # NOTE(review): SUM/AVG return NULL (None) on an empty table, which
    # would crash the format specifier below -- guard or COALESCE.
    revenue = conn.execute("SELECT SUM(amount) FROM orders").fetchone()[0]
    st.metric("Total Revenue", f"${revenue:,.2f}")
with col3:
    avg_order = conn.execute("SELECT AVG(amount) FROM orders").fetchone()[0]
    st.metric("Avg Order Value", f"${avg_order:,.2f}")

# Monthly revenue line chart.
st.subheader("Orders by Month")
df = conn.execute("""
    SELECT
        STRFTIME('%Y-%m', order_date) as month,
        COUNT(*) as orders,
        SUM(amount) as revenue
    FROM orders
    GROUP BY month
    ORDER BY month
""").df()
st.line_chart(df.set_index('month')['revenue'])

# Latest 100 orders as an interactive table.
st.subheader("Recent Orders")
recent = conn.execute("""
    SELECT order_id, customer, amount, order_date
    FROM orders
    ORDER BY order_date DESC
    LIMIT 100
""").df()
st.dataframe(recent)
Best Practices
Configuration Checklist
# Production DuckDB configuration baseline.
config = {
    'memory_limit': '8GB',   # Adjust based on available RAM
    'threads': 8,            # Number of CPU cores
    'checkpoint_threshold': '16GB',  # WAL size that triggers a checkpoint
    'enable_profiling': 'query_tree',  # For debugging; disable in production
    # NOTE(review): 'wal_mode' is not a documented DuckDB option (the WAL
    # is always on) -- verify before shipping this config.
    'wal_mode': True,  # Enable WAL for durability
}
Operational Guidelines
-- Checkpoint periodically to bound WAL size and recovery time.
CHECKPOINT;
-- NOTE(review): duckdb_storage_info() is not documented; the documented
-- way to monitor file size is PRAGMA database_size -- verify.
SELECT * FROM duckdb_storage_info();
-- Temporary spill files are cleaned up automatically by DuckDB.
Common Pitfalls
Pitfall 1: Insufficient Memory
# Bad: the default limit (80% of RAM) may be wrong for a shared host.
con = duckdb.connect('data.db')
# Good: size the memory limit explicitly for the machine and workload.
con = duckdb.connect('data.db', config={'memory_limit': '16GB'})
Pitfall 2: Not Using Parallelism
-- Threads default to the CPU core count; verify the effective value.
SELECT current_setting('threads');
-- Raise for heavy workloads (diminishing returns past physical cores).
SET threads = 16;
Pitfall 3: Large Results in Memory
# Bad: fetchall() materializes every row as Python objects.
results = con.execute("SELECT * FROM huge_table").fetchall()
# NOTE(review): .df() ALSO materializes the full result in memory (as a
# DataFrame, just more compactly); for truly huge results prefer streaming
# via fetch_record_batch()/fetch_df_chunk() -- confirm before publishing.
df = con.execute("SELECT * FROM huge_table").df()
Resources
Conclusion
DuckDB’s operational simplicity makes it an excellent choice for production analytical workloads. By understanding its configuration options and applying proper memory and parallelism settings, you can achieve strong, predictable performance.
In the next article, we’ll explore DuckDB’s internal architecture, including vectorized execution, columnar storage, and query processing.
Comments