Skip to main content
โšก Calmops

Database Indexing Strategies: B-Tree, Hash, GIN, and More

Indexes are the key to database performance. The right index can make queries 1000x faster, while the wrong index wastes storage and slows down writes.

In this guide, we’ll explore different index types, when to use them, and how to optimize your indexing strategy.

Understanding Indexes

How Indexes Work

โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚                  Table Scan vs Index Scan                    โ”‚
โ”‚                                                             โ”‚
โ”‚   Table Scan: O(n)                                          โ”‚
โ”‚   โ”Œโ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”                     โ”‚
โ”‚   โ”‚ 1  โ”‚ 2  โ”‚ 3  โ”‚ 4  โ”‚ 5  โ”‚ 6  โ”‚ 7  โ”‚                     โ”‚
โ”‚   โ””โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”˜                     โ”‚
โ”‚   Read ALL rows to find one!                                โ”‚
โ”‚                                                             โ”‚
โ”‚   Index Scan: O(log n)                                      โ”‚
โ”‚   Index: {1โ†’0x1, 2โ†’0x5, 3โ†’0x9, 4โ†’0x2, ...}               โ”‚
โ”‚                                                             โ”‚
โ”‚   B-Tree:                                                   โ”‚
โ”‚                    [5]                                      โ”‚
โ”‚                  /    \                                    โ”‚
โ”‚               [2]      [8]                                 โ”‚
โ”‚              /  \     /  \                                 โ”‚
โ”‚            [1]  [3] [6]  [9]                              โ”‚
โ”‚                                                             โ”‚
โ”‚   Jump directly to the row!                                 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜

Index Trade-offs

# Index pros and cons

index_tradeoffs = {
    "pros": [
        "Dramatically faster reads",
        "Enable unique constraints",
        "Support range queries",
        "Enable ORDER BY optimization"
    ],
    
    "cons": [
        "Slow down writes (INSERT/UPDATE/DELETE)",
        "Consume storage space",
        "Need maintenance (rebuilds)",
        "Can cause locking during creation"
    ]
}

B-Tree Indexes

How B-Tree Works

# B-Tree (Balanced Tree) - Most common index type

b_tree_characteristics = {
    "structure": "Balanced tree with multiple levels",
    "operations": "O(log n) for search, insert, delete",
    "order": "Sorted, supports range queries",
    "use_when": "General purpose, =, <, >, BETWEEN"
}

# Example B-Tree structure (order 3)
"""
        [30]
       /    \
    [10,20]  [40,50]
    /  |  \   /  |  \
  [5] [15][25][35][45][55]
"""

Creating B-Tree Indexes

-- PostgreSQL
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_orders_date ON orders(created_at DESC);
CREATE INDEX idx_products_price ON products(price) WHERE price > 0;

-- MySQL (B-Tree is default)
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_orders_date ON orders(created_at);

-- Composite index
CREATE INDEX idx_orders_user_date ON orders(user_id, created_at);

Composite Index Order

-- Composite index: (a, b, c)
-- Can be used for:
-- โœ“ WHERE a = 1
-- โœ“ WHERE a = 1 AND b = 2
-- โœ“ WHERE a = 1 AND b = 2 AND c = 3
-- โœ“ WHERE a = 1 AND b > 10

-- Cannot efficiently use:
-- โœ— WHERE b = 2        (a not specified)
-- โœ— WHERE b = 2 AND c = 3  (a not specified)
-- โœ— WHERE c = 3        (a, b not specified)

-- Rule: Left-to-right, equality first, then range

Hash Indexes

How Hash Works

# Hash Index - Key-value lookups

hash_characteristics = {
    "structure": "Hash table",
    "operations": "O(1) average, O(n) worst",
    "order": "Unordered",
    "use_when": "Simple equality (=) only"
}

# Hash collision handling
hash_collision = {
    "chaining": "Linked list at each bucket",
    "open_addressing": "Find next empty slot",
    "rehashing": "New hash function"
}

Creating Hash Indexes

-- PostgreSQL
CREATE INDEX idx_users_email_hash ON users USING HASH (email);

-- MySQL (MEMORY/InnoDB adaptive hash)
-- Not manually created, auto-tuned

-- Redis
HSET user:1 name "John"
HGET user:1 name

GIN Indexes

When to Use GIN

# GIN (Generalized Inverted Index)

gin_characteristics = {
    "for": [
        "Full-text search",
        "JSONB columns",
        "Arrays",
        "HSTORE (key-value)"
    ],
    
    "structure": "Inverted index - maps values to locations",
    "write_performance": "Slower than B-Tree",
    "read_performance": "Much faster for specific types"
}

GIN Examples

-- Full-text search
CREATE INDEX idx_articles_fts ON articles 
USING GIN(to_tsvector('english', title || ' ' || body));

-- Query
SELECT * FROM articles 
WHERE to_tsvector('english', title || ' ' || body) 
@@ to_tsquery('english', 'python & tutorial');

-- JSONB
CREATE INDEX idx_orders_data ON orders USING GIN(data);

-- Query JSON
SELECT * FROM orders 
WHERE data->>'status' = 'shipped';

-- Array
CREATE INDEX idx_tags ON articles USING GIN(tags);

-- Query array
SELECT * FROM articles 
WHERE tags && ARRAY['python', 'tutorial'];

GiST Indexes

When to Use GiST

# GiST (Generalized Search Tree)

gist_characteristics = {
    "for": [
        "Geospatial data (PostGIS)",
        "Range types",
        "Full-text search (tsvector)",
        "Custom data types"
    ],
    
    "structure": "Balanced tree, stores bounding boxes",
    "flexible": "Can be extended for new data types"
}

GiST Examples

-- PostGIS geospatial
CREATE INDEX idx_locations ON locations USING GIST(geom);

-- Range types
CREATE INDEX idx_reservations ON reservations 
USING GIST(daterange);

-- Query ranges
SELECT * FROM reservations 
WHERE daterange '[2025-01-01, 2025-01-07)' 
&& daterange '[2025-01-05, 2025-01-10)';

Index Strategies

Choosing Indexes

# Index selection strategy

def analyze_queries_for_indexing(queries):
    """Analyze which columns need indexes"""
    
    recommendations = []
    
    for query in queries:
        # Check WHERE clauses
        for condition in query.where_conditions:
            if condition.type == "equality":
                recommendations.append({
                    "column": condition.column,
                    "type": "B-Tree",
                    "reason": "Equality filter"
                })
            
            elif condition.type == "range":
                recommendations.append({
                    "column": condition.column,
                    "type": "B-Tree",
                    "reason": "Range query"
                })
            
            elif condition.type == "text_search":
                recommendations.append({
                    "column": condition.column,
                    "type": "GIN",
                    "reason": "Full-text search"
                })
        
        # Check JOINs
        for join in query.joins:
            recommendations.append({
                "column": join.column,
                "type": "B-Tree",
                "reason": "JOIN condition"
            })
        
        # Check ORDER BY
        for order in query.order_by:
            recommendations.append({
                "column": order.column,
                "type": "B-Tree",
                "reason": "ORDER BY"
            })
    
    return recommendations

Covering Indexes

-- Covering index - include all columns needed

-- Instead of just indexing
CREATE INDEX idx_orders_user_id ON orders(user_id);

-- Create covering index
CREATE INDEX idx_orders_user_id 
ON orders(user_id) 
INCLUDE (created_at, total, status);

-- Query can be satisfied entirely from index
SELECT created_at, total, status 
FROM orders 
WHERE user_id = 123;
-- No table lookup needed! (Index Only Scan)

Partial Indexes

-- Index only some rows

-- Common: Active records
CREATE INDEX idx_active_users 
ON users(last_login) 
WHERE status = 'active';

-- Query that uses it
SELECT * FROM users 
WHERE status = 'active' 
ORDER BY last_login DESC;

-- Won't use index
SELECT * FROM users 
WHERE status = 'inactive';

Index Maintenance

Monitoring Index Usage

-- PostgreSQL - Find unused indexes
SELECT 
    schemaname,
    relname,
    indexrelname,
    idx_scan,
    idx_tup_read,
    idx_tup_fetch,
    pg_size_pretty(pg_relation_size(indexrelid)) as index_size
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND pg_relation_size(indexrelid) > 1024 * 1024;

-- MySQL - Find unused indexes
SELECT 
    object_schema,
    object_name,
    index_name
FROM performance_schema.table_io_waits_summary_by_index_usage
WHERE index_name IS NOT NULL
AND count_star = 0
AND object_schema = 'mydb';

-- Analyze query plans
EXPLAIN ANALYZE 
SELECT * FROM users WHERE email = '[email protected]';

Index Rebuilding

-- PostgreSQL - REINDEX
REINDEX INDEX CONCURRENTLY idx_users_email;
REINDEX TABLE CONCURRENTLY users;

-- MySQL
OPTIMIZE TABLE users;

-- When to rebuild:
-- 1. Index bloat (too much empty space)
-- 2. After many DELETEs
-- 3. If index size is growing faster than table

Common Mistakes

# Index mistakes to avoid

mistakes = {
    "too_many_indexes": {
        "problem": "Every write updates all indexes",
        "solution": "Profile writes vs reads first"
    },
    
    "wrong_column_order": {
        "problem": "Composite index not used",
        "solution": "Equality columns first, then range"
    },
    
    "function_on_indexed_column": {
        "problem": "WHERE LOWER(email) = 'john'",
        "solution": "Use expression index: idx ON LOWER(email)"
    },
    
    "not_using_index": {
        "problem": "Small table, full scan faster",
        "solution": "Check with EXPLAIN"
    },
    
    "foreign_keys_no_index": {
        "problem": "Slow JOINs on foreign keys",
        "solution": "Always index foreign key columns"
    }
}

# Good example
-- Table: orders(user_id, status, created_at)
CREATE INDEX idx_orders_query 
ON orders(user_id, status, created_at);

-- Query uses it efficiently
SELECT * FROM orders 
WHERE user_id = 123 
AND status = 'shipped' 
ORDER BY created_at DESC;

Query Optimization

# Use EXPLAIN to analyze

explain_analysis = {
    "good": [
        "Index Only Scan",
        "Index Scan using idx_xxx"
    ],
    
    "acceptable": [
        "Seq Scan on small table",
        "Bitmap Heap Scan"
    ],
    
    "bad": [
        "Seq Scan on large table",
        "Nested Loop with large inner",
        "Index Scan Backward"
    ]
}

# Example analysis
EXPLAIN (ANALYZE, BUFFERS) 
SELECT * FROM users 
WHERE email = '[email protected]';

-- Look for:
-- Execution time (< 1ms is good)
-- Rows (actual vs planned)
-- Index usage
-- Buffer hits vs reads

Conclusion

Indexing is crucial for database performance:

  • B-Tree: Default for most use cases
  • Hash: Fast equality, no range queries
  • GIN: Full-text, JSON, arrays
  • GiST: Geospatial, custom types

Profile your queries, use covering indexes, and monitor for unused indexes.


Comments