Skip to main content
โšก Calmops

Tries and Hash Tables: Complete Guide

Introduction

Tries (prefix trees) and hash tables are fundamental data structures for efficient string operations and key-value storage respectively. This guide covers their implementations, trade-offs, and practical applications.

Trie (Prefix Tree)

A trie is a tree-based data structure for efficient string operations - storing and searching strings in O(m) time where m is the string length.

Trie Structure

Insert: "cat", "cap", "dog", "do"

        (root)
       /  |  \
     c    d    o
    / \   |   |
   a   a  o   g
  /     \     
 t       p   

- Each node represents a character
- Path from root to node = prefix
- Boolean marker indicates end of word

Implementation

class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end = False
        self.count = 0  # Number of words ending at this node


class Trie:
    def __init__(self):
        self.root = TrieNode()
    
    def insert(self, word):
        """Insert a word into the trie. O(m)"""
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end = True
        node.count += 1
    
    def search(self, word):
        """Search for exact word. O(m)"""
        node = self._find_node(word)
        return node is not None and node.is_end
    
    def starts_with(self, prefix):
        """Check if any word starts with prefix. O(m)"""
        return self._find_node(prefix) is not None
    
    def _find_node(self, prefix):
        """Find node corresponding to prefix."""
        node = self.root
        for char in prefix:
            if char not in node.children:
                return None
            node = node.children[char]
        return node
    
    def autocomplete(self, prefix):
        """Find all words starting with prefix."""
        node = self._find_node(prefix)
        if not node:
            return []
        
        results = []
        self._collect_words(node, prefix, results)
        return results
    
    def _collect_words(self, node, prefix, results):
        """Recursively collect all words from node."""
        if node.is_end:
            results.append(prefix)
        
        for char, child in node.children.items():
            self._collect_words(child, prefix + char, results)
    
    def delete(self, word):
        """Delete word from trie. O(m)"""
        self._delete_recursive(self.root, word, 0)
    
    def _delete_recursive(self, node, word, depth):
        if depth == len(word):
            if not node.is_end:
                return False
            node.is_end = False
            node.count = 0
            return len(node.children) == 0
        
        char = word[depth]
        if char not in node.children:
            return False
        
        should_delete_child = self._delete_recursive(
            node.children[char], word, depth + 1
        )
        
        if should_delete_child:
            del node.children[char]
            return len(node.children) == 0 and not node.is_end
        
        return False
    
    def word_count(self, prefix):
        """Count how many words start with prefix."""
        node = self._find_node(prefix)
        return node.count if node else 0
    
    def longest_common_prefix(self, words):
        """Find longest common prefix among words."""
        if not words:
            return ""
        
        prefix = ""
        node = self.root
        
        while len(node.children) == 1 and not node.is_end:
            char = list(node.children.keys())[0]
            prefix += char
            node = node.children[char]
        
        return prefix


# Example usage
trie = Trie()
words = ["cat", "cap", "car", "dog", "do", "done", "running"]

for word in words:
    trie.insert(word)

print(trie.search("cat"))       # True
print(trie.search("ca"))        # False
print(trie.starts_with("ca"))   # True
print(trie.autocomplete("do"))  # ['do', 'dog', 'done']
print(trie.longest_common_prefix(["flower", "flow", "flight"]))  # "fl"

Practical Trie Applications

1. Autocomplete / Type-ahead

class AutocompleteSystem:
    def __init__(self, sentences, times):
        self.trie = Trie()
        self.hot = {}  # sentence -> frequency
        
        for sentence, time in zip(sentences, times):
            self.trie.insert(sentence)
            self.hot[sentence] = time
    
    def input(self, c):
        """Process each character of input."""
        if c == '#':
            # Save current query
            word = ''.join(self.current)
            self.trie.insert(word)
            self.hot[word] = self.hot.get(word, 0) + 1
            self.current = []
            self.results = []
            return []
        
        # Build prefix
        if not hasattr(self, 'current'):
            self.current = []
        self.current.append(c)
        prefix = ''.join(self.current)
        
        # Find matching sentences
        matches = []
        self._find_matches(self.trie.root, prefix, "", matches)
        
        # Sort by frequency
        matches.sort(key=lambda x: (-self.hot.get(x, 0), x))
        
        return matches[:3]
    
    def _find_matches(self, node, prefix, word, results):
        if not results or len(results) < 3:
            if node.is_end:
                results.append(prefix + word)
            
            for char, child in node.children.items():
                self._find_matches(child, prefix, word + char, results)

2. Word Search II

def find_words(board, words):
    """Find all words from list in 2D board."""
    trie = Trie()
    for word in words:
        trie.insert(word)
    
    result = set()
    rows, cols = len(board), len(board[0])
    
    def dfs(r, c, node, path):
        char = board[r][c]
        if char not in node.children:
            return
        
        next_node = node.children[char]
        new_path = path + char
        
        if next_node.is_end:
            result.add(new_path)
        
        board[r][c] = '#'  # Mark visited
        
        for dr, dc in [(0, 1), (0, -1), (1, 0), (-1, 0)]:
            nr, nc = r + dr, c + dc
            if 0 <= nr < rows and 0 <= nc < cols:
                dfs(nr, nc, next_node, new_path)
        
        board[r][c] = char  # Restore
    
    for r in range(rows):
        for c in range(cols):
            dfs(r, c, trie.root, "")
    
    return list(result)


# Example
board = [
    ['o', 'a', 'a', 'n'],
    ['e', 't', 'a', 'e'],
    ['i', 'h', 'k', 'r'],
    ['i', 'f', 'l', 'v']
]
words = ["oath", "pea", "eat", "rain"]
print(find_words(board, words))  # ["oath", "eat"]

Hash Table

A hash table stores key-value pairs using a hash function to compute an index into an array of buckets.

Hash Function Requirements

  1. Deterministic: Same key always produces same hash
  2. Uniform distribution: Keys spread evenly across buckets
  3. Fast computation: Hash function should be efficient

Hash Table Implementation

class HashTable:
    def __init__(self, size=100):
        self.size = size
        self.buckets = [[] for _ in range(size)]  # Separate chaining
    
    def _hash(self, key):
        """Hash function."""
        if isinstance(key, str):
            hash_val = 0
            for i, char in enumerate(key):
                hash_val = (hash_val * 31 + ord(char)) % self.size
            return hash_val
        return hash(key) % self.size
    
    def put(self, key, value):
        """Insert or update key-value pair. O(1) average."""
        index = self._hash(key)
        bucket = self.buckets[index]
        
        for i, (k, v) in enumerate(bucket):
            if k == key:
                bucket[i] = (key, value)
                return
        
        bucket.append((key, value))
    
    def get(self, key):
        """Get value by key. O(1) average."""
        index = self._hash(key)
        
        for k, v in self.buckets[index]:
            if k == key:
                return v
        
        return None
    
    def delete(self, key):
        """Delete key-value pair. O(1) average."""
        index = self._hash(key)
        bucket = self.buckets[index]
        
        for i, (k, v) in enumerate(bucket):
            if k == key:
                del bucket[i]
                return True
        
        return False
    
    def contains(self, key):
        """Check if key exists. O(1) average."""
        return self.get(key) is not None
    
    def keys(self):
        """Return all keys."""
        result = []
        for bucket in self.buckets:
            for k, v in bucket:
                result.append(k)
        return result
    
    def values(self):
        """Return all values."""
        result = []
        for bucket in self.buckets:
            for k, v in bucket:
                result.append(v)
        return result
    
    def items(self):
        """Return all key-value pairs."""
        result = []
        for bucket in self.buckets:
            result.extend(bucket)
        return result
    
    def __len__(self):
        return sum(len(bucket) for bucket in self.buckets)

Collision Handling

# 1. Separate Chaining (used above)
# - Each bucket contains a linked list or balanced tree
# - Simple to implement
# - Degrades gracefully with collisions

# 2. Open Addressing
class HashTableOpenAddressing:
    def __init__(self, size=100):
        self.size = size
        self.keys = [None] * size
        self.values = [None] * size
        self.tombstones = [False] * size
    
    def _hash(self, key):
        return hash(key) % self.size
    
    def _probe(self, key, index):
        """Linear probing: index = (hash + i) % size"""
        i = 0
        while i < self.size:
            idx = (index + i) % self.size
            
            if self.keys[idx] is None and not self.tombstones[idx]:
                return idx, False  # Empty slot found
            
            if self.keys[idx] == key:
                return idx, True  # Key found
            
            i += 1
        
        return -1, False  # Table full
    
    def put(self, key, value):
        index = self._hash(key)
        idx, found = self._probe(key, index)
        
        if idx == -1:
            raise Exception("Hash table is full")
        
        self.keys[idx] = key
        self.values[idx] = value
        self.tombstones[idx] = False
    
    def get(self, key):
        index = self._hash(key)
        idx, found = self._probe(key, index)
        
        if found:
            return self.values[idx]
        return None
    
    def delete(self, key):
        index = self._hash(key)
        idx, found = self._probe(key, index)
        
        if found:
            self.keys[idx] = None
            self.values[idx] = None
            self.tombstones[idx] = True
            return True
        return False

Hash Table vs Trie Comparison

Feature Hash Table Trie
Search O(1) average O(m)
Insert O(1) average O(m)
Delete O(1) average O(m)
Space O(n) O(ALPHABET * m * n)
Prefix search Not efficient O(m)
Sorted order No Yes (in-order)
Autocomplete Limited Excellent

When to Use Which

Use Trie When:

  • Prefix-based operations (autocomplete, search suggestions)
  • Finding all words starting with a pattern
  • IP routing (Longest Prefix Match)
  • Dictionary/spell checker
  • Phone number directory

Use Hash Table When:

  • Key-value lookups
  • Counting frequencies
  • Caching/memoization
  • Set membership testing
  • General-purpose data storage

Python Built-in Types

# Dictionary is a hash table
d = {}
d["key"] = "value"
print(d.get("key"))  # "value"

# Set uses hash table (no values)
s = {1, 2, 3}
print(1 in s)  # True

# Collections.defaultdict
from collections import defaultdict
dd = defaultdict(list)
dd["key"].append(1)

# OrderedDict (Python 3.7+ dict maintains insertion order)
from collections import OrderedDict

Custom Hash Function Examples

def string_hash(s, size):
    """Polynomial rolling hash."""
    hash_val = 0
    prime = 31
    
    for char in s:
        hash_val = (hash_val * prime + ord(char)) % size
    
    return hash_val


def custom_hash(obj, size):
    """Hash for custom objects."""
    if isinstance(obj, tuple):
        return hash(obj) % size
    if isinstance(obj, list):
        return sum(hash(x) for x in obj) % size
    return hash(obj) % size

Load Factor and Rehashing

class DynamicHashTable:
    def __init__(self, initial_size=16, load_factor_threshold=0.75):
        self.size = initial_size
        self.threshold = load_factor_threshold
        self.count = 0
        self.buckets = [[] for _ in range(self.size)]
    
    def _resize(self):
        """Double size and rehash when load factor exceeded."""
        old_buckets = self.buckets
        self.size *= 2
        self.buckets = [[] for _ in range(self.size)]
        self.count = 0
        
        for bucket in old_buckets:
            for key, value in bucket:
                self.put(key, value)
    
    @property
    def load_factor(self):
        return self.count / self.size
    
    def put(self, key, value):
        if self.load_factor > self.threshold:
            self._resize()
        
        index = hash(key) % self.size
        bucket = self.buckets[index]
        
        for i, (k, v) in enumerate(bucket):
            if k == key:
                bucket[i] = (key, value)
                return
        
        bucket.append((key, value))
        self.count += 1

Conclusion

Both tries and hash tables are essential data structures with distinct use cases. Tries excel at prefix-based operations and string manipulation problems, while hash tables provide fast key-value lookups. Understanding when to use each will help you choose the right tool for your algorithmic problems.

Comments