Introduction
Tries (prefix trees) and hash tables are fundamental data structures for efficient string operations and key-value storage respectively. This guide covers their implementations, trade-offs, and practical applications.
Trie (Prefix Tree)
A trie is a tree-based data structure for efficient string operations - storing and searching strings in O(m) time where m is the string length.
Trie Structure
Insert: "cat", "cap", "dog", "do"
(root)
/ | \
c d o
/ \ | |
a a o g
/ \
t p
- Each node represents a character
- Path from root to node = prefix
- Boolean marker indicates end of word
Implementation
class TrieNode:
def __init__(self):
self.children = {}
self.is_end = False
self.count = 0 # Number of words ending at this node
class Trie:
def __init__(self):
self.root = TrieNode()
def insert(self, word):
"""Insert a word into the trie. O(m)"""
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.is_end = True
node.count += 1
def search(self, word):
"""Search for exact word. O(m)"""
node = self._find_node(word)
return node is not None and node.is_end
def starts_with(self, prefix):
"""Check if any word starts with prefix. O(m)"""
return self._find_node(prefix) is not None
def _find_node(self, prefix):
"""Find node corresponding to prefix."""
node = self.root
for char in prefix:
if char not in node.children:
return None
node = node.children[char]
return node
def autocomplete(self, prefix):
"""Find all words starting with prefix."""
node = self._find_node(prefix)
if not node:
return []
results = []
self._collect_words(node, prefix, results)
return results
def _collect_words(self, node, prefix, results):
"""Recursively collect all words from node."""
if node.is_end:
results.append(prefix)
for char, child in node.children.items():
self._collect_words(child, prefix + char, results)
def delete(self, word):
"""Delete word from trie. O(m)"""
self._delete_recursive(self.root, word, 0)
def _delete_recursive(self, node, word, depth):
if depth == len(word):
if not node.is_end:
return False
node.is_end = False
node.count = 0
return len(node.children) == 0
char = word[depth]
if char not in node.children:
return False
should_delete_child = self._delete_recursive(
node.children[char], word, depth + 1
)
if should_delete_child:
del node.children[char]
return len(node.children) == 0 and not node.is_end
return False
def word_count(self, prefix):
"""Count how many words start with prefix."""
node = self._find_node(prefix)
return node.count if node else 0
def longest_common_prefix(self, words):
"""Find longest common prefix among words."""
if not words:
return ""
prefix = ""
node = self.root
while len(node.children) == 1 and not node.is_end:
char = list(node.children.keys())[0]
prefix += char
node = node.children[char]
return prefix
# Example usage
trie = Trie()
words = ["cat", "cap", "car", "dog", "do", "done", "running"]
for word in words:
trie.insert(word)
print(trie.search("cat")) # True
print(trie.search("ca")) # False
print(trie.starts_with("ca")) # True
print(trie.autocomplete("do")) # ['do', 'dog', 'done']
print(trie.longest_common_prefix(["flower", "flow", "flight"])) # "fl"
Practical Trie Applications
1. Autocomplete / Type-ahead
class AutocompleteSystem:
def __init__(self, sentences, times):
self.trie = Trie()
self.hot = {} # sentence -> frequency
for sentence, time in zip(sentences, times):
self.trie.insert(sentence)
self.hot[sentence] = time
def input(self, c):
"""Process each character of input."""
if c == '#':
# Save current query
word = ''.join(self.current)
self.trie.insert(word)
self.hot[word] = self.hot.get(word, 0) + 1
self.current = []
self.results = []
return []
# Build prefix
if not hasattr(self, 'current'):
self.current = []
self.current.append(c)
prefix = ''.join(self.current)
# Find matching sentences
matches = []
self._find_matches(self.trie.root, prefix, "", matches)
# Sort by frequency
matches.sort(key=lambda x: (-self.hot.get(x, 0), x))
return matches[:3]
def _find_matches(self, node, prefix, word, results):
if not results or len(results) < 3:
if node.is_end:
results.append(prefix + word)
for char, child in node.children.items():
self._find_matches(child, prefix, word + char, results)
2. Word Search II
def find_words(board, words):
"""Find all words from list in 2D board."""
trie = Trie()
for word in words:
trie.insert(word)
result = set()
rows, cols = len(board), len(board[0])
def dfs(r, c, node, path):
char = board[r][c]
if char not in node.children:
return
next_node = node.children[char]
new_path = path + char
if next_node.is_end:
result.add(new_path)
board[r][c] = '#' # Mark visited
for dr, dc in [(0, 1), (0, -1), (1, 0), (-1, 0)]:
nr, nc = r + dr, c + dc
if 0 <= nr < rows and 0 <= nc < cols:
dfs(nr, nc, next_node, new_path)
board[r][c] = char # Restore
for r in range(rows):
for c in range(cols):
dfs(r, c, trie.root, "")
return list(result)
# Example
board = [
['o', 'a', 'a', 'n'],
['e', 't', 'a', 'e'],
['i', 'h', 'k', 'r'],
['i', 'f', 'l', 'v']
]
words = ["oath", "pea", "eat", "rain"]
print(find_words(board, words)) # ["oath", "eat"]
Hash Table
A hash table stores key-value pairs using a hash function to compute an index into an array of buckets.
Hash Function Requirements
- Deterministic: Same key always produces same hash
- Uniform distribution: Keys spread evenly across buckets
- Fast computation: Hash function should be efficient
Hash Table Implementation
class HashTable:
def __init__(self, size=100):
self.size = size
self.buckets = [[] for _ in range(size)] # Separate chaining
def _hash(self, key):
"""Hash function."""
if isinstance(key, str):
hash_val = 0
for i, char in enumerate(key):
hash_val = (hash_val * 31 + ord(char)) % self.size
return hash_val
return hash(key) % self.size
def put(self, key, value):
"""Insert or update key-value pair. O(1) average."""
index = self._hash(key)
bucket = self.buckets[index]
for i, (k, v) in enumerate(bucket):
if k == key:
bucket[i] = (key, value)
return
bucket.append((key, value))
def get(self, key):
"""Get value by key. O(1) average."""
index = self._hash(key)
for k, v in self.buckets[index]:
if k == key:
return v
return None
def delete(self, key):
"""Delete key-value pair. O(1) average."""
index = self._hash(key)
bucket = self.buckets[index]
for i, (k, v) in enumerate(bucket):
if k == key:
del bucket[i]
return True
return False
def contains(self, key):
"""Check if key exists. O(1) average."""
return self.get(key) is not None
def keys(self):
"""Return all keys."""
result = []
for bucket in self.buckets:
for k, v in bucket:
result.append(k)
return result
def values(self):
"""Return all values."""
result = []
for bucket in self.buckets:
for k, v in bucket:
result.append(v)
return result
def items(self):
"""Return all key-value pairs."""
result = []
for bucket in self.buckets:
result.extend(bucket)
return result
def __len__(self):
return sum(len(bucket) for bucket in self.buckets)
Collision Handling
# 1. Separate Chaining (used above)
# - Each bucket contains a linked list or balanced tree
# - Simple to implement
# - Degrades gracefully with collisions
# 2. Open Addressing
class HashTableOpenAddressing:
def __init__(self, size=100):
self.size = size
self.keys = [None] * size
self.values = [None] * size
self.tombstones = [False] * size
def _hash(self, key):
return hash(key) % self.size
def _probe(self, key, index):
"""Linear probing: index = (hash + i) % size"""
i = 0
while i < self.size:
idx = (index + i) % self.size
if self.keys[idx] is None and not self.tombstones[idx]:
return idx, False # Empty slot found
if self.keys[idx] == key:
return idx, True # Key found
i += 1
return -1, False # Table full
def put(self, key, value):
index = self._hash(key)
idx, found = self._probe(key, index)
if idx == -1:
raise Exception("Hash table is full")
self.keys[idx] = key
self.values[idx] = value
self.tombstones[idx] = False
def get(self, key):
index = self._hash(key)
idx, found = self._probe(key, index)
if found:
return self.values[idx]
return None
def delete(self, key):
index = self._hash(key)
idx, found = self._probe(key, index)
if found:
self.keys[idx] = None
self.values[idx] = None
self.tombstones[idx] = True
return True
return False
Hash Table vs Trie Comparison
| Feature | Hash Table | Trie |
|---|---|---|
| Search | O(1) average | O(m) |
| Insert | O(1) average | O(m) |
| Delete | O(1) average | O(m) |
| Space | O(n) | O(ALPHABET * m * n) |
| Prefix search | Not efficient | O(m) |
| Sorted order | No | Yes (in-order) |
| Autocomplete | Limited | Excellent |
When to Use Which
Use Trie When:
- Prefix-based operations (autocomplete, search suggestions)
- Finding all words starting with a pattern
- IP routing (Longest Prefix Match)
- Dictionary/spell checker
- Phone number directory
Use Hash Table When:
- Key-value lookups
- Counting frequencies
- Caching/memoization
- Set membership testing
- General-purpose data storage
Python Built-in Types
# Dictionary is a hash table
d = {}
d["key"] = "value"
print(d.get("key")) # "value"
# Set uses hash table (no values)
s = {1, 2, 3}
print(1 in s) # True
# Collections.defaultdict
from collections import defaultdict
dd = defaultdict(list)
dd["key"].append(1)
# OrderedDict (Python 3.7+ dict maintains insertion order)
from collections import OrderedDict
Custom Hash Function Examples
def string_hash(s, size):
"""Polynomial rolling hash."""
hash_val = 0
prime = 31
for char in s:
hash_val = (hash_val * prime + ord(char)) % size
return hash_val
def custom_hash(obj, size):
"""Hash for custom objects."""
if isinstance(obj, tuple):
return hash(obj) % size
if isinstance(obj, list):
return sum(hash(x) for x in obj) % size
return hash(obj) % size
Load Factor and Rehashing
class DynamicHashTable:
def __init__(self, initial_size=16, load_factor_threshold=0.75):
self.size = initial_size
self.threshold = load_factor_threshold
self.count = 0
self.buckets = [[] for _ in range(self.size)]
def _resize(self):
"""Double size and rehash when load factor exceeded."""
old_buckets = self.buckets
self.size *= 2
self.buckets = [[] for _ in range(self.size)]
self.count = 0
for bucket in old_buckets:
for key, value in bucket:
self.put(key, value)
@property
def load_factor(self):
return self.count / self.size
def put(self, key, value):
if self.load_factor > self.threshold:
self._resize()
index = hash(key) % self.size
bucket = self.buckets[index]
for i, (k, v) in enumerate(bucket):
if k == key:
bucket[i] = (key, value)
return
bucket.append((key, value))
self.count += 1
Conclusion
Both tries and hash tables are essential data structures with distinct use cases. Tries excel at prefix-based operations and string manipulation problems, while hash tables provide fast key-value lookups. Understanding when to use each will help you choose the right tool for your algorithmic problems.
Comments