Vector databases have emerged as a critical infrastructure for AI applications. They enable efficient similarity search over high-dimensional embeddings, powering applications from semantic search to recommendation systems.
In this guide, we’ll explore what vector databases are, how they work, and how the leading solutions compare.
Understanding Vector Embeddings
What Are Embeddings?
# Text embeddings convert text to numerical vectors
# that capture semantic meaning
# Example: Word2Vec style embeddings
# (the trailing `...` stands in for the remaining dimensions of each vector)
word_embeddings = {
    "king": [0.9, 0.1, 0.3, ...],
    "queen": [0.85, 0.12, 0.28, ...],
    "man": [0.7, 0.2, 0.4, ...],
    "woman": [0.68, 0.22, 0.38, ...],
    "apple": [0.1, 0.8, 0.2, ...],
    "orange": [0.12, 0.78, 0.18, ...]
}
# Similar words have similar vectors, so vector arithmetic captures analogies:
# king - man + woman ≈ queen
Creating Embeddings
# Using OpenAI's text-embedding-ada-002
import openai
def get_embedding(text: str) -> list:
    """Return the embedding vector for *text* from OpenAI's API.

    NOTE(review): this uses the pre-1.0 `openai.Embedding.create` interface,
    which was removed in openai>=1.0 (replaced by `client.embeddings.create`)
    — confirm the pinned openai version before running.
    """
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text
    )
    # The API returns one embedding per input; we sent a single string.
    return response['data'][0]['embedding']
# Example embeddings: sentences 0 and 1 are paraphrases; 2 and 3 are unrelated
texts = [
    "The cat sat on the mat",
    "A feline resting on a rug",
    "The dog played in the park",
    "Machine learning is AI"
]
embeddings = [get_embedding(text) for text in texts]
print(f"Embedding dimension: {len(embeddings[0])}")  # 1536 for text-embedding-ada-002
# Similarity calculation
import numpy as np
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*, in [-1, 1].

    Args:
        a, b: equal-length numeric sequences (or numpy arrays).

    Returns:
        cos(angle between a and b); 1.0 means identical direction.
        Returns 0.0 when either vector has zero magnitude — the similarity
        is undefined there, and the original code divided by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0  # guard: zero vector has no direction
    return np.dot(a, b) / norm_product
# "cat" and "feline" should be similar
# (texts[0] and texts[1] are paraphrases; texts[0] and texts[2] are unrelated)
print(cosine_similarity(embeddings[0], embeddings[1]))  # High similarity
print(cosine_similarity(embeddings[0], embeddings[2]))  # Low similarity
Vector Database Concepts
Similarity Metrics
# Different distance/similarity metrics.
# FIX: the formula strings contained mojibake — UTF-8 Greek/math symbols
# mis-decoded as Thai characters (e.g. "ฮธ" for θ, "ร" for ×, "โ" for √,
# "ฮฃ" for Σ, "ยฒ" for ²). The intended symbols are restored below.
similarity_metrics = {
    "cosine_similarity": {
        "description": "Angle between vectors",
        "best_for": "Text embeddings, varying lengths",
        "formula": "cos(θ) = (A·B) / (||A|| × ||B||)"
    },
    "euclidean_distance": {
        "description": "Straight-line distance",
        "best_for": "Dense numerical data",
        "formula": "√(Σ(Ai - Bi)²)"
    },
    "dot_product": {
        "description": "Projection of one vector onto another",
        "best_for": "Normalized embeddings",
        "formula": "Σ(Ai × Bi)"
    },
    "manhattan_distance": {
        "description": "Sum of absolute differences",
        "best_for": "High-dimensional sparse data",
        "formula": "Σ|Ai - Bi|"
    }
}
Indexing Methods
# Vector indexing structures for fast (approximate) nearest-neighbour search.
indexing_methods = {
    "brute_force": {
        "description": "Compare against all vectors",
        # FIX: restored the multiplication sign that was mojibake ("ร" -> "×")
        "complexity": "O(n × d)",
        "accuracy": "100%",
        "use_when": "Small datasets (<10K)"
    },
    "hnsw": {
        "description": "Hierarchical Navigable Small World",
        "complexity": "O(log n)",
        "accuracy": "~95-99%",
        "use_when": "General purpose, balanced"
    },
    "ivf": {
        "description": "Inverted File Index",
        "complexity": "O(log n)",
        "accuracy": "~90-95%",
        "use_when": "Very large datasets"
    },
    "pq": {
        "description": "Product Quantization",
        "complexity": "O(1) lookup",
        "accuracy": "~80-90%",
        "use_when": "Memory constrained"
    }
}
Pinecone
Overview
# Pinecone Characteristics
# (illustrative summary for the article, not a real configuration file)
type: "Managed vector database"
deployment: "Cloud (AWS, GCP, Azure)"
pricing: "Free tier, then pay-per-use"
index_type: "Proprietary (SOTA)"  # SOTA = state of the art
scalability: "Serverless, auto-scaling"
Python Client
import pinecone
# Initialize the client.
# NOTE(review): `pinecone.init(...)` and `pod_type` belong to the pre-3.x
# "pinecone-client" package; pinecone>=3 uses `Pinecone(api_key=...)` with
# serverless/pod specs instead — confirm the pinned client version.
pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp")
# Create index (dimension must match the embedding model: ada-002 -> 1536)
pinecone.create_index(
    name="semantic-search",
    dimension=1536,
    metric="cosine",
    pod_type="p1"
)
# Connect to index
index = pinecone.Index("semantic-search")
# Upsert vectors: each record is an id + "values" (the vector) + optional metadata
vectors = [
    {"id": "vec1", "values": [0.1] * 1536, "metadata": {"text": "Hello world"}},
    {"id": "vec2", "values": [0.2] * 1536, "metadata": {"text": "Goodbye world"}},
    {"id": "vec3", "values": [0.3] * 1536, "metadata": {"text": "Hello there"}}
]
index.upsert(vectors)
# Query: the 2 nearest vectors under the index's metric, with metadata attached
query_vector = [0.15] * 1536
results = index.query(
    vector=query_vector,
    top_k=2,
    include_metadata=True
)
print(results)
# Delete a record by id
index.delete(ids=["vec1"])
Use Cases
# Typical Pinecone applications, keyed by application pattern.
pinecone_use_cases = dict(
    semantic_search=dict(
        description="Search by meaning, not keywords",
        example="E-commerce product search",
    ),
    recommendations=dict(
        description="Find similar items",
        example="Movie recommendations",
    ),
    anomaly_detection=dict(
        description="Find unusual patterns",
        example="Fraud detection",
    ),
    classification=dict(
        description="KNN-based classification",
        example="Document categorization",
    ),
)
Weaviate
Overview
# Weaviate Characteristics
# (illustrative summary for the article, not a real configuration file)
type: "Open-source vector database"
deployment: "Self-hosted, cloud (Weaviate Cloud)"
index_type: "Graph-based, HNSW"
features: "Native ML models, filters, CRUD"
Python Client
import weaviate
from weaviate import EmbeddedOptions  # NOTE(review): imported but never used in this snippet
# Connect to local Weaviate.
# NOTE(review): `weaviate.Client` is the v3 client API; the v4 client uses
# `weaviate.connect_to_local()` and a different query surface — confirm
# the pinned weaviate-client version.
client = weaviate.Client(
    url="http://localhost:8080",
    additional_headers={
        # Only needed when an OpenAI-backed module is configured server-side
        "X-OpenAI-Api-Key": "YOUR_OPENAI_KEY"
    }
)
# Define schema: one class ("Article") with three text properties.
# Vectors are produced server-side by the text2vec-transformers module.
schema = {
    "class": "Article",
    "description": "News articles",
    "vectorizer": "text2vec-transformers",
    "moduleConfig": {
        "text2vec-transformers": {
            # Exclude the class name from the text that gets vectorized
            "vectorizeClassName": False
        }
    },
    "properties": [
        {"name": "title", "dataType": ["text"]},
        {"name": "content", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]}
    ]
}
client.schema.create_class(schema)
# Add data: the object is embedded automatically by the configured vectorizer
data_object = {
    "title": "AI Revolution",
    "content": "Machine learning is transforming...",
    "category": "technology"
}
client.data_object.create(
    data_object,
    class_name="Article"
)
# Semantic search: nearText ranks objects by vector similarity to the concepts
result = client.query.get(
    "Article",
    ["title", "content", "category"]
).with_near_text({
    "concepts": ["artificial intelligence"]
}).with_limit(5).do()
print(result)
# With custom vectors: bypass the server-side vectorizer by supplying the
# embedding yourself (dimension must match the other vectors in the class)
client.data_object.create(
    data_object={"title": "Custom"},
    class_name="Article",
    vector=[0.1] * 1536  # Custom embedding
)
BM25 and Hybrid Search
# Weaviate supports hybrid search (BM25 + vector).
# `alpha` blends the two rankings: 0 = pure keyword (BM25), 1 = pure vector.
result = client.query.get(
    "Article",
    ["title", "content"]
).with_hybrid(
    query="artificial intelligence",
    alpha=0.5  # 0 = keyword, 1 = vector
).with_limit(5).do()
Chroma
Overview
# Chroma Characteristics
# (illustrative summary for the article, not a real configuration file)
type: "Open-source, embedded"
deployment: "In-memory, local, client-side"
index_type: "HNSW (via hnswlib)"
features: "Simple API, lightweight"
Python Client
import chromadb
from chromadb.config import Settings
# Initialize (in-memory; data is lost when the process exits)
client = chromadb.Client(Settings(
    anonymized_telemetry=False,
    allow_reset=True
))
# Create collection; "hnsw:space" selects the distance metric for the index
collection = client.create_collection(
    name="documents",
    metadata={"hnsw:space": "cosine"}
)
# Add documents: Chroma embeds them with the collection's embedding function
collection.add(
    documents=[
        "The cat sat on the mat",
        "A feline resting on a rug",
        "The dog played in the park",
        "Machine learning is AI"
    ],
    ids=["doc1", "doc2", "doc3", "doc4"],
    metadatas=[
        {"source": "book1", "category": "animals"},
        {"source": "book1", "category": "animals"},
        {"source": "book2", "category": "animals"},
        {"source": "book3", "category": "tech"}
    ]
)
# Query by text: the query is embedded the same way, then nearest-neighbour search
results = collection.query(
    query_texts=["cat resting"],
    n_results=2,
    where={"category": "animals"},  # Filter
    include=["documents", "distances"]
)
print(results)
# Query by vector.
# NOTE(review): the query embedding's dimension (768 here) must match the
# collection's embedding dimension; Chroma's default embedding model may
# produce a different size — confirm before running.
results = collection.query(
    query_embeddings=[[0.1] * 768],
    n_results=2
)
Persistence
# Persistent storage: data survives restarts, written under ./chroma_data
client = chromadb.PersistentClient(
    path="./chroma_data"
)
# With embedding function: Chroma calls the OpenAI API to embed text
# that is added to or queried against the collection
from chromadb.utils import embedding_functions
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="YOUR_KEY",
    model_name="text-embedding-ada-002"
)
collection = client.create_collection(
    name="documents",
    embedding_function=openai_ef
)
Comparison Matrix
| Feature | Pinecone | Weaviate | Chroma |
|---|---|---|---|
| Type | Managed | Open-source | Open-source |
| Deployment | Cloud only | Self-hosted + Cloud | Embedded |
| Scalability | Auto-scale | Horizontal | Limited |
| Filtering | Yes | Yes | Yes |
| Hybrid Search | Yes | Yes (BM25) | Limited |
| APIs | REST, Python | GraphQL, REST | Python |
| Free Tier | Yes (small) | Yes (self-hosted) | Yes |
| ML Integration | Limited | Native | Via embedding fn |
Use Case Examples
Semantic Search
# Build semantic search with Pinecone
def semantic_search(query, index, top_k=5):
    """Return the top_k indexed texts most similar to *query*.

    Args:
        query: free-text search string (embedded with `get_embedding`).
        index: Pinecone index whose vectors carry a "text" metadata field.
        top_k: number of matches to return.

    Returns:
        List of {"text": ..., "score": ...}; score is the similarity
        reported by Pinecone (for a cosine-metric index, higher = better).

    BUG FIX: the original returned `1 - r["score"]`, which converts a
    cosine *similarity* into a distance while still labelling it "score",
    so the best match printed the lowest value.
    """
    # Embed the query with the same model used for the indexed vectors
    query_embedding = get_embedding(query)
    # Search the index
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    return [
        {"text": r["metadata"]["text"], "score": r["score"]}
        for r in results["matches"]
    ]
# Example
# NOTE(review): relies on `index` and `get_embedding` defined earlier in the guide.
results = semantic_search("Find articles about AI", index)
for r in results:
    print(f"{r['score']:.2f}: {r['text']}")
Recommendation System
# Product recommendations using vector similarity
def recommend_products(product_id, index, user_history=None):
    """Recommend up to 10 products similar to *product_id*.

    Args:
        product_id: id of the anchor product (resolved via `get_product`).
        index: vector index exposing `query(vector=..., top_k=..., ...)`.
        user_history: optional iterable of product ids the user already
            bought; those are excluded from the results.

    Returns:
        The similar-product matches, minus anything in user_history.

    BUG FIX: the original evaluated `r["id"] not in user_history` with the
    default `user_history=None`, raising TypeError whenever the argument
    was omitted.
    """
    already_purchased = set(user_history) if user_history is not None else set()
    # Get product embedding
    product = get_product(product_id)
    # Find similar products
    similar = index.query(
        vector=product.embedding,
        top_k=10,
        include_metadata=True
    )
    # Filter out already purchased
    return [
        r for r in similar["matches"]
        if r["id"] not in already_purchased
    ]
# Collaborative filtering with vectors
def collaborative_recommend(user_id, user_embeddings, product_embeddings, index):
    """Recommend products whose vectors match the user's preference vector.

    Args:
        user_id: key into `user_embeddings`.
        user_embeddings: mapping of user id -> preference vector.
        product_embeddings: accepted for interface compatibility; unused here.
        index: vector index supporting metadata-filtered queries.

    BUG FIX: the original `filter={...}` literal was missing its closing
    brace, making the whole snippet a SyntaxError.

    NOTE(review): `user_preferences` is not defined in this file —
    presumably a module-level mapping of user id -> preferred categories;
    confirm against the surrounding project.
    """
    # User's preference vector
    user_pref = user_embeddings[user_id]
    # Find products matching the preference, restricted to preferred categories
    results = index.query(
        vector=user_pref,
        top_k=20,
        filter={"category": {"$in": user_preferences[user_id]}}
    )
    return results
RAG (Retrieval-Augmented Generation)
# RAG with vector database and LLM
class RAGSystem:
    """Minimal retrieval-augmented generation pipeline.

    Combines a vector store (for indexing and similarity search) with an
    LLM (for answer generation).

    NOTE(review): `vector_db` is assumed to expose `upsert(record_dict)`
    and `query(vector=..., top_k=...)` returning
    {"matches": [{"metadata": {...}}, ...]}. That does not match the
    Pinecone client shown earlier (whose upsert takes a list and uses a
    "values" key) — confirm which adapter is intended. `llm` must expose
    `generate(prompt) -> str`.
    """
    def __init__(self, vector_db, llm):
        self.db = vector_db  # vector store used for retrieval
        self.llm = llm  # language model used for generation

    def index_documents(self, documents):
        """Embed each document and store it with its content as metadata."""
        for doc in documents:
            embedding = get_embedding(doc.content)
            self.db.upsert({
                "id": doc.id,
                "vector": embedding,
                "metadata": {"content": doc.content}
            })

    def query(self, question, top_k=3):
        """Answer *question* using the top_k most relevant indexed documents."""
        # Retrieve relevant context
        question_embedding = get_embedding(question)
        results = self.db.query(
            vector=question_embedding,
            top_k=top_k
        )
        context = "\n".join([r["metadata"]["content"]
                             for r in results["matches"]])
        # Generate answer: stuff the retrieved context into the prompt
        prompt = f"""Based on this context:
{context}
Answer this question: {question}
"""
        return self.llm.generate(prompt)
Choosing a Vector Database
# Decision guide
def choose_vector_db(use_case, scale, budget, *,
                     want_native_ml=False,
                     want_full_control=False,
                     want_serverless=False):
    """Pick a vector database for the given constraints.

    Args:
        use_case: free-form description of the workload (currently unused;
            kept for interface compatibility).
        scale: "small" | "medium" | "large".
        budget: "low" | "high" (anything else is treated as mid-range).
        want_native_ml / want_full_control / want_serverless: feature
            preferences, checked in the order shown below.

    Returns:
        The recommended database name, or None when no rule matches.

    BUG FIX: the original read `want_native_ml`, `want_full_control` and
    `want_serverless` as undefined global names, raising NameError on most
    paths; they are now keyword-only parameters with False defaults, and
    the implicit fall-through is an explicit `return None`.
    """
    if scale == "large" and budget == "high":
        return "Pinecone"  # Managed, scalable
    if scale == "medium" and want_native_ml:
        return "Weaviate"  # Rich features
    if scale == "small" or budget == "low":
        return "Chroma"  # Free, simple
    if want_full_control:
        return "Weaviate (self-hosted)"
    if want_serverless:
        return "Pinecone"
    return None  # no rule matched
Performance Optimization
# Practical tuning advice for vector-database workloads, keyed by technique.
optimization_tips = dict(
    batch_operations=dict(
        tip="Upsert in batches of 100-1000",
        benefit="Reduce API calls, faster indexing",
    ),
    dimension_reduction=dict(
        tip="Use PCA or SVD for high dimensions",
        benefit="Reduce memory, faster search",
    ),
    quantization=dict(
        tip="Use half-precision or product quantization",
        benefit="Smaller index, lower cost",
    ),
    filtering=dict(
        tip="Use pre-filtering for metadata",
        benefit="Accurate results, faster",
    ),
    caching=dict(
        tip="Cache frequently queried vectors",
        benefit="Lower latency",
    ),
)
Conclusion
Vector databases are essential for AI applications:
- Pinecone: Best for managed, scalable production workloads
- Weaviate: Best for rich features and self-hosted options
- Chroma: Best for prototyping and small-scale applications
Choose based on scale, budget, and feature requirements.
Comments