
Vector Databases: Pinecone, Weaviate, Chroma, and Beyond

Vector databases have emerged as critical infrastructure for AI applications. They enable efficient similarity search over high-dimensional embeddings, powering everything from semantic search to recommendation systems.

In this guide, we'll look at what vector databases are, how they work, and how the leading solutions compare.

Understanding Vector Embeddings

What Are Embeddings?

# Text embeddings convert text to numerical vectors
# that capture semantic meaning

# Example: Word2Vec style embeddings
word_embeddings = {
    "king": [0.9, 0.1, 0.3, ...],
    "queen": [0.85, 0.12, 0.28, ...],
    "man": [0.7, 0.2, 0.4, ...],
    "woman": [0.68, 0.22, 0.38, ...],
    "apple": [0.1, 0.8, 0.2, ...],
    "orange": [0.12, 0.78, 0.18, ...]
}

# Similar words have similar vectors
# king - man + woman ≈ queen
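
To make the analogy concrete, here is a minimal sketch with toy 3-dimensional vectors (the values are illustrative, not from a trained model):

import numpy as np

# Toy vectors; real embeddings have hundreds of dimensions
king  = np.array([0.9, 0.1, 0.3])
queen = np.array([0.85, 0.12, 0.28])
man   = np.array([0.7, 0.2, 0.4])
woman = np.array([0.68, 0.22, 0.38])

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# The analogy vector lands very close to "queen"
analogy = king - man + woman
print(cosine(analogy, queen))  # ≈ 1.0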

Creating Embeddings

# Using OpenAI's text-embedding-ada-002
# (legacy openai<1.0 SDK interface)
import openai

def get_embedding(text):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response['data'][0]['embedding']

# Example embeddings
texts = [
    "The cat sat on the mat",
    "A feline resting on a rug",
    "The dog played in the park",
    "Machine learning is AI"
]

embeddings = [get_embedding(text) for text in texts]
print(f"Embedding dimension: {len(embeddings[0])}")  # 1536

# Similarity calculation
import numpy as np

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# "cat" and "feline" should be similar
print(cosine_similarity(embeddings[0], embeddings[1]))  # High similarity
print(cosine_similarity(embeddings[0], embeddings[2]))  # Lower similarity

Vector Database Concepts

Similarity Metrics

# Different distance/similarity metrics

similarity_metrics = {
    "cosine_similarity": {
        "description": "Angle between vectors",
        "best_for": "Text embeddings, varying lengths",
        "formula": "cos(θ) = (A·B) / (||A|| × ||B||)"
    },
    
    "euclidean_distance": {
        "description": "Straight-line distance",
        "best_for": "Dense numerical data",
        "formula": "√(Σ(Ai - Bi)²)"
    },
    
    "dot_product": {
        "description": "Projection of one vector onto another",
        "best_for": "Normalized embeddings",
        "formula": "Σ(Ai × Bi)"
    },
    
    "manhattan_distance": {
        "description": "Sum of absolute differences",
        "best_for": "High-dimensional sparse data",
        "formula": "Σ|Ai - Bi|"
    }
}
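
All four metrics are one-liners in numpy; a self-contained sketch with toy vectors:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 3.0, 4.0])

cosine    = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
euclidean = np.linalg.norm(a - b)    # √(Σ(Ai - Bi)²)
dot       = np.dot(a, b)             # Σ(Ai × Bi)
manhattan = np.sum(np.abs(a - b))    # Σ|Ai - Bi|

print(cosine, euclidean, dot, manhattan)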

Indexing Methods

# Vector indexing for fast similarity search

indexing_methods = {
    "brute_force": {
        "description": "Compare against all vectors",
        "complexity": "O(n ร— d)",
        "accuracy": "100%",
        "use_when": "Small datasets (<10K)"
    },
    
    "hnsw": {
        "description": "Hierarchical Navigable Small World",
        "complexity": "O(log n)",
        "accuracy": "~95-99%",
        "use_when": "General purpose, balanced"
    },
    
    "ivf": {
        "description": "Inverted File Index",
        "complexity": "O(log n)",
        "accuracy": "~90-95%",
        "use_when": "Very large datasets"
    },
    
    "pq": {
        "description": "Product Quantization",
        "complexity": "O(1) lookup",
        "accuracy": "~80-90%",
        "use_when": "Memory constrained"
    }
}
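
For intuition, this is the brute-force baseline that the ANN indexes above are designed to avoid; a minimal numpy sketch of the O(n × d) full scan:

import numpy as np

def brute_force_knn(query, vectors, k=5):
    # Cosine-score the query against every stored vector: O(n × d)
    sims = vectors @ query / (
        np.linalg.norm(vectors, axis=1) * np.linalg.norm(query)
    )
    top = np.argsort(-sims)[:k]  # indices of the k most similar vectors
    return top, sims[top]

# Usage: 10K random 1536-d vectors
vectors = np.random.rand(10_000, 1536)
ids, scores = brute_force_knn(np.random.rand(1536), vectors, k=3)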

Pinecone

Overview

# Pinecone Characteristics

type: "Managed vector database"
deployment: "Cloud (AWS, GCP, Azure)"
pricing: "Free tier, then pay-per-use"
index_type: "Proprietary ANN index"
scalability: "Serverless, auto-scaling"

Python Client

import pinecone

# Initialize (pinecone-client v2 API; newer releases use pinecone.Pinecone)
pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp")

# Create index
pinecone.create_index(
    name="semantic-search",
    dimension=1536,
    metric="cosine",
    pod_type="p1"
)

# Connect to index
index = pinecone.Index("semantic-search")

# Upsert vectors
vectors = [
    {"id": "vec1", "values": [0.1] * 1536, "metadata": {"text": "Hello world"}},
    {"id": "vec2", "values": [0.2] * 1536, "metadata": {"text": "Goodbye world"}},
    {"id": "vec3", "values": [0.3] * 1536, "metadata": {"text": "Hello there"}}
]

index.upsert(vectors)

# Query
query_vector = [0.15] * 1536
results = index.query(
    vector=query_vector,
    top_k=2,
    include_metadata=True
)

print(results)
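
# Metadata filtering at query time (a sketch; $eq is Pinecone's
# equality filter operator, applied to the "text" metadata field
# upserted above)
results = index.query(
    vector=query_vector,
    top_k=2,
    filter={"text": {"$eq": "Hello world"}},
    include_metadata=True
)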

# Delete
index.delete(ids=["vec1"])

Use Cases

pinecone_use_cases = {
    "semantic_search": {
        "description": "Search by meaning, not keywords",
        "example": "E-commerce product search"
    },
    
    "recommendations": {
        "description": "Find similar items",
        "example": "Movie recommendations"
    },
    
    "anomaly_detection": {
        "description": "Find unusual patterns",
        "example": "Fraud detection"
    },
    
    "classification": {
        "description": "KNN-based classification",
        "example": "Document categorization"
    }
}
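
The classification use case, for instance, is just a query plus a majority vote; a sketch assuming each stored vector carries a "label" in its metadata (an assumption, not part of the earlier examples):

def knn_classify(index, embedding, k=5):
    # Fetch the k nearest labeled neighbors
    matches = index.query(
        vector=embedding, top_k=k, include_metadata=True
    )["matches"]
    # Majority vote over their labels
    labels = [m["metadata"]["label"] for m in matches]
    return max(set(labels), key=labels.count)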

Weaviate

Overview

# Weaviate Characteristics

type: "Open-source vector database"
deployment: "Self-hosted, cloud (Weaviate Cloud)"
index_type: "Graph-based, HNSW"
features: "Native ML models, filters, CRUD"

Python Client

import weaviate

# Connect to local Weaviate (v3 client)
client = weaviate.Client(
    url="http://localhost:8080",
    additional_headers={
        "X-OpenAI-Api-Key": "YOUR_OPENAI_KEY"
    }
)

# Define schema (text2vec-openai matches the OpenAI key passed above)
schema = {
    "class": "Article",
    "description": "News articles",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "vectorizeClassName": False
        }
    },
    "properties": [
        {"name": "title", "dataType": ["text"]},
        {"name": "content", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]}
    ]
}

client.schema.create_class(schema)

# Add data
data_object = {
    "title": "AI Revolution",
    "content": "Machine learning is transforming...",
    "category": "technology"
}

client.data_object.create(
    data_object,
    class_name="Article"
)

# Search
result = client.query.get(
    "Article",
    ["title", "content", "category"]
).with_near_text({
    "concepts": ["artificial intelligence"]
}).with_limit(5).do()

print(result)

# With custom vectors
client.data_object.create(
    data_object={"title": "Custom"},
    class_name="Article",
    vector=[0.1] * 1536  # Custom embedding
)

# Weaviate supports hybrid search (BM25 + vector)

result = client.query.get(
    "Article",
    ["title", "content"]
).with_hybrid(
    query="artificial intelligence",
    alpha=0.5  # 0 = keyword, 1 = vector
).with_limit(5).do()
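
Filtered vector search uses the same query builder; a sketch with the v3 client's with_where filter:

# Vector search restricted to one category
result = client.query.get(
    "Article",
    ["title", "content"]
).with_near_text({
    "concepts": ["artificial intelligence"]
}).with_where({
    "path": ["category"],
    "operator": "Equal",
    "valueText": "technology"
}).with_limit(5).do()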

Chroma

Overview

# Chroma Characteristics

type: "Open-source, embedded"
deployment: "In-memory, local, client-side"
index_type: "HNSW (via hnswlib)"
features: "Simple API, lightweight"

Python Client

import chromadb
from chromadb.config import Settings

# Initialize (in-memory)
client = chromadb.Client(Settings(
    anonymized_telemetry=False,
    allow_reset=True
))

# Create collection
collection = client.create_collection(
    name="documents",
    metadata={"hnsw:space": "cosine"}
)

# Add documents
collection.add(
    documents=[
        "The cat sat on the mat",
        "A feline resting on a rug",
        "The dog played in the park",
        "Machine learning is AI"
    ],
    ids=["doc1", "doc2", "doc3", "doc4"],
    metadatas=[
        {"source": "book1", "category": "animals"},
        {"source": "book1", "category": "animals"},
        {"source": "book2", "category": "animals"},
        {"source": "book3", "category": "tech"}
    ]
)

# Query
results = collection.query(
    query_texts=["cat resting"],
    n_results=2,
    where={"category": "animals"},  # Filter
    include=["documents", "distances"]
)

print(results)

# Query by vector (the dimension must match the collection's embeddings;
# Chroma's default model, all-MiniLM-L6-v2, produces 384-d vectors)
results = collection.query(
    query_embeddings=[[0.1] * 384],
    n_results=2
)
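
Collections also support in-place updates and deletes; a brief sketch:

# Update metadata for an existing document
collection.update(
    ids=["doc1"],
    metadatas=[{"source": "book1", "category": "pets"}]
)

# Delete by id, or by metadata filter
collection.delete(ids=["doc4"])
collection.delete(where={"category": "tech"})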

Persistence

# Persistent storage
client = chromadb.PersistentClient(
    path="./chroma_data"
)

# With embedding function
from chromadb.utils import embedding_functions

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="YOUR_KEY",
    model_name="text-embedding-ada-002"
)

collection = client.create_collection(
    name="documents",
    embedding_function=openai_ef
)

Comparison Matrix

Feature          Pinecone        Weaviate              Chroma
Type             Managed         Open-source           Open-source
Deployment       Cloud only      Self-hosted + Cloud   Embedded
Scalability      Auto-scale      Horizontal            Limited
Filtering        Yes             Yes                   Yes
Hybrid Search    Yes             Yes (BM25)            Limited
APIs             REST, Python    GraphQL, REST         Python
Free Tier        Yes (small)     Yes (self-hosted)     Yes
ML Integration   Limited         Native                Via embedding fn

Use Case Examples

Semantic Search

# Build semantic search with Pinecone

def semantic_search(query, index, top_k=5):
    # Get embedding
    query_embedding = get_embedding(query)
    
    # Search
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    
    return [
        # Pinecone's cosine score is already a similarity (higher = closer)
        {"text": r["metadata"]["text"], "score": r["score"]}
        for r in results["matches"]
    ]

# Example
results = semantic_search("Find articles about AI", index)
for r in results:
    print(f"{r['score']:.2f}: {r['text']}")

Recommendation System

# Product recommendations using vector similarity

def recommend_products(product_id, index, user_history=None):
    user_history = set(user_history or [])  # guard against None
    
    # Get product embedding (get_product is an application-level helper)
    product = get_product(product_id)
    
    # Find similar products
    similar = index.query(
        vector=product.embedding,
        top_k=10,
        include_metadata=True
    )
    
    # Filter out already purchased
    recommendations = [
        r for r in similar["matches"]
        if r["id"] not in user_history
    ]
    
    return recommendations

# Collaborative filtering with vectors
def collaborative_recommend(user_id, user_embeddings, user_preferences, index):
    # User's preference vector
    user_pref = user_embeddings[user_id]
    
    # Find products matching the user's preferred categories
    results = index.query(
        vector=user_pref,
        top_k=20,
        filter={"category": {"$in": user_preferences[user_id]}}
    )
    
    return results

RAG (Retrieval-Augmented Generation)

# RAG with vector database and LLM

class RAGSystem:
    def __init__(self, vector_db, llm):
        self.db = vector_db
        self.llm = llm
    
    def index_documents(self, documents):
        for doc in documents:
            embedding = get_embedding(doc.content)
            self.db.upsert({
                "id": doc.id,
                "vector": embedding,
                "metadata": {"content": doc.content}
            })
    
    def query(self, question, top_k=3):
        # Retrieve relevant context
        question_embedding = get_embedding(question)
        results = self.db.query(
            vector=question_embedding,
            top_k=top_k
        )
        
        context = "\n".join([r["metadata"]["content"] 
                           for r in results["matches"]])
        
        # Generate answer
        prompt = f"""Based on this context:
{context}

Answer this question: {question}
"""
        
        return self.llm.generate(prompt)
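
A hypothetical usage sketch; my_vector_db, my_llm, and my_documents are placeholders for any objects exposing the upsert/query/generate shapes used above:

# Wire the pieces together (all names are placeholders)
rag = RAGSystem(vector_db=my_vector_db, llm=my_llm)
rag.index_documents(my_documents)  # each doc needs .id and .content
answer = rag.query("What is a vector database?")
print(answer)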

Choosing a Vector Database

# Decision guide

def choose_vector_db(scale, budget, want_native_ml=False,
                     want_full_control=False, want_serverless=False):
    if want_serverless or (scale == "large" and budget == "high"):
        return "Pinecone"  # Managed, serverless, auto-scaling
    
    if want_full_control:
        return "Weaviate (self-hosted)"
    
    if scale == "medium" and want_native_ml:
        return "Weaviate"  # Rich features, native vectorizers
    
    if scale == "small" or budget == "low":
        return "Chroma"  # Free, simple
    
    return "Weaviate"  # Balanced open-source default

Performance Optimization

# Optimization tips

optimization_tips = {
    "batch_operations": {
        "tip": "Upsert in batches of 100-1000",
        "benefit": "Reduce API calls, faster indexing"
    },
    
    "dimension_reduction": {
        "tip": "Use PCA or SVD for high dimensions",
        "benefit": "Reduce memory, faster search"
    },
    
    "quantization": {
        "tip": "Use half-precision or product quantization",
        "benefit": "Smaller index, lower cost"
    },
    
    "filtering": {
        "tip": "Use pre-filtering for metadata",
        "benefit": "Accurate results, faster"
    },
    
    "caching": {
        "tip": "Cache frequently queried vectors",
        "benefit": "Lower latency"
    }
}
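
As a concrete example of the first tip, a minimal batching helper for any Pinecone-style index (a sketch, not library code):

def batched_upsert(index, vectors, batch_size=100):
    # Send vectors in chunks instead of one API call per vector
    for i in range(0, len(vectors), batch_size):
        index.upsert(vectors[i:i + batch_size])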

Conclusion

Vector databases are essential for AI applications:

  • Pinecone: Best for managed, scalable production workloads
  • Weaviate: Best for rich features and self-hosted options
  • Chroma: Best for prototyping and small-scale applications

Choose based on scale, budget, and feature requirements.

