Introduction
PostgreSQL has emerged as a powerful platform for AI applications, particularly with the pgvector extension enabling vector similarity search. Combined with its existing features for JSON, full-text search, and reliability, PostgreSQL provides an excellent foundation for building AI-powered applications in 2026.
pgvector: Vector Similarity Search
Installation
# From source
git clone https://github.com/pgvector/pgvector.git
cd pgvector
make
make install  # may require root privileges (sudo) to write into the PostgreSQL lib dir
# Or use Docker with pgvector (preinstalled extension, PostgreSQL 17 base image)
docker run -d --name pgvector \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_DB=vector_db \
-p 5432:5432 \
pgvector/pgvector:pg17
Enable Extension
-- Enable pgvector (the extension binaries must already be installed on the server)
CREATE EXTENSION vector;
-- Verify installation
SELECT * FROM pg_extension WHERE extname = 'vector';
-- Check available functions (psql meta-command, not SQL)
\df *vector*
Creating and Managing Vector Data
Vector Data Types
-- Create table with vector column
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    title VARCHAR(500),
    content TEXT,
    embedding VECTOR(1536), -- OpenAI ada-002
    metadata JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Vector dimensions for different models:
-- OpenAI ada-002: 1536
-- OpenAI text-embedding-3-small: 1536
-- OpenAI text-embedding-3-large: 3072
-- sentence-transformers all-MiniLM-L6-v2: 384
-- sentence-transformers all-mpnet-base-v2: 768
-- Alternative: half precision (~50% storage; requires pgvector >= 0.7.0).
-- NOTE: VECTOR(n) is always single precision -- the half-precision type
-- is HALFVEC(n).
CREATE TABLE documents_half (
    id SERIAL PRIMARY KEY,
    embedding HALFVEC(1536) -- 2-byte floats instead of 4-byte
);
Inserting Vectors
-- Insert a single vector (text literal cast to vector)
-- NOTE(review): the documents table above declares VECTOR(1536); these
-- 5-dimensional literals are illustrative only and would be rejected by
-- that column -- real inserts must match the declared dimension.
INSERT INTO documents (title, content, embedding)
VALUES (
'PostgreSQL Tutorial',
'Learn PostgreSQL from basics to advanced',
'[0.1, 0.2, 0.3, 0.4, 0.5]'::vector
);
-- Insert multiple vectors in a single statement (fewer round trips)
INSERT INTO documents (title, content, embedding)
VALUES
('Redis Guide', 'In-memory data structure store', '[0.2, 0.1, 0.4, 0.3, 0.5]'::vector),
('MongoDB Basics', 'NoSQL document database', '[0.3, 0.2, 0.1, 0.5, 0.4]'::vector);
-- Generate embeddings programmatically
-- Using Python with OpenAI
Python: Generate and Store Embeddings
import psycopg2
import openai
# Connect to PostgreSQL
# NOTE(review): credentials are hard-coded for the example; in real
# deployments load them from the environment or a config/secret store.
conn = psycopg2.connect(
host="localhost",
database="vector_db",
user="postgres",
password="password"
)
# Generate embedding
def get_embedding(text):
    """Return the embedding vector (list of floats) for `text`.

    NOTE(review): this uses the legacy `openai.Embedding` interface
    (openai < 1.0); newer SDKs use `OpenAI().embeddings.create(...)` as in
    the RAGPipeline class below -- confirm the installed SDK version.
    """
    response = openai.Embedding.create(
        model="text-embedding-3-small",
        input=text
    )
    return response['data'][0]['embedding']
# Store document with embedding
def index_document(title, content, metadata=None):
    """Embed `content` and insert one row into documents; return the new id.

    Fixes vs. original: metadata is JSON-serialized first (psycopg2 cannot
    adapt a plain dict to jsonb without a wrapper), and the RETURNING row is
    fetched before the cursor is closed by the `with` block.
    """
    import json

    embedding = get_embedding(content)
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO documents (title, content, embedding, metadata)
            VALUES (%s, %s, %s::vector, %s)
            RETURNING id
        """, (title, content,
              str(embedding),  # pgvector accepts the '[x, y, ...]' text form
              json.dumps(metadata) if metadata is not None else None))
        new_id = cur.fetchone()[0]
    conn.commit()
    return new_id
# Batch index documents
def index_documents_batch(documents):
    """Embed and insert many documents inside one transaction.

    Each item is a dict with 'title', 'content' and optional 'metadata'.
    All rows are committed together at the end (all-or-nothing).
    """
    import json

    with conn.cursor() as cur:
        for doc in documents:
            embedding = get_embedding(doc['content'])
            metadata = doc.get('metadata')
            cur.execute("""
                INSERT INTO documents (title, content, embedding, metadata)
                VALUES (%s, %s, %s::vector, %s)
            """, (doc['title'], doc['content'],
                  str(embedding),  # pgvector '[x, y, ...]' text form
                  json.dumps(metadata) if metadata is not None else None))
    conn.commit()
Vector Indexes
Index Types
-- HNSW Index (Hierarchical Navigable Small World)
-- Better query speed/recall, but slower build and more memory than IVFFlat
CREATE INDEX idx_documents_embedding_hnsw
ON documents
USING HNSW (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- Parameters:
-- m: Number of connections per layer (default 16)
-- ef_construction: Search width during build (default 64)
-- Higher values = better recall, slower build, more memory
-- IVFFlat Index (Inverted File with Flat vectors)
-- Faster build and less memory, but slower/less accurate queries;
-- build it AFTER loading data, since the cluster centers are learned
-- from the rows present at build time
CREATE INDEX idx_documents_embedding_ivf
ON documents
USING IVFFlat (embedding vector_cosine_ops)
WITH (lists = 100);
-- Parameters:
-- lists: Number of clusters (pgvector suggests rows/1000 up to ~1M rows,
--        sqrt(rows) beyond that)
-- Higher values = better recall, slower build
Index Comparison
| Index Type | Build Time | Query Speed | Memory | Best For |
|---|---|---|---|---|
| HNSW | Slower | Very Fast | High | Real-time applications |
| IVFFlat | Fast | Fast | Medium | Large datasets, fast (re)builds |
| None (raw) | N/A | Slow (exact) | None | Small datasets, testing |
Index Best Practices
-- Choose distance metric -- the index operator class must match the
-- operator used at query time, or the index will not be used:
-- vector_cosine_ops: Cosine distance (recommended) -- pairs with <=>
-- vector_l2_ops: Euclidean distance -- pairs with <->
-- vector_ip_ops: Inner product -- pairs with <#>
-- Create index after inserting data (required for good IVFFlat clustering)
-- Partial index for filtered searches (only indexes matching rows)
CREATE INDEX idx_documents_embedding_active
ON documents USING HNSW (embedding vector_cosine_ops)
WHERE status = 'active';
-- Check index usage (look for an Index Scan on the HNSW/IVFFlat index)
EXPLAIN ANALYZE
SELECT * FROM documents
ORDER BY embedding <=> '[0.1,0.2,0.3]'::vector
LIMIT 5;
Similarity Search
Vector Search Queries
-- Cosine similarity (most common)
-- <=> returns cosine DISTANCE; 1 - distance converts it to similarity
SELECT id, title,
       1 - (embedding <=> $query_embedding) AS similarity
FROM documents
ORDER BY embedding <=> $query_embedding
LIMIT 5;
-- Euclidean distance (smaller = more similar)
SELECT id, title,
       embedding <-> $query_embedding AS distance
FROM documents
ORDER BY embedding <-> $query_embedding
LIMIT 5;
-- Inner product: <#> returns the NEGATIVE inner product (so ascending
-- order still puts the most similar rows first); negate the value to
-- report the true inner-product similarity
SELECT id, title,
       -(embedding <#> $query_embedding) AS similarity
FROM documents
ORDER BY embedding <#> $query_embedding
LIMIT 5;
Filtered Vector Search
-- Add filters to vector search
-- (filters are applied while scanning the vector index; heavy filtering
-- can reduce recall -- consider a partial index for hot filters)
SELECT id, title,
1 - (embedding <=> $query_embedding) AS similarity
FROM documents
WHERE status = 'published'
AND created_at > '2025-01-01'
ORDER BY embedding <=> $query_embedding
LIMIT 10;
-- Use pre-filter for small result sets
-- (the inner query restricts rows first; the outer ORDER BY similarity
-- cannot use the vector index, so this suits small filtered sets only)
SELECT * FROM (
SELECT id, title,
1 - (embedding <=> $query_embedding) AS similarity
FROM documents
WHERE category = 'tutorial'
) AS filtered
ORDER BY similarity DESC
LIMIT 5;
-- Post-filter with complex conditions
-- (LATERAL names the similarity expression once so it can be reused)
SELECT id, title, similarity
FROM documents,
LATERAL (
SELECT 1 - (embedding <=> $query_embedding) AS similarity
) AS sim
WHERE (metadata->>'author') = 'John'
ORDER BY similarity DESC
LIMIT 5;
RAG Pipeline Implementation
Complete RAG Architecture
from openai import OpenAI
import psycopg2
from sentence_transformers import SentenceTransformer
class RAGPipeline:
    """Retrieval-augmented generation over PostgreSQL/pgvector documents."""

    def __init__(self, db_conn, openai_key):
        self.conn = db_conn
        self.client = OpenAI(api_key=openai_key)
        # 384-dim embeddings; the documents.embedding column must match
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

    def generate_embedding(self, text):
        """Generate an embedding (list of floats) using sentence-transformers."""
        return self.embedding_model.encode(text).tolist()

    def retrieve_context(self, query, top_k=5, filters=None):
        """Retrieve the top_k most similar documents, optionally filtered.

        Security fix vs. original: metadata filter keys and the LIMIT are
        bound as query parameters instead of being interpolated into the
        SQL string (SQL injection risk).
        """
        # pgvector accepts the '[x, y, ...]' text form for vector input
        query_embedding = str(self.generate_embedding(query))
        sql = """
            SELECT id, title, content, metadata,
                   1 - (embedding <=> %s::vector) AS similarity
            FROM documents
            WHERE 1=1
        """
        params = [query_embedding]
        if filters:
            for key, value in filters.items():
                # ->> takes its key as a text argument, so it can be bound
                sql += " AND metadata->>%s = %s"
                params.extend([key, value])
        sql += " ORDER BY embedding <=> %s::vector LIMIT %s"
        params.extend([query_embedding, top_k])
        with self.conn.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()

    def generate_response(self, query, context_documents):
        """Generate an answer grounded in the retrieved documents."""
        # doc[2] is the content column (see retrieve_context SELECT list)
        context = "\n\n".join(
            f"Document {i+1}: {doc[2]}"
            for i, doc in enumerate(context_documents)
        )
        prompt = f"""Based on the following context, answer the question.
Context:
{context}
Question: {query}
Answer:"""
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content

    def rag_query(self, query, top_k=5, filters=None):
        """Complete RAG round trip: retrieve context, then generate."""
        docs = self.retrieve_context(query, top_k, filters)
        answer = self.generate_response(query, docs)
        return {
            'answer': answer,
            'sources': [
                # row layout: (id, title, content, metadata, similarity)
                {'id': d[0], 'title': d[1], 'content': d[2][:200],
                 'similarity': float(d[4])}
                for d in docs
            ]
        }
RAG Pipeline Usage
# Initialize pipeline
rag = RAGPipeline(conn, openai_key="sk-...")

# Documents to index
documents = [
    {
        'title': 'PostgreSQL Basics',
        'content': 'PostgreSQL is a powerful, open source object-relational database...',
        'metadata': {'category': 'database', 'author': 'John'}
    },
    {
        'title': 'Advanced SQL',
        'content': 'Learn advanced SQL techniques including window functions...',
        'metadata': {'category': 'database', 'author': 'Jane'}
    }
]

# Index documents
import json

for doc in documents:
    embedding = rag.generate_embedding(doc['content'])
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO documents (title, content, embedding, metadata)
            VALUES (%s, %s, %s::vector, %s)
        """, (doc['title'], doc['content'],
              str(embedding),  # pgvector '[x, y, ...]' text form
              # psycopg2 cannot adapt a plain dict to jsonb; serialize first
              json.dumps(doc['metadata'])))
# Commit once for the whole batch (all-or-nothing)
conn.commit()

# Query with RAG
result = rag.rag_query(
    query="What is PostgreSQL?",
    top_k=3,
    filters={'category': 'database'}
)
print(result['answer'])
print("Sources:", result['sources'])
Hybrid Search
Combining Vector and Full-Text Search
-- Create full-text search index
-- COALESCE added: with a NULL title, `title || ' ' || content` would make
-- the whole generated column NULL and the row would never match FTS
ALTER TABLE documents ADD COLUMN search_vector tsvector
    GENERATED ALWAYS AS (
        to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(content, ''))
    ) STORED;
CREATE INDEX idx_documents_search ON documents USING GIN (search_vector);
-- Hybrid search with weighted ranking: 70% vector similarity + 30% FTS rank
-- (parentheses fixed -- the original expression was unbalanced)
SELECT
    id,
    title,
    (
        0.7 * (1 - (embedding <=> $query_embedding)) +
        0.3 * ts_rank(search_vector, query_fts)
    ) AS combined_rank
FROM documents,
     to_tsquery('english', 'postgresql') AS query_fts
WHERE search_vector @@ query_fts
   OR (embedding <=> $query_embedding) < 0.5
ORDER BY combined_rank DESC
LIMIT 10;
Python Hybrid Search
def hybrid_search(query, top_k=10):
    """Combine full-text rank and vector similarity with 50/50 weighting.

    Fixes vs. original: the unused `fts_query` local is removed, and the
    ranking expression moved into a subquery -- Postgres does not allow an
    output-column alias (fts_rank) inside an ORDER BY *expression*.
    """
    # pgvector accepts the '[x, y, ...]' text form for vector input
    query_embedding = str(get_embedding(query))
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, title, content, fts_rank, vector_rank
            FROM (
                SELECT
                    id, title, content,
                    ts_rank(to_tsvector('english', title || ' ' || content),
                            plainto_tsquery('english', %s)) AS fts_rank,
                    1 - (embedding <=> %s::vector) AS vector_rank
                FROM documents
                WHERE to_tsvector('english', title || ' ' || content)
                      @@ plainto_tsquery('english', %s)
                   OR embedding <=> %s::vector < 0.8
            ) AS ranked
            ORDER BY (0.5 * fts_rank + 0.5 * vector_rank) DESC
            LIMIT %s
        """, (query, query_embedding, query, query_embedding, top_k))
        return cur.fetchall()
Session Management for LLM Applications
class LLMSessionManager:
    """Persist multi-turn chat history in the chat_sessions table."""

    def __init__(self, conn):
        self.conn = conn

    @staticmethod
    def _build_message(role, content):
        """Return the single-message list to append, stamped with UTC time.

        (The original called an undefined `now()`; datetime is used here.)
        """
        from datetime import datetime, timezone
        return [{
            "role": role,
            "content": content,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }]

    def create_session(self, user_id):
        """Create a new chat session for user_id; return its session_id."""
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO chat_sessions (user_id, messages)
                VALUES (%s, '[]'::jsonb)
                RETURNING session_id
            """, (user_id,))
            # fetch before the cursor is closed by the `with` block
            session_id = cur.fetchone()[0]
        self.conn.commit()
        return session_id

    def add_message(self, session_id, role, content):
        """Append one message to the session's JSONB message array."""
        import json
        message = self._build_message(role, content)
        with self.conn.cursor() as cur:
            cur.execute("""
                UPDATE chat_sessions
                SET messages = messages || %s::jsonb,
                    updated_at = NOW()
                WHERE session_id = %s
            """, (json.dumps(message), session_id))  # dicts must be serialized for jsonb
        self.conn.commit()

    def get_context(self, session_id, max_tokens=4000):
        """Return recent conversation messages for the session.

        NOTE(review): max_tokens is currently unused; the window is a fixed
        20-message tail -- a real token budget would need a tokenizer.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT messages
                FROM chat_sessions
                WHERE session_id = %s
            """, (session_id,))
            result = cur.fetchone()
        if not result:
            return []
        return result[0][-20:]

    def get_recent_sessions(self, user_id, limit=10):
        """Return the user's most recently updated sessions."""
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT session_id, created_at, messages
                FROM chat_sessions
                WHERE user_id = %s
                ORDER BY updated_at DESC
                LIMIT %s
            """, (user_id, limit))
            return cur.fetchall()
# Create session table
# NOTE(review): bare string used as inline DDL documentation (never
# executed); run this once before using LLMSessionManager. It references
# a users(id) table that must already exist.
"""
CREATE TABLE chat_sessions (
session_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES users(id),
messages JSONB DEFAULT '[]',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""
Feature Store for ML
Storing Precomputed Features
class FeatureStore:
    """Read/write precomputed ML features stored as JSONB in ml_features."""

    def __init__(self, conn):
        self.conn = conn

    @staticmethod
    def _as_dict(value):
        """Decode a jsonb column value defensively.

        psycopg2 typically returns jsonb columns already parsed into
        dict/list, in which case json.loads would raise -- only parse when
        the driver hands back raw text/bytes.
        """
        import json
        if isinstance(value, (str, bytes, bytearray)):
            return json.loads(value)
        return value

    def store_features(self, entity_type, entity_id, features):
        """Upsert the features dict for (entity_type, entity_id).

        (The original did `import jsonb`, which is not a real module and
        raised ImportError; features are serialized with json instead.)
        """
        import json
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO ml_features (entity_type, entity_id, features)
                VALUES (%s, %s, %s)
                ON CONFLICT (entity_type, entity_id)
                DO UPDATE SET features = EXCLUDED.features,
                              updated_at = NOW()
            """, (entity_type, entity_id, json.dumps(features)))
        self.conn.commit()

    def get_features(self, entity_type, entity_id):
        """Retrieve the features for one entity, or None if absent."""
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT features
                FROM ml_features
                WHERE entity_type = %s AND entity_id = %s
            """, (entity_type, entity_id))
            result = cur.fetchone()
        return self._as_dict(result[0]) if result else None

    def batch_get(self, entity_type, entity_ids):
        """Batch retrieve features; returns {entity_id: features}."""
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT entity_id, features
                FROM ml_features
                WHERE entity_type = %s AND entity_id = ANY(%s)
            """, (entity_type, list(entity_ids)))  # ANY(%s) needs a list
            return {row[0]: self._as_dict(row[1]) for row in cur.fetchall()}
# Create feature store table
# NOTE(review): UNIQUE(entity_type, entity_id) already creates a backing
# index, so idx_ml_features_entity below is redundant -- confirm before
# keeping both. Bare string used as inline DDL documentation (never executed).
"""
CREATE TABLE ml_features (
id SERIAL PRIMARY KEY,
entity_type VARCHAR(50) NOT NULL,
entity_id BIGINT NOT NULL,
features JSONB NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(entity_type, entity_id)
);
CREATE INDEX idx_ml_features_entity ON ml_features (entity_type, entity_id);
"""
Semantic Caching
class SemanticCache:
    """Cache LLM responses keyed by semantic (embedding) similarity."""

    def __init__(self, conn, ttl=3600):
        self.conn = conn
        # TTL in seconds; entries older than this are ignored on lookup.
        # (The original stored ttl but never used it.)
        self.ttl = ttl

    def get_cached_response(self, query_embedding, threshold=0.95):
        """Return (response, similarity) for the closest cached query with
        cosine similarity >= threshold and age <= ttl, else None.
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT response, 1 - (query_embedding <=> %s) AS similarity
                FROM query_cache
                WHERE query_embedding <=> %s < %s
                  AND created_at > NOW() - make_interval(secs => %s)
                ORDER BY query_embedding <=> %s
                LIMIT 1
            """, (query_embedding, query_embedding, 1 - threshold,
                  self.ttl, query_embedding))
            return cur.fetchone()

    def cache_response(self, query, query_embedding, response):
        """Insert a query/response pair into the cache."""
        with self.conn.cursor() as cur:
            cur.execute("""
                INSERT INTO query_cache (query, query_embedding, response)
                VALUES (%s, %s, %s)
            """, (query, query_embedding, response))
        self.conn.commit()
# Create cache table
# NOTE(review): bare string used as inline DDL documentation (never
# executed). Expired rows are only filtered at read time; add a periodic
# DELETE (or pg_cron job) on created_at to actually reclaim space.
"""
CREATE TABLE query_cache (
id SERIAL PRIMARY KEY,
query TEXT NOT NULL,
query_embedding VECTOR(1536),
response TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_query_cache_embedding
ON query_cache
USING HNSW (query_embedding vector_cosine_ops);
"""
Best Practices
Performance Optimization
-- 1. Use appropriate index type
-- HNSW: best query speed/recall (slower build, more memory)
-- IVFFlat: faster build, good for very large datasets
-- 2. Batch inserts (one multi-row statement beats row-by-row round trips)
INSERT INTO documents (title, content, embedding)
SELECT title, content, embedding FROM temp_table;
-- 3. Monitor query performance (confirm the vector index is actually used)
EXPLAIN ANALYZE
SELECT * FROM documents
ORDER BY embedding <=> $embedding
LIMIT 10;
-- 4. Partition large tables
-- NOTE(review): the partition key (created_at) must be declared among the
-- columns (elided here as "-- columns") and included in any PRIMARY
-- KEY/UNIQUE constraint on a partitioned table.
CREATE TABLE documents (
id SERIAL,
embedding VECTOR(1536),
-- columns
) PARTITION BY RANGE (created_at);
-- 5. Use connection pooling
-- PgBouncer for better performance
Security
-- 1. Enable row-level security
ALTER TABLE documents ENABLE ROW LEVEL SECURITY;
-- 2. Create policies
-- NOTE(review): current_user_id() is not a PostgreSQL built-in -- it must
-- be an application-defined function (e.g. reading a session setting).
CREATE POLICY document_access_policy ON documents
FOR SELECT USING (owner_id = current_user_id());
-- 3. Use separate users for different access levels
CREATE ROLE app_readonly;
GRANT SELECT ON documents TO app_readonly;
-- 4. Encrypt sensitive data
-- Use pgcrypto extension
CREATE EXTENSION pgcrypto;
-- Encrypt before storing. (The original mixed a driver placeholder (%s)
-- into raw SQL, which is not valid SQL; bind real values from the client.)
INSERT INTO documents (content, encrypted_embedding)
VALUES ('sensitive', pgp_sym_encrypt('embedding-data', 'key'));
Resources
Conclusion
PostgreSQL with pgvector provides a powerful, cost-effective platform for AI applications. From vector similarity search to complete RAG pipelines, session management to feature stores, PostgreSQL can handle the data requirements of modern AI applications while leveraging its existing reliability and ecosystem.
In the final article, we’ll explore real-world production use cases for PostgreSQL across different industries and application types.
Comments