Retrieval-Augmented Generation (RAG): Combining Search and Generation
RAG systems combine document retrieval with LLM generation to produce accurate, grounded answers. This guide covers building RAG applications, from a minimal prototype to a production service.
RAG Fundamentals
Why RAG?
# LLM limitations
# - Hallucination: may generate plausible but false information
# - Knowledge cutoff: training data ends at a fixed date
# - No access to real-time information
# - Cannot cite sources for its claims

# RAG benefits
# - Grounded answers: based on retrieved documents
# - Up-to-date information: can index recent documents
# - Verifiable: answers can cite their sources
# - Reduced hallucination: generation is constrained by the retrieved context
RAG Architecture
# RAG Pipeline:
# 1. Query: user asks a question
# 2. Retrieval: find relevant documents
# 3. Ranking: rank documents by relevance
# 4. Context: combine the top documents into a prompt
# 5. Generation: LLM generates an answer using the context
# 6. Output: return the answer with its sources
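Before reaching for libraries, it helps to see the whole pipeline as code. The sketch below is schematic only: index.retrieve, doc.score, doc.text, and llm.ask are placeholder names for illustration, not any particular library's API.

def rag_pipeline(question, index, llm, k=3):
    """Schematic RAG pipeline mirroring steps 1-6 above (placeholder APIs)."""
    candidates = index.retrieve(question)                    # 2. retrieval
    ranked = sorted(candidates, key=lambda d: d.score,       # 3. ranking
                    reverse=True)
    context = "\n\n".join(d.text for d in ranked[:k])        # 4. context
    prompt = f"Context:\n{context}\n\nQuestion: {question}"  # 5. generation
    return llm.ask(prompt), ranked[:k]                       # 6. answer + sources

The sections that follow implement each of these stages with real libraries.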
Building RAG Systems
Simple RAG with LangChain
# Classic LangChain import layout; recent releases move these into the
# langchain_community and langchain_openai packages
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
# Load documents
loader = TextLoader('document.txt')
documents = loader.load()
# Split documents
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(documents)
# Create embeddings and vector store
embeddings = OpenAIEmbeddings()
vector_store = FAISS.from_documents(docs, embeddings)
# Create retriever
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
# Create QA chain
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # "stuff" packs all retrieved docs into a single prompt
    retriever=retriever,
    return_source_documents=True
)
# Query
query = "What is machine learning?"
result = qa_chain({"query": query})
print(f"Answer: {result['result']}")
print(f"Sources: {result['source_documents']}")
Custom RAG Implementation
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai import OpenAI
class SimpleRAG:
    def __init__(self, documents):
        self.documents = documents
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = self.model.encode(documents)
        self.client = OpenAI()

    def retrieve(self, query, k=3):
        """Retrieve the top-k most similar documents"""
        query_embedding = self.model.encode(query)
        similarities = cosine_similarity([query_embedding], self.embeddings)[0]
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [self.documents[i] for i in top_indices]

    def generate(self, query):
        """Generate an answer grounded in the retrieved documents"""
        # Retrieve relevant documents
        relevant_docs = self.retrieve(query)
        context = "\n".join(relevant_docs)

        # Create prompt
        prompt = f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer:"""

        # Generate answer
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    def query(self, question):
        """Full RAG pipeline: retrieve, then generate"""
        return self.generate(question)
# Usage
documents = [
    "Machine learning is a subset of AI that enables systems to learn from data.",
    "Deep learning uses neural networks with multiple layers.",
    "Natural language processing deals with text and language."
]
rag = SimpleRAG(documents)
answer = rag.query("What is machine learning?")
print(answer)
Vector Databases
Using Pinecone
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
# Initialize Pinecone (classic pinecone-client v2 API; newer clients
# use the Pinecone class instead of pinecone.init)
pinecone.init(api_key="your-api-key", environment="us-west1-gcp")

# Create index if it does not exist (1536 = OpenAI embedding dimension)
index_name = "rag-index"
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536)
# Create embeddings
embeddings = OpenAIEmbeddings()
# Create vector store
vector_store = Pinecone.from_documents(
    documents,
    embeddings,
    index_name=index_name
)
# Query
retriever = vector_store.as_retriever()
results = retriever.get_relevant_documents("machine learning")
Using Weaviate
import weaviate
from langchain.vectorstores import Weaviate
from langchain.embeddings import OpenAIEmbeddings
# Connect to Weaviate
client = weaviate.Client("http://localhost:8080")
# Create embeddings
embeddings = OpenAIEmbeddings()
# Create vector store
vector_store = Weaviate.from_documents(
    documents,
    embeddings,
    client=client,
    by_text=False
)
# Query
retriever = vector_store.as_retriever()
results = retriever.get_relevant_documents("machine learning")
Using Chroma
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
# Create embeddings
embeddings = OpenAIEmbeddings()
# Create vector store (persists to disk)
vector_store = Chroma.from_documents(
    documents,
    embeddings,
    persist_directory="./chroma_db"
)

# Load existing vector store
vector_store = Chroma(
    persist_directory="./chroma_db",
    embedding_function=embeddings
)
# Query
retriever = vector_store.as_retriever(search_kwargs={"k": 5})
results = retriever.get_relevant_documents("machine learning")
Advanced RAG Techniques
Hybrid Search (BM25 + Semantic)
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
# Create BM25 retriever
bm25_retriever = BM25Retriever.from_documents(documents)
# Create semantic retriever
embeddings = OpenAIEmbeddings()
faiss_vectorstore = FAISS.from_documents(documents, embeddings)
semantic_retriever = faiss_vectorstore.as_retriever()
# Combine retrievers
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, semantic_retriever],
    weights=[0.5, 0.5]  # equal weight to keyword and semantic results
)
# Query
results = ensemble_retriever.get_relevant_documents("machine learning")
Multi-Query Retrieval
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.llms import OpenAI
# Create multi-query retriever
llm = OpenAI(temperature=0)
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(),
    llm=llm
)
# Query (generates multiple queries internally)
results = retriever.get_relevant_documents("machine learning")
Contextual Compression
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.llms import OpenAI

# Create compressor (LLMChainExtractor uses the LLM to keep only the
# parts of each document that are relevant to the query)
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

# Create compression retriever
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vector_store.as_retriever()
)

# Query (returns compressed, relevant documents)
results = compression_retriever.get_relevant_documents("machine learning")
RAG Evaluation
from datasets import load_dataset
from langchain.evaluation import QAEvalChain
from langchain.llms import OpenAI
# Load evaluation dataset
eval_dataset = load_dataset('squad', split='validation[:100]')
# Create evaluation chain
llm = OpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)
# Build examples in the flat format QAEvalChain expects
# (SQuAD stores answers as a nested dict, so take the first answer text)
examples = [
    {"question": ex['question'], "answer": ex['answers']['text'][0]}
    for ex in eval_dataset
]

# Run the RAG chain on each question
predictions = [
    {"result": qa_chain({"query": ex["question"]})['result']}
    for ex in examples
]

# Grade predictions; each graded item is a dict whose 'results' value
# is a grade string such as "CORRECT" or "INCORRECT"
graded = eval_chain.evaluate(
    examples,
    predictions,
    question_key="question",
    answer_key="answer",
    prediction_key="result"
)
accuracy = sum(g['results'].strip() == "CORRECT" for g in graded) / len(graded)
print(f"Accuracy: {accuracy:.4f}")
Production RAG System
from fastapi import FastAPI
from pydantic import BaseModel
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
app = FastAPI()

# Load RAG system at startup (assumes an index previously saved with
# vector_store.save_local("./vector_store"))
embeddings = OpenAIEmbeddings()
vector_store = FAISS.load_local("./vector_store", embeddings)
retriever = vector_store.as_retriever()

llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True  # needed so the endpoint can return sources
)

class Query(BaseModel):
    question: str

@app.post("/query")
async def query(q: Query):
    result = qa_chain({"query": q.question})
    return {
        "answer": result['result'],
        "sources": [doc.metadata for doc in result['source_documents']]
    }

# Run: uvicorn app:app --reload
# Run: uvicorn app:app --reload
Best Practices
- Chunk size: Balance between context and relevance (500-1000 tokens)
- Overlap: Use chunk overlap to maintain context (100-200 tokens)
- Retrieval count: Retrieve 3-5 documents for generation
- Reranking: Rerank retrieved documents for better relevance (see the sketch after this list)
- Caching: Cache embeddings and retrievals
- Monitoring: Track retrieval quality and generation accuracy
- Updates: Regularly update document collection
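Reranking is commonly done with a cross-encoder, which scores each (query, document) pair jointly and is usually more accurate than the bi-encoder used for first-stage retrieval. A minimal sketch, assuming the sentence-transformers package is installed (the model name below is one common public checkpoint):

from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rerank(query, documents, top_k=3):
    """Score (query, document) pairs jointly and keep the top_k best."""
    pairs = [(query, doc) for doc in documents]
    scores = reranker.predict(pairs)  # one relevance score per pair
    ranked = sorted(zip(documents, scores), key=lambda x: x[1], reverse=True)
    return [doc for doc, _ in ranked[:top_k]]

A typical pattern is to over-retrieve from the vector store (say k=20), then rerank down to the 3-5 documents that actually enter the prompt.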
Common Pitfalls
Bad Practice:
# Don't: Large chunks without overlap
splitter = CharacterTextSplitter(chunk_size=5000)
# Don't: Retrieve too many documents
retriever = vector_store.as_retriever(search_kwargs={"k": 20})
# Don't: No evaluation
# Deploy without testing retrieval quality
Good Practice:
# Do: Appropriate chunk size with overlap
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# Do: Retrieve optimal number
retriever = vector_store.as_retriever(search_kwargs={"k": 3})
# Do: Evaluate system
# Test retrieval quality and generation accuracy
Conclusion
RAG systems combine retrieval and generation to provide accurate, grounded answers. Master document chunking, vector search, and prompt engineering to build effective RAG applications. Use appropriate vector databases and evaluation metrics to ensure quality.