Advanced Retrieval Strategies
Simple similarity search finds relevant documents, but doesn’t handle every case well. Complex queries with multiple topics need sophisticated retrieval. This lesson covers hybrid search, reranking, query decomposition, HyDE, and contextual compression—techniques that significantly improve RAG quality.
Hybrid Search: Semantic + Keyword
Semantic search (embeddings) is great for meaning but misses exact keywords. Keyword search is literal but doesn’t understand paraphrasing. Hybrid search combines both:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
class HybridRetrieverSystem:
    """Hybrid retrieval: merge semantic (embedding) and keyword (BM25) search."""

    def __init__(self, docs_path: str):
        # Load the source file and chunk it for indexing.
        raw_docs = TextLoader(docs_path).load()
        chunks = RecursiveCharacterTextSplitter(chunk_size=1000).split_documents(raw_docs)

        # Dense retriever: embed chunks into Chroma, search by similarity.
        dense = Chroma.from_documents(chunks, OpenAIEmbeddings()).as_retriever(
            search_kwargs={"k": 3}
        )
        # Sparse retriever: classic BM25 keyword scoring over the same chunks.
        sparse = BM25Retriever.from_documents(chunks)

        # Run both retrievers per query and merge their rankings with equal weight.
        self.ensemble = EnsembleRetriever(
            retrievers=[dense, sparse],
            weights=[0.5, 0.5],
        )

    def retrieve(self, query: str):
        """Return documents for *query* from the combined retrievers."""
        return self.ensemble.get_relevant_documents(query)
# Usage: build the hybrid index once, then query it.
# NOTE(review): assumes "documents.txt" exists — TextLoader will raise otherwise.
hybrid = HybridRetrieverSystem("documents.txt")
results = hybrid.retrieve("API rate limiting")  # Finds both semantic matches and "API"/"rate"
Reranking: Refine Results
Retrieve many documents, then rerank them with a more sophisticated model:
from langchain_cohere import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
class RerankerSystem:
    """Two-stage retrieval: fetch candidates, then rerank them with Cohere."""

    def __init__(self, base_retriever):
        # Cohere's reranker costs more per call than embedding search but
        # orders candidates far more accurately; keep only the best 3.
        reranker = CohereRerank(top_n=3)
        self.retriever = ContextualCompressionRetriever(
            base_compressor=reranker,
            base_retriever=base_retriever,
        )

    def retrieve(self, query: str):
        """Return the reranked top documents for *query*."""
        return self.retriever.get_relevant_documents(query)
# Usage
from langchain_community.vectorstores import Chroma
# NOTE(review): "..." is a placeholder — a real Chroma construction is needed here.
vectorstore = Chroma(...)
# Over-fetch 10 candidates so the reranker has material to reorder.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
reranker = RerankerSystem(base_retriever)
top_3 = reranker.retrieve("query")  # Retrieves 10, reranks to top 3
Query Decomposition
Complex queries are easier to answer by breaking them into sub-questions:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
class QueryDecomposer:
    """Use an LLM to break a complex question into simpler sub-questions."""

    def __init__(self):
        self.model = ChatOpenAI()

    @staticmethod
    def _parse_sub_questions(text: str) -> list[str]:
        """Parse the model's numbered-list response into a list of questions.

        Strips a leading "1." / "2)" style enumerator from each non-empty
        line. Uses a regex instead of the previous ``lstrip('123456789. ')``:
        ``lstrip`` removes *any* leading run of those characters, so it
        mangled two-digit numbers ("10. x" -> "0. x") and could eat digits
        that legitimately begin a question.
        """
        import re

        sub_questions = []
        for line in text.split("\n"):
            line = line.strip()
            if line:
                sub_questions.append(re.sub(r"^\d+\s*[.)]?\s*", "", line))
        return sub_questions

    def decompose(self, query: str) -> list[str]:
        """Ask the model for 3-5 sub-questions and return them as a list."""
        prompt = ChatPromptTemplate.from_template("""
Break this complex question into 3-5 simpler sub-questions.
Return each on a new line, starting with a number.
Question: {query}
Sub-questions:""")
        response = self.model.invoke(prompt.format_messages(query=query))
        return self._parse_sub_questions(response.content)
class MultiQueryRAG:
    """RAG that answers complex questions by decomposing them first."""

    def __init__(self, retriever):
        self.decomposer = QueryDecomposer()
        self.retriever = retriever

    def retrieve_for_complex_query(self, query: str) -> list[str]:
        """Return deduplicated document texts covering every sub-question.

        Fix: the original accumulated texts in a ``set``, which made the
        returned list's order nondeterministic between runs. An
        insertion-ordered dict dedupes while preserving first-seen
        (retrieval) order.
        """
        sub_questions = self.decomposer.decompose(query)
        seen: dict[str, None] = {}
        for sub_q in sub_questions:
            for doc in self.retriever.get_relevant_documents(sub_q):
                seen.setdefault(doc.page_content, None)
        return list(seen)
# Usage: one complex question fans out into several retrievals.
# NOTE(review): base_retriever comes from the reranking example above.
rag = MultiQueryRAG(base_retriever)
docs = rag.retrieve_for_complex_query(
    "How do machine learning models differ from deep learning, and what are their use cases?"
)
HyDE: Hypothetical Document Embeddings
Instead of embedding the question, generate a hypothetical answer and embed that:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
class HyDERetriever:
    """HyDE: retrieve with a hypothetical answer instead of the raw query."""

    def __init__(self, base_retriever):
        self.model = ChatOpenAI()
        # Kept for interface compatibility; no longer used by retrieve() —
        # the base retriever embeds the hypothetical document itself.
        self.embeddings = OpenAIEmbeddings()
        self.base_retriever = base_retriever

    def generate_hypothetical_document(self, query: str) -> str:
        """Ask the model to draft a plausible answer document for *query*."""
        prompt = ChatPromptTemplate.from_template("""
Imagine a document that would answer this question.
Write a realistic answer that the document would contain.
Question: {query}
Hypothetical document:""")
        response = self.model.invoke(prompt.format_messages(query=query))
        return response.content

    def retrieve(self, query: str, k: int = 3):
        """Retrieve documents similar to a hypothetical answer for *query*.

        Fix: the original also called ``self.embeddings.embed_query`` on the
        hypothetical document and discarded the result — a paid embedding
        API call with no effect, since the base retriever embeds the text
        on its own.

        NOTE(review): ``k`` is accepted for interface stability but the
        result count is actually governed by the base retriever's own ``k``.
        """
        hyde_doc = self.generate_hypothetical_document(query)
        return self.base_retriever.get_relevant_documents(hyde_doc)
# Usage: retrieval is driven by a generated hypothetical answer, not the raw question.
hyde_retriever = HyDERetriever(base_retriever)
docs = hyde_retriever.retrieve("What are transformers in ML?")
Contextual Compression
Keep only relevant portions of retrieved documents:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMCompressor
from langchain_openai import ChatOpenAI
class CompressedRetriever:
    """Compress documents to keep only relevant parts."""

    def __init__(self, base_retriever):
        model = ChatOpenAI()
        # NOTE(review): `LLMCompressor` / `from_llm_and_prompt` do not match
        # langchain's public API — the LLM-based extraction compressor is
        # `LLMChainExtractor`, built via `LLMChainExtractor.from_llm(llm, prompt=...)`.
        # Confirm and fix the import at the top of this snippet accordingly.
        compressor = LLMCompressor.from_llm_and_prompt(
            llm=model,
            # NOTE(review): LangChain's extraction prompt conventionally uses
            # {question}/{context} variables — verify {query}/{text} are accepted.
            prompt=ChatPromptTemplate.from_template(
                "Extract the parts of the following text that are relevant "
                "to the question: {query}\n\nText:\n{text}"
            )
        )
        # Wrap the base retriever so each retrieved document is compressed
        # (irrelevant passages dropped) before being returned.
        self.retriever = ContextualCompressionRetriever(
            base_compressor=compressor,
            base_retriever=base_retriever
        )

    def retrieve(self, query: str):
        """Retrieve with compression."""
        return self.retriever.get_relevant_documents(query)
# Usage: results contain only the excerpts the LLM judged relevant.
compressed = CompressedRetriever(base_retriever)
docs = compressed.retrieve("specific detail")  # Only relevant excerpts
Combining Advanced Strategies
Use multiple techniques together:
class AdvancedRAGRetriever:
    """Compose HyDE, reranking, and query decomposition into one retriever."""

    def __init__(self, docs, use_decomposition=True, use_hyde=True, use_rerank=True):
        embeddings = OpenAIEmbeddings()
        vectorstore = Chroma.from_documents(docs, embeddings)
        base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
        # Wrap the base retriever in the requested stages.
        # NOTE(review): HyDERetriever is a plain wrapper, not a LangChain
        # retriever — stacking RerankerSystem on top hands
        # ContextualCompressionRetriever an object without
        # get_relevant_documents; confirm before enabling both flags together.
        if use_hyde:
            base_retriever = HyDERetriever(base_retriever)
        if use_rerank:
            base_retriever = RerankerSystem(base_retriever)
        self.retriever = base_retriever
        self.decomposer = QueryDecomposer() if use_decomposition else None

    def _fetch(self, query: str):
        """Call the wrapped retriever via whichever interface it exposes.

        Fix: the original always called ``get_relevant_documents``, which
        raises AttributeError once the retriever has been wrapped in
        HyDERetriever or RerankerSystem — those wrappers expose
        ``retrieve`` instead.
        """
        if hasattr(self.retriever, "get_relevant_documents"):
            return self.retriever.get_relevant_documents(query)
        return self.retriever.retrieve(query)

    def retrieve(self, query: str):
        """Retrieve documents, decomposing *query* first when enabled."""
        if self.decomposer is None:
            # Simple path: a single retrieval for the whole query.
            return self._fetch(query)
        # Decompose, retrieve per sub-question, and concatenate the results.
        all_docs = []
        for sub_q in self.decomposer.decompose(query):
            all_docs.extend(self._fetch(sub_q))
        return all_docs
# Usage
# NOTE(review): split_docs is not defined at this scope in the lesson — it was
# local to HybridRetrieverSystem.__init__; recreate the chunk list first.
advanced = AdvancedRAGRetriever(split_docs)
docs = advanced.retrieve("complex multi-part question")
Evaluation of Retrieval Quality
Measure how good your retrieval is:
from typing import List
class RetrievelQualityEvaluator:
    """Standard rank-based metrics for judging retrieval output.

    (Class name kept as-is for compatibility; "Retrievel" is a typo of
    "Retrieval".)
    """

    @staticmethod
    def precision_at_k(relevant_docs: List[str], retrieved_docs: List[str], k: int = 5) -> float:
        """Fraction of the top-k retrieved documents that are relevant."""
        if k <= 0:
            return 0
        wanted = set(relevant_docs)
        return sum(doc in wanted for doc in retrieved_docs[:k]) / k

    @staticmethod
    def recall_at_k(relevant_docs: List[str], retrieved_docs: List[str], k: int = 5) -> float:
        """Fraction of all relevant documents that appear in the top-k retrieved."""
        if not relevant_docs:
            return 0
        wanted = set(relevant_docs)
        found = sum(doc in wanted for doc in retrieved_docs[:k])
        return found / len(relevant_docs)

    @staticmethod
    def mrr(relevant_docs: List[str], retrieved_docs: List[str]) -> float:
        """Reciprocal rank of the first relevant retrieved document (0 if none)."""
        wanted = set(relevant_docs)
        for rank, doc in enumerate(retrieved_docs, start=1):
            if doc in wanted:
                return 1 / rank
        return 0
# Usage: score a retrieval run against a known-relevant ("golden") set.
evaluator = RetrievelQualityEvaluator()
relevant = ["doc1", "doc2", "doc3"]
retrieved = ["doc1", "wrong", "doc2", "doc4", "doc3"]
precision = evaluator.precision_at_k(relevant, retrieved, k=5)  # 3 hits in top 5 -> 0.60
recall = evaluator.recall_at_k(relevant, retrieved, k=5)  # all 3 relevant found -> 1.00
mrr = evaluator.mrr(relevant, retrieved)  # first relevant at rank 1 -> 1.00
print(f"Precision@5: {precision:.2f}")
print(f"Recall@5: {recall:.2f}")
print(f"MRR: {mrr:.2f}")
Key Takeaway
Advanced retrieval goes beyond similarity search. Hybrid search combines semantic and keyword retrieval. Reranking refines results with expensive but accurate models. Query decomposition breaks complex questions into simpler sub-questions. HyDE uses hypothetical documents for better embeddings. Contextual compression keeps only relevant excerpts. Combine multiple strategies for best results. Evaluate retrieval quality with precision, recall, and MRR metrics.
Exercises
-
Hybrid search: Implement and test hybrid retrieval on a document set. Compare results to pure semantic search.
-
Reranking: Add reranking to your retriever. Verify reranked results are better than original ranking.
-
Query decomposition: Test decomposition on complex multi-part questions. Verify all aspects get answered.
-
HyDE: Implement HyDE retrieval and compare results to standard similarity search.
-
Evaluation: Test your retrieval on a golden dataset. Calculate precision, recall, and MRR.
-
Pipeline: Combine 2-3 advanced techniques. Measure impact on retrieval quality.