Agentic RAG: Dynamic Retrieval Strategies for Smarter, Context-Aware Answers
This tutorial demonstrates how to build an Agentic Retrieval-Augmented Generation (RAG) system that does more than simple document lookup. The agent decides whether retrieval is necessary, selects an appropriate retrieval strategy, and synthesizes answers with contextual awareness — combining embeddings, FAISS indexing, and an LLM (mocked here) to illustrate the pipeline.
Agentic components and a mock LLM
We start by defining a lightweight mock LLM, a retrieval strategy enum, and a Document dataclass to structure our knowledge base. The mock LLM pattern-matches the decision-making prompts used throughout the pipeline and returns canned responses, so the tutorial runs end to end without an API key.
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import re
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from enum import Enum
class MockLLM:
    """Stand-in for a real LLM: pattern-matches prompts and returns canned responses."""

    def generate(self, prompt: str, max_tokens: int = 150) -> str:
        # max_tokens is accepted for interface parity but ignored by the mock
        prompt_lower = prompt.lower()
        # Run keyword checks against the quoted query only: the prompt boilerplate
        # itself contains trigger words such as "specific facts" and "comparisons",
        # which would otherwise match on every call.
        query_match = re.search(r'query:\s*"([^"]*)"', prompt_lower)
        query_text = query_match.group(1) if query_match else prompt_lower
        if "decide whether to retrieve" in prompt_lower:
            # "compare" is included so comparison queries also trigger retrieval
            if any(word in query_text for word in ["specific", "recent", "data", "facts", "when", "who", "what", "compare"]):
                return "RETRIEVE: The query requires specific factual information that needs to be retrieved."
            return "NO_RETRIEVE: This is a general question that can be answered with existing knowledge."
        elif "choose retrieval strategy" in prompt_lower:
            if "compare" in query_text or "versus" in query_text:
                return "STRATEGY: multi_query - Need to retrieve information about multiple entities for comparison."
            elif "recent" in query_text or "latest" in query_text:
                return "STRATEGY: temporal - Focus on recent information."
            return "STRATEGY: semantic - Standard semantic similarity search."
        elif "synthesize" in prompt_lower and "context:" in prompt_lower:
            return "Based on the retrieved information, here's a comprehensive answer that combines multiple sources and provides specific details with proper context."
        return "This is a mock response. In practice, use a real LLM such as OpenAI's GPT or similar."
class RetrievalStrategy(Enum):
    SEMANTIC = "semantic"
    MULTI_QUERY = "multi_query"
    TEMPORAL = "temporal"
    HYBRID = "hybrid"

@dataclass
class Document:
    id: str
    content: str
    metadata: Dict[str, Any]
    embedding: Optional[np.ndarray] = None
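The mock keeps the tutorial self-contained, but any object exposing the same generate(prompt, max_tokens) method can be dropped in. Below is a minimal sketch of such an adapter; it assumes the official openai Python package (v1+), an OPENAI_API_KEY environment variable, and a placeholder model name you should swap for one you have access to.

# Hypothetical drop-in replacement for MockLLM (assumes `pip install openai`)
from openai import OpenAI

class OpenAILLM:
    def __init__(self, model: str = "gpt-4o-mini"):  # model name is an assumption
        self.model = model
        self.client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def generate(self, prompt: str, max_tokens: int = 150) -> str:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content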
Initializing embeddings and FAISS index
The system encodes document contents with a sentence transformer, normalizes embeddings, and builds a FAISS index for efficient similarity search.
class AgenticRAGSystem:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.encoder = SentenceTransformer(model_name)
        self.llm = MockLLM()
        self.documents: List[Document] = []
        self.index: Optional[faiss.Index] = None

    def add_documents(self, documents: List[Dict[str, Any]]) -> None:
        print(f"Processing {len(documents)} documents...")
        for i, doc in enumerate(documents):
            doc_obj = Document(
                id=doc.get('id', str(i)),
                content=doc['content'],
                metadata=doc.get('metadata', {})
            )
            self.documents.append(doc_obj)
        # Re-encode the whole corpus and rebuild the index on every call;
        # fine for a demo, but use incremental updates at scale
        contents = [doc.content for doc in self.documents]
        embeddings = self.encoder.encode(contents, show_progress_bar=True)
        embeddings = np.asarray(embeddings, dtype='float32')  # FAISS expects float32
        for doc, embedding in zip(self.documents, embeddings):
            doc.embedding = embedding
        dimension = embeddings.shape[1]
        # Inner product over L2-normalized vectors equals cosine similarity
        self.index = faiss.IndexFlatIP(dimension)
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)
        print(f"Knowledge base built with {len(self.documents)} documents")
Decision-making and strategy selection
Before retrieving, the agent evaluates whether retrieval is necessary at all and which retrieval strategy best fits the query. This enables targeted, efficient searches instead of hitting the knowledge base for every request.
# AgenticRAGSystem methods (continued)
    def decide_retrieval(self, query: str) -> bool:
        decision_prompt = f"""
        Analyze the following query and decide whether to retrieve information:
        Query: "{query}"
        Decide whether to retrieve information from the knowledge base.
        Consider if this needs specific facts, recent data, or can be answered generally.
        Respond with either:
        RETRIEVE: [reason] or NO_RETRIEVE: [reason]
        """
        response = self.llm.generate(decision_prompt)
        should_retrieve = response.startswith("RETRIEVE:")
        print(f"Agent Decision: {'Retrieve' if should_retrieve else 'Direct Answer'}")
        print(f"Reasoning: {response.split(':', 1)[1].strip() if ':' in response else response}")
        return should_retrieve

    def choose_strategy(self, query: str) -> RetrievalStrategy:
        strategy_prompt = f"""
        Choose the best retrieval strategy for this query:
        Query: "{query}"
        Available strategies:
        - semantic: Standard similarity search
        - multi_query: Multiple related queries (for comparisons)
        - temporal: Focus on recent information
        - hybrid: Combination approach
        Choose retrieval strategy and explain why.
        Respond with: STRATEGY: [strategy_name] - [reasoning]
        """
        response = self.llm.generate(strategy_prompt)
        if "multi_query" in response.lower():
            strategy = RetrievalStrategy.MULTI_QUERY
        elif "temporal" in response.lower():
            strategy = RetrievalStrategy.TEMPORAL
        elif "hybrid" in response.lower():
            strategy = RetrievalStrategy.HYBRID
        else:
            strategy = RetrievalStrategy.SEMANTIC
        print(f"Retrieval Strategy: {strategy.value}")
        print(f"Reasoning: {response.split('-', 1)[1].strip() if '-' in response else response}")
        return strategy
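With the mock, substring checks on the response are enough; a real LLM's wording is less predictable. One more defensive option, sketched here as an assumption rather than part of the original pipeline, is to parse the requested "STRATEGY: [name] - [reasoning]" format explicitly and fall back to semantic search:

# Hypothetical helper: parse "STRATEGY: <name> - <reasoning>" robustly
import re

def parse_strategy(response: str) -> RetrievalStrategy:
    match = re.search(r"strategy:\s*(\w+)", response.lower())
    if match:
        try:
            return RetrievalStrategy(match.group(1))  # e.g. "multi_query"
        except ValueError:
            pass  # model named an unknown strategy
    return RetrievalStrategy.SEMANTIC  # safe default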
Retrieval approaches and synthesis
The system supports semantic search, multi-query aggregation for comparisons, and temporal re-ranking for recent information. Retrieved documents are deduplicated and used to synthesize a contextual answer.
    def retrieve_documents(self, query: str, strategy: RetrievalStrategy, k: int = 3) -> List[Document]:
        if not self.index:
            print("No knowledge base available")
            return []
        if strategy == RetrievalStrategy.MULTI_QUERY:
            # Expand into related sub-queries and merge the hits
            queries = [query, f"advantages of {query}", f"disadvantages of {query}"]
            all_docs = []
            for q in queries:
                all_docs.extend(self._semantic_search(q, k=2))
            # Deduplicate by id while preserving retrieval order
            seen_ids = set()
            unique_docs = []
            for doc in all_docs:
                if doc.id not in seen_ids:
                    unique_docs.append(doc)
                    seen_ids.add(doc.id)
            return unique_docs[:k]
        elif strategy == RetrievalStrategy.TEMPORAL:
            # Over-fetch semantically, then re-rank by metadata date
            docs = self._semantic_search(query, k=k * 2)
            docs_with_dates = [(doc, doc.metadata.get('date', '1900-01-01')) for doc in docs]
            docs_with_dates.sort(key=lambda x: x[1], reverse=True)
            return [doc for doc, _ in docs_with_dates[:k]]
        else:
            return self._semantic_search(query, k=k)

    def _semantic_search(self, query: str, k: int) -> List[Document]:
        query_embedding = self.encoder.encode([query]).astype('float32')
        faiss.normalize_L2(query_embedding)
        scores, indices = self.index.search(query_embedding, k)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            if 0 <= idx < len(self.documents):  # FAISS pads with -1 when short of k
                results.append(self.documents[idx])
        return results
    def synthesize_response(self, query: str, retrieved_docs: List[Document]) -> str:
        if not retrieved_docs:
            return self.llm.generate(f"Answer this query: {query}")
        context = "\n\n".join([f"Document {i+1}: {doc.content}"
                               for i, doc in enumerate(retrieved_docs)])
        synthesis_prompt = f"""
        Query: {query}
        Context: {context}
        Synthesize a comprehensive answer using the provided context.
        Be specific and reference the information sources when relevant.
        """
        return self.llm.generate(synthesis_prompt, max_tokens=200)
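The temporal branch above discards similarity once documents are fetched and sorts purely by date string. A common refinement, sketched here under the assumption that metadata dates are ISO-formatted (YYYY-MM-DD), is to decay the similarity score by document age instead:

# Hypothetical refinement: blend similarity with an exponential recency decay
from datetime import date

def recency_weighted_score(similarity: float, doc_date: str,
                           half_life_days: float = 180.0) -> float:
    # A document half_life_days old counts half as much as one from today
    age_days = max((date.today() - date.fromisoformat(doc_date)).days, 0)
    return similarity * 0.5 ** (age_days / half_life_days)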
End-to-end query flow
A single query run stitches together decision-making, strategy selection, retrieval, and synthesis, printing reasoning and retrieved context for transparency.
# AgenticRAGSystem methods (continued)
    def query(self, query: str) -> str:
        print(f"\nProcessing Query: '{query}'")
        print("=" * 50)
        if not self.decide_retrieval(query):
            print("\nGenerating direct response...")
            return self.llm.generate(f"Answer this query: {query}")
        strategy = self.choose_strategy(query)
        print(f"\nRetrieving documents using {strategy.value} strategy...")
        retrieved_docs = self.retrieve_documents(query, strategy)
        print(f"Retrieved {len(retrieved_docs)} documents")
        print("\nSynthesizing response...")
        response = self.synthesize_response(query, retrieved_docs)
        if retrieved_docs:
            print("\nRetrieved Context:")
            for i, doc in enumerate(retrieved_docs[:2], 1):
                print(f"  {i}. {doc.content[:100]}...")
        return response
Demo and sample knowledge base
A runnable demo builds a small knowledge base, initializes the system, and executes sample queries that show retrieval decisions and different strategies in action.
def create_sample_knowledge_base():
    return [
        {
            "id": "ai_1",
            "content": "Artificial Intelligence (AI) refers to computer systems that can perform tasks that normally require human intelligence.",
            "metadata": {"topic": "AI basics", "date": "2024-01-15"}
        },
        {
            "id": "ml_1",
            "content": "Machine Learning (ML) is a subset of AI in which systems learn patterns from data instead of following explicitly programmed rules.",
            "metadata": {"topic": "Machine Learning", "date": "2024-02-10"}
        },
        {
            "id": "rag_1",
            "content": "Retrieval-Augmented Generation (RAG) combines the power of large language models with external knowledge retrieval to provide more accurate and up-to-date responses.",
            "metadata": {"topic": "RAG", "date": "2024-03-05"}
        },
        {
            "id": "agents_1",
            "content": "AI agents are autonomous systems that can perceive their environment, make decisions, and take actions to achieve specified goals.",
            "metadata": {"topic": "AI Agents", "date": "2024-03-20"}
        }
    ]
if __name__ == "__main__":
    print("Initializing Agentic RAG System...")
    rag_system = AgenticRAGSystem()
    docs = create_sample_knowledge_base()
    rag_system.add_documents(docs)
    demo_queries = [
        "What is artificial intelligence?",  # factual -> semantic retrieval
        "How are you today?",                # conversational -> direct answer
        "Compare AI and Machine Learning",   # comparison -> multi-query retrieval
    ]
    for query in demo_queries:
        response = rag_system.query(query)
        print(f"\nFinal Response: {response}")
        print("\n" + "=" * 80)
    print("\nAgentic RAG Tutorial Complete!")
    print("\nKey Features Demonstrated:")
    print("• Agent-driven retrieval decisions")
    print("• Dynamic strategy selection")
    print("• Multiple retrieval strategies (semantic, multi-query, temporal)")
    print("• Transparent reasoning process")
Next steps and extensions
This foundation demonstrates how adding agency to RAG improves relevance and transparency. Replace the mock LLM with a real model, expand the knowledge base, tune retrieval strategies, and experiment with hybrid ranking or external temporal signals to make the system production-ready.
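As one concrete starting point for hybrid ranking, the dense FAISS scores can be blended with a lexical signal such as BM25. The sketch below assumes the third-party rank_bm25 package (pip install rank-bm25) and min-max normalizes both score lists before taking a weighted average; it is an illustration, not part of the tutorial's pipeline.

# Hypothetical hybrid scorer: blend normalized dense and BM25 scores
from rank_bm25 import BM25Okapi
from typing import List

def hybrid_scores(query: str, corpus: List[str],
                  dense_scores: List[float], alpha: float = 0.5) -> List[float]:
    # Lexical scores from BM25 over whitespace-tokenized documents
    bm25 = BM25Okapi([doc.lower().split() for doc in corpus])
    lexical = bm25.get_scores(query.lower().split())

    def norm(xs):
        lo, hi = min(xs), max(xs)
        return [(x - lo) / (hi - lo) if hi > lo else 0.0 for x in xs]

    # Weighted blend; alpha=1.0 is pure dense, alpha=0.0 is pure lexical
    return [alpha * d + (1 - alpha) * l
            for d, l in zip(norm(dense_scores), norm(lexical))]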