18. Search Engine Project

Chapter 18 of 18 · 25 min

KEY INSIGHT

"""Build a complete document Q&A system by combining semantic search with
retrieval-augmented generation.

This final chapter integrates everything: document indexing, metadata
filtering, semantic search, and presenting results.
"""

import hashlib
import json
from pathlib import Path
from typing import Dict, List, Optional

import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer


class DocumentQASystem:
    """Complete document Q&A system with semantic search.

    Supports:
    - Batch document ingestion (idempotent: IDs are content hashes and
      documents are upserted)
    - Metadata filtering
    - Semantic similarity search
    - Result ranking and confidence scoring

    Attributes:
        model_name: Name of the sentence-transformers model in use.
        model: The loaded ``SentenceTransformer`` (used here to report the
            embedding dimension).
        dimension: Size of the embedding vectors produced by ``model``.
        client: Persistent ChromaDB client rooted at ``persist_directory``.
        collection: The ``"documents"`` collection, configured for cosine
            distance.
        cache_dir: ``<persist_directory>/cache``.
        cache_index: Path of the JSON cache index inside ``cache_dir``.
        cache: Dict loaded from ``cache_index``. Nothing in this class
            writes to it yet; call ``_save_cache_index()`` after changing it.
    """

    def __init__(
        self,
        persist_directory: str = "./qa_index",
        model_name: str = "all-MiniLM-L6-v2",
    ):
        """Load the embedding model and open (or create) the index.

        Args:
            persist_directory: Directory where ChromaDB stores the index.
            model_name: sentence-transformers model used for embeddings.
        """
        # Remember the name ourselves: SentenceTransformer does not expose a
        # ``model_name`` attribute, and stats() needs to report it.
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.dimension = self.model.get_sentence_embedding_dimension()

        # Initialize ChromaDB. The collection needs an object implementing
        # Chroma's EmbeddingFunction protocol; a raw SentenceTransformer does
        # not, so use the adapter that ships with chromadb.
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection = self.client.get_or_create_collection(
            name="documents",
            embedding_function=(
                embedding_functions.SentenceTransformerEmbeddingFunction(
                    model_name=model_name
                )
            ),
            metadata={"hnsw:space": "cosine"},
        )

        # Initialize cache
        self.cache_dir = Path(persist_directory) / "cache"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_index = self.cache_dir / "index.json"
        self._load_cache_index()

    def _load_cache_index(self) -> None:
        """Load ``self.cache`` from disk, starting empty if it is unusable."""
        try:
            with open(self.cache_index, encoding="utf-8") as f:
                loaded = json.load(f)
        except FileNotFoundError:
            loaded = {}
        except json.JSONDecodeError as exc:
            # A corrupt cache must not prevent the index from opening.
            print(f"Ignoring unreadable cache index {self.cache_index}: {exc}")
            loaded = {}
        self.cache = loaded if isinstance(loaded, dict) else {}

    def _save_cache_index(self) -> None:
        """Write ``self.cache`` to the JSON cache index."""
        with open(self.cache_index, "w", encoding="utf-8") as f:
            json.dump(self.cache, f)

    def ingest(
        self,
        documents: List[str],
        metadatas: Optional[List[Dict]] = None,
        batch_size: int = 100,
    ) -> int:
        """Ingest documents with progress reporting.

        Documents are keyed by a hash of their text, so ingesting the same
        text again updates the existing entry instead of duplicating it.

        Args:
            documents: Texts to index.
            metadatas: Optional metadata dicts, one per document.
            batch_size: Number of documents sent to ChromaDB per call.

        Returns:
            The number of distinct documents written to the index.

        Raises:
            ValueError: If ``batch_size`` is less than 1 or ``metadatas``
                does not have one entry per document.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        if metadatas is not None and len(metadatas) != len(documents):
            raise ValueError(
                f"Got {len(documents)} documents but {len(metadatas)} metadatas"
            )

        # Collapse repeated texts: ChromaDB rejects duplicate IDs within one
        # call. When a text repeats, the last metadata supplied wins.
        unique: Dict[str, tuple] = {}
        for position, doc in enumerate(documents):
            meta = metadatas[position] if metadatas is not None else None
            # ChromaDB rejects empty metadata dicts; None means "no metadata".
            unique[self._generate_id(doc)] = (doc, meta or None)

        ids = list(unique)
        total = len(ids)
        total_indexed = 0

        for start in range(0, total, batch_size):
            batch_ids = ids[start:start + batch_size]
            batch_docs = [unique[doc_id][0] for doc_id in batch_ids]
            batch_metas = (
                None
                if metadatas is None
                else [unique[doc_id][1] for doc_id in batch_ids]
            )

            # upsert (not add) so re-running ingestion is idempotent.
            self.collection.upsert(
                documents=batch_docs,
                ids=batch_ids,
                metadatas=batch_metas,
            )

            total_indexed += len(batch_ids)
            print(f"Indexed {total_indexed}/{total} documents")

        return total_indexed

    def _generate_id(self, text: str) -> str:
        """Generate deterministic ID from content hash."""
        return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]

    @staticmethod
    def _distance_to_similarity(distance: float) -> float:
        """Map a cosine distance to a similarity score in [0, 1].

        The collection is created with ``hnsw:space = cosine``, where
        distance is ``1 - cosine_similarity``. Negative similarities are
        clamped to 0 so the score can be shown as a percentage.
        """
        return max(0.0, min(1.0, 1.0 - distance))

    def search(
        self,
        query: str,
        top_k: int = 5,
        filters: Optional[Dict] = None,
        min_score: float = 0.0,
    ) -> List[Dict]:
        """Search for relevant documents.

        Args:
            query: Natural-language query text.
            top_k: Maximum number of results to return.
            filters: Optional ChromaDB ``where`` clause on metadata.
            min_score: Drop results whose similarity is below this value.

        Returns:
            A list of dicts with ``id``, ``content``, ``metadata``,
            ``similarity`` and ``distance`` keys, in the order returned by
            ChromaDB (closest first).

        Raises:
            ValueError: If ``top_k`` is less than 1.
        """
        if top_k < 1:
            raise ValueError("top_k must be at least 1")

        available = self.collection.count()
        if available == 0:
            return []

        results = self.collection.query(
            query_texts=[query],
            # Never ask for more neighbours than the index holds.
            n_results=min(top_k, available),
            # Treat an empty filter dict the same as "no filter".
            where=filters or None,
            include=["documents", "metadatas", "distances"],
        )

        # One query was sent, so each result field holds a single inner list.
        hits = zip(
            results["ids"][0],
            results["documents"][0],
            results["metadatas"][0],
            results["distances"][0],
        )

        documents = []
        for doc_id, content, metadata, distance in hits:
            similarity = self._distance_to_similarity(distance)
            if similarity >= min_score:
                documents.append({
                    "id": doc_id,
                    "content": content,
                    "metadata": metadata or {},
                    "similarity": similarity,
                    "distance": distance,
                })

        return documents

    @staticmethod
    def _preview(content: str, limit: int = 200) -> str:
        """Return ``content`` truncated to ``limit`` characters plus "..."."""
        return content[:limit] + "..." if len(content) > limit else content

    def ask(
        self,
        question: str,
        context_docs: int = 3,
        filters: Optional[Dict] = None,
    ) -> Dict:
        """Answer a question by finding relevant documents.

        Returns the most relevant documents and suggests an answer based on
        retrieved context.

        Args:
            question: The question to answer.
            context_docs: Maximum number of documents to retrieve.
            filters: Optional ChromaDB ``where`` clause on metadata.

        Returns:
            A dict with ``question``, ``answer``, ``sources`` and
            ``total_found`` keys. ``sources`` is empty when nothing relevant
            was found.
        """
        relevant_docs = self.search(
            question,
            top_k=context_docs,
            filters=filters,
            min_score=0.1,
        )

        if not relevant_docs:
            return {
                "question": question,
                "answer": "No relevant documents found.",
                "sources": [],
                "total_found": 0,
            }

        # Build context from top documents
        context = "\n\n".join(
            f"[Source {i}]: {doc['content']}"
            for i, doc in enumerate(relevant_docs, start=1)
        )

        # Format response
        return {
            "question": question,
            "answer": (
                f"Based on {len(relevant_docs)} relevant source(s):\n\n{context}"
            ),
            "sources": [
                {
                    "content": self._preview(doc["content"]),
                    "metadata": doc["metadata"],
                    "confidence": f"{doc['similarity']:.2%}",
                }
                for doc in relevant_docs
            ],
            "total_found": len(relevant_docs),
        }

    def stats(self) -> Dict:
        """Get index statistics."""
        return {
            "total_documents": self.collection.count(),
            "embedding_dimension": self.dimension,
            "model": self.model_name,
            "collection_name": self.collection.name,
        }


def main() -> None:
    """Demo usage: ingest sample documents and run three queries."""
    # Initialize system
    qa = DocumentQASystem(persist_directory="./demo_qa")

    # Sample documents
    documents = [
        ("Python was created by Guido van Rossum in 1991.",
         {"topic": "python", "year": 1991}),
        ("Python supports multiple programming styles including OOP.",
         {"topic": "python", "concept": "styles"}),
        ("FastAPI is a modern Python web framework for building APIs.",
         {"topic": "fastapi", "category": "framework"}),
        ("ChromaDB is a vector database for AI applications.",
         {"topic": "chromadb", "category": "database"}),
        ("FAISS is a library for efficient similarity search.",
         {"topic": "faiss", "category": "library"}),
        ("Embeddings convert text to numerical vectors.",
         {"topic": "embeddings", "concept": "vectors"}),
        ("Docker containers package applications with their dependencies.",
         {"topic": "docker", "category": "devops"}),
        ("Kubernetes automates deployment and scaling of containers.",
         {"topic": "kubernetes", "category": "devops"}),
    ]

    # Ingest documents
    print("Ingesting documents...")
    texts = [text for text, _ in documents]
    metas = [meta for _, meta in documents]
    qa.ingest(texts, metas)

    # Show stats
    print(f"\nIndex stats: {qa.stats()}")

    # Run queries
    print("\n" + "=" * 60)
    print("QUERY 1: 'Tell me about Python programming'")
    print("=" * 60)
    result = qa.ask("Tell me about Python programming")
    print(result["answer"])
    print(f"\nConfidence scores: {[s['confidence'] for s in result['sources']]}")

    print("\n" + "=" * 60)
    print("QUERY 2: 'What is vector database technology?'")
    print("=" * 60)
    result = qa.ask("What is vector database technology?")
    print(result["answer"])

    # "devops" is stored under the "category" key in the sample metadata,
    # so the filter must target that key to match anything.
    print("\n" + "=" * 60)
    print("QUERY 3: Filter by category='devops'")
    print("=" * 60)
    result = qa.ask(
        "deployment and scaling",
        filters={"category": {"$eq": "devops"}},
    )
    print(result["answer"])


if __name__ == "__main__":
    main()

Summary

You now have a working semantic search system:

Embeddings convert text to 384-dimensional vectors that capture meaning
ChromaDB stores vectors with metadata and supports filtering
FAISS provides faster search for very large datasets
LangChain offers abstractions for swapping backends
Batch processing handles thousands of documents efficiently
Caching avoids recomputing embeddings unnecessarily
Persistence ensures your index survives restarts

The DocumentQASystem built in this chapter is production-ready for moderate workloads. For billions of documents, migrate to FAISS with IVF indexes or to a dedicated vector database such as Qdrant or Weaviate running as a service.

Key files to keep:

# Your index directory (ChromaDB persists here)
./qa_index/

# Your embedding model cache (sentence-transformers)
~/.cache/huggingface/

# Backup before any destructive operations
./backup_YYYYMMDD_HHMMSS/

EXERCISE

Extend the DocumentQASystem with:

Document deletion support (delete_document(doc_id))
Update support (update_document(doc_id, new_text, new_metadata))
A bulk_search method that accepts multiple queries and returns results for all
Persistence of query history with timestamps

Run queries, verify results, and demonstrate all features work together as a cohesive system.

Summary

You now have a working semantic search system:

Embeddings convert text to 384-dimensional vectors that capture meaning
ChromaDB stores vectors with metadata and supports filtering
FAISS provides faster search for very large datasets
LangChain offers abstractions for swapping backends
Batch processing handles thousands of documents efficiently
Caching avoids recomputing embeddings unnecessarily
Persistence ensures your index survives restarts

Key files to keep:

# Your index directory (ChromaDB persists here)
./qa_index/

# Your embedding model cache (sentence-transformers)
~/.cache/huggingface/

# Backup before any destructive operations
./backup_YYYYMMDD_HHMMSS/