16. Document Search
Chapter 16 of 18 · 25 min
After processing documents into structured data, enabling search requires indexing content and supporting queries. This chapter covers implementing full-text search over document collections.
Search Architecture
Document search involves three components:
- Index: Pre-computed structure enabling fast lookup
- Query Engine: Parses queries and retrieves matching documents
- Ranker: Orders results by relevance
Using SQLite FTS5 for Full-Text Search
SQLite's FTS5 extension provides embedded full-text search without external dependencies:
import sqlite3
def create_search_index(db_path):
conn = sqlite3.connect(db_path)
conn.execute("""
CREATE VIRTUAL TABLE documents USING fts5(
path,
content,
metadata,
content_rowid='rowid'
)
""")
conn.commit()
return conn
def index_document(conn, path, content, metadata):
conn.execute(
"INSERT INTO documents (path, content, metadata) VALUES (?, ?, ?)",
(path, content, json.dumps(metadata))
)
conn.commit()
def search_documents(conn, query, limit=10):
cursor = conn.execute("""
SELECT path, snippet(documents, 1, '<b>', '</b>', '...', 32) as snippet
FROM documents
WHERE documents MATCH ?
ORDER BY rank
LIMIT ?
""", (query, limit))
return cursor.fetchall()
BM25 Ranking
FTS5 uses BM25 ranking by default. For more control, implement custom ranking:
import math
def bm25_score(term_freq, doc_len, avg_doc_len, doc_count, term_doc_freq, k1=1.5, b=0.75):
idf = math.log((doc_count - term_doc_freq + 0.5) / (term_doc_freq + 0.5))
tf_norm = (term_freq * (k1 + 1)) / (term_freq + k1 * (1 - b + b * (doc_len / avg_doc_len)))
return idf * tf_norm
def rank_results(results, query_terms):
doc_lengths = {r["path"]: len(r["content"].split()) for r in results}
avg_len = sum(doc_lengths.values()) / len(doc_lengths) if doc_lengths else 1
doc_count = len(results)
scores = {}
for result in results:
score = 0
content_lower = result["content"].lower()
for term in query_terms:
tf = content_lower.count(term.lower())
doc_freq = sum(1 for r in results if term.lower() in r["content"].lower())
score += bm25_score(tf, doc_lengths[result["path"]], avg_len, doc_count, doc_freq)
scores[result["path"]] = score
return sorted(results, key=lambda r: scores[r["path"]], reverse=True)
Semantic Search with Embeddings
For natural language queries, convert query and documents to vectors and find closest matches:
import numpy as np
def embed_text(text, model):
embeddings = model.encode([text])
return embeddings[0]
def semantic_search(documents, query, embed_model, top_k=5):
query_embedding = embed_text(query, embed_model)
results = []
for doc in documents:
doc_embedding = embed_text(doc["content"][:5000], embed_model)
similarity = np.dot(query_embedding, doc_embedding) / (
np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
)
results.append({
"path": doc["path"],
"similarity": float(similarity),
"content": doc["content"][:500]
})
return sorted(results, key=lambda r: r["similarity"], reverse=True)[:top_k]
Hybrid Search
Combine keyword and semantic search:
def hybrid_search(conn, query, embed_model, documents, top_k=10, keyword_weight=0.5):
keyword_results = search_documents(conn, query, limit=top_k * 2)
semantic_results = semantic_search(documents, query, embed_model, top_k=top_k * 2)
keyword_paths = {r[0]: score for score, r in enumerate(keyword_results)}
semantic_paths = {r["path"]: score for score, r in enumerate(semantic_results)}
all_paths = set(keyword_paths.keys()) | set(semantic_paths.keys())
scored_results = []
for path in all_paths:
kw_score = 1 / (keyword_paths.get(path, len(all_paths)) + 1)
sem_score = 1 / (semantic_paths.get(path, len(all_paths)) + 1)
combined = keyword_weight * kw_score + (1 - keyword_weight) * sem_score
scored_results.append((combined, path))
return sorted(scored_results, reverse=True)[:top_k]
EXERCISE
Build a search interface that indexes documents from a processed directory, supports both keyword and semantic queries, returns snippets with highlighted matches, and logs query statistics for analysis.