Document Search — Document Processing with Local AI (Chapter 16)

After processing documents into structured data, enabling search requires indexing content and supporting queries. This chapter covers implementing full-text search over document collections.

Search Architecture

Document search involves three components:

Index: Pre-computed structure enabling fast lookup
Query Engine: Parses queries and retrieves matching documents
Ranker: Orders results by relevance

Using SQLite FTS5 for Full-Text Search

SQLite's FTS5 extension provides embedded full-text search without external dependencies:

import sqlite3

def create_search_index(db_path):
    conn = sqlite3.connect(db_path)
    conn.execute("""
        CREATE VIRTUAL TABLE documents USING fts5(
            path,
            content,
            metadata,
            content_rowid='rowid'
        )
    """)
    conn.commit()
    return conn

def index_document(conn, path, content, metadata):
    conn.execute(
        "INSERT INTO documents (path, content, metadata) VALUES (?, ?, ?)",
        (path, content, json.dumps(metadata))
    )
    conn.commit()

def search_documents(conn, query, limit=10):
    cursor = conn.execute("""
        SELECT path, snippet(documents, 1, '<b>', '</b>', '...', 32) as snippet
        FROM documents
        WHERE documents MATCH ?
        ORDER BY rank
        LIMIT ?
    """, (query, limit))
    return cursor.fetchall()

BM25 Ranking

FTS5 uses BM25 ranking by default. For more control, implement custom ranking:

import math

def bm25_score(term_freq, doc_len, avg_doc_len, doc_count, term_doc_freq, k1=1.5, b=0.75):
    idf = math.log((doc_count - term_doc_freq + 0.5) / (term_doc_freq + 0.5))
    tf_norm = (term_freq * (k1 + 1)) / (term_freq + k1 * (1 - b + b * (doc_len / avg_doc_len)))
    return idf * tf_norm

def rank_results(results, query_terms):
    doc_lengths = {r["path"]: len(r["content"].split()) for r in results}
    avg_len = sum(doc_lengths.values()) / len(doc_lengths) if doc_lengths else 1
    doc_count = len(results)
    
    scores = {}
    for result in results:
        score = 0
        content_lower = result["content"].lower()
        for term in query_terms:
            tf = content_lower.count(term.lower())
            doc_freq = sum(1 for r in results if term.lower() in r["content"].lower())
            score += bm25_score(tf, doc_lengths[result["path"]], avg_len, doc_count, doc_freq)
        scores[result["path"]] = score
    
    return sorted(results, key=lambda r: scores[r["path"]], reverse=True)

Semantic Search with Embeddings

For natural language queries, convert query and documents to vectors and find closest matches:

import numpy as np

def embed_text(text, model):
    embeddings = model.encode([text])
    return embeddings[0]

def semantic_search(documents, query, embed_model, top_k=5):
    query_embedding = embed_text(query, embed_model)
    
    results = []
    for doc in documents:
        doc_embedding = embed_text(doc["content"][:5000], embed_model)
        similarity = np.dot(query_embedding, doc_embedding) / (
            np.linalg.norm(query_embedding) * np.linalg.norm(doc_embedding)
        )
        results.append({
            "path": doc["path"],
            "similarity": float(similarity),
            "content": doc["content"][:500]
        })
    
    return sorted(results, key=lambda r: r["similarity"], reverse=True)[:top_k]

Hybrid Search

Combine keyword and semantic search:

def hybrid_search(conn, query, embed_model, documents, top_k=10, keyword_weight=0.5):
    keyword_results = search_documents(conn, query, limit=top_k * 2)
    semantic_results = semantic_search(documents, query, embed_model, top_k=top_k * 2)
    
    keyword_paths = {r[0]: score for score, r in enumerate(keyword_results)}
    semantic_paths = {r["path"]: score for score, r in enumerate(semantic_results)}
    
    all_paths = set(keyword_paths.keys()) | set(semantic_paths.keys())
    
    scored_results = []
    for path in all_paths:
        kw_score = 1 / (keyword_paths.get(path, len(all_paths)) + 1)
        sem_score = 1 / (semantic_paths.get(path, len(all_paths)) + 1)
        combined = keyword_weight * kw_score + (1 - keyword_weight) * sem_score
        scored_results.append((combined, path))
    
    return sorted(scored_results, reverse=True)[:top_k]