Performance Benchmarking — Enterprise-Scale RAG (Chapter 19)

Benchmarking establishes baseline performance metrics and validates that systems meet latency and throughput requirements. Without benchmarks, regressions go undetected until production incidents occur.

The benchmark suite structure:

import time
import statistics
import asyncio
from concurrent.futures import ThreadPoolExecutor

class RAGBenchmarkSuite:
    """Latency and throughput benchmarks for a RAG pipeline.

    The suite drives a vector-store client directly. The client must expose
    ``search(collection=..., query_vector=..., limit=...)`` and an awaitable
    ``search_async`` with the same keyword arguments.

    NOTE(review): ``self._embed_query`` is called by both benchmarks but is
    not defined in the code shown here; it must be supplied by a subclass or
    defined elsewhere on this class.
    """

    def __init__(self, vector_client, llm_endpoint: str):
        """Store the vector-store client and the LLM endpoint address."""
        self.vector = vector_client
        self.llm_endpoint = llm_endpoint

    def benchmark_vector_search(
        self,
        query: str,
        k: int = 10,
        iterations: int = 1000,
        warmup: int = 0,
    ) -> dict:
        """Measure the latency distribution of repeated vector searches.

        The query is embedded once, outside the timed region, so only the
        ``search`` call itself is measured.

        Args:
            query: Text to embed and search for.
            k: Number of neighbours requested per search.
            iterations: Number of timed searches (must be >= 1).
            warmup: Number of untimed searches issued before measuring, to
                keep cold-start effects out of the recorded samples.

        Returns:
            A dict with ``iterations`` and the ``avg_ms``, ``p50_ms``,
            ``p95_ms``, ``p99_ms``, ``min_ms`` and ``max_ms`` latencies in
            milliseconds.

        Raises:
            ValueError: If ``iterations`` < 1 or ``warmup`` < 0.
        """
        if iterations < 1:
            raise ValueError("iterations must be >= 1")
        if warmup < 0:
            raise ValueError("warmup must be >= 0")

        embedding = self._embed_query(query)

        for _ in range(warmup):
            self.vector.search(
                collection="chunks",
                query_vector=embedding,
                limit=k,
            )

        latencies = []
        for _ in range(iterations):
            start = time.perf_counter()
            self.vector.search(
                collection="chunks",
                query_vector=embedding,
                limit=k,
            )
            latencies.append((time.perf_counter() - start) * 1000)

        if len(latencies) == 1:
            # quantiles() rejects a single sample on Python < 3.13.
            p50 = p95 = p99 = latencies[0]
        else:
            # n=100 yields 99 cut points: index i holds percentile i + 1.
            # "inclusive" interpolates between observed samples, so the
            # reported percentiles stay within [min, max] for small runs.
            cuts = statistics.quantiles(latencies, n=100, method="inclusive")
            p50, p95, p99 = cuts[49], cuts[94], cuts[98]

        return {
            "iterations": iterations,
            "avg_ms": statistics.mean(latencies),
            "p50_ms": p50,
            "p95_ms": p95,
            "p99_ms": p99,
            "min_ms": min(latencies),
            "max_ms": max(latencies),
        }

    def benchmark_e2e_rag(self, queries: list[str], concurrency: int = 10) -> dict:
        """Benchmark the end-to-end pipeline with bounded concurrency.

        At most ``concurrency`` queries are in flight at any time. Per-query
        timers start once a query is admitted, so time spent waiting for a
        free slot is not counted as latency. Throughput is the number of
        queries divided by the wall-clock time of the whole batch.

        Generation is still a placeholder: the prompt is assembled but no
        LLM call is made, so ``avg_generation_ms`` is effectively zero.

        Args:
            queries: Query strings to run; must not be empty.
            concurrency: Maximum number of simultaneous queries (>= 1).

        Returns:
            A dict with ``concurrency``, ``total_queries``,
            ``throughput_qps`` and the mean ``avg_retrieval_ms``,
            ``avg_generation_ms`` and ``avg_total_ms`` in milliseconds.

        Raises:
            ValueError: If ``queries`` is empty or ``concurrency`` < 1.
            RuntimeError: From ``asyncio.run`` when called while an event
                loop is already running in the current thread.
        """
        if not queries:
            raise ValueError("queries must not be empty")
        if concurrency < 1:
            raise ValueError("concurrency must be >= 1")

        async def single_query_latency(
            q: str, limiter: asyncio.Semaphore
        ) -> dict[str, float]:
            async with limiter:
                start = time.perf_counter()

                # Retrieval (the timer also covers query embedding).
                # NOTE(review): _embed_query is called synchronously here;
                # if it blocks, it stalls the other in-flight queries.
                embedding = self._embed_query(q)
                retrieval_results = await self.vector.search_async(
                    collection="chunks",
                    query_vector=embedding,
                    limit=5,
                )
                retrieval_ms = (time.perf_counter() - start) * 1000

                # Augmentation. Payloads are coerced with str() so that
                # non-string payloads do not break the join.
                context = "\n".join(str(r.payload) for r in retrieval_results)
                prompt = f"Context: {context}\n\nQuestion: {q}"  # noqa: F841

                # Generation (mock).
                # TODO: send `prompt` to self.llm_endpoint and await the reply.
                gen_start = time.perf_counter()
                generation_ms = (time.perf_counter() - gen_start) * 1000

                return {
                    "retrieval_ms": retrieval_ms,
                    "generation_ms": generation_ms,
                    "total_ms": (time.perf_counter() - start) * 1000,
                }

        async def run_concurrent_queries():
            # The semaphore is created inside the running loop.
            limiter = asyncio.Semaphore(concurrency)
            batch_start = time.perf_counter()
            timings = await asyncio.gather(
                *(single_query_latency(q, limiter) for q in queries)
            )
            return timings, time.perf_counter() - batch_start

        results, wall_seconds = asyncio.run(run_concurrent_queries())

        return {
            "concurrency": concurrency,
            "total_queries": len(queries),
            # Floor the divisor at 1 ms so a near-instant mock run cannot
            # divide by zero.
            "throughput_qps": len(queries) / max(wall_seconds, 0.001),
            "avg_retrieval_ms": statistics.mean(r["retrieval_ms"] for r in results),
            "avg_generation_ms": statistics.mean(r["generation_ms"] for r in results),
            "avg_total_ms": statistics.mean(r["total_ms"] for r in results),
        }

Failure Modes:

Cold-start benchmarks: The first queries after a deployment show inflated latency because caches, indexes, and connections are still cold. Always pre-warm with 100+ queries before recording metrics.
Hardware variance: Cloud VM performance varies by host. Run benchmarks on the same instance types used in production, not on shared development hardware.
Connection pooling: Creating new connections per query inflates latency 10-100x. Always use persistent connection pools.
Measuring non-representative queries: Synthetic queries may not reflect production distribution. Collect anonymized real queries for benchmarking.

Store benchmark results in a time-series database for trend analysis across deployments.