Performance Benchmarks — Voice AI with Local Models (Chapter 21)

Systematic benchmarking quantifies voice pipeline capabilities and identifies optimization opportunities.

Benchmark Framework

import platform
import statistics
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Callable

import psutil
import torch

@dataclass
class BenchmarkResult:
    """Summary latency statistics for one benchmarked callable.

    Instances are built by ``VoicePipelineBenchmark._compute_stats`` from a
    list of per-call latencies. Every ``*_ms`` field is in milliseconds.
    """

    name: str  # label identifying what was measured
    mean_ms: float  # arithmetic mean of the latency samples
    std_ms: float  # standard deviation of the latency samples
    p50_ms: float  # median-position sample of the sorted latencies
    p95_ms: float  # 95th-percentile-position sample of the sorted latencies
    p99_ms: float  # 99th-percentile-position sample of the sorted latencies
    throughput: float  # calls per second, derived as 1000 / mean_ms

class VoicePipelineBenchmark:
    """Latency and memory benchmarking helpers for a voice pipeline.

    The pipeline handed to the constructor is only stored; every benchmark
    method receives the callable it should measure explicitly.
    """

    def __init__(self, pipeline):
        self.pipeline = pipeline
        self.results = {}

    def benchmark_latency(
        self,
        func: Callable,
        inputs: list,
        iterations: int = 100,
        warmup: int = 10
    ) -> BenchmarkResult:
        """Time ``func`` over ``inputs`` and summarise the latencies.

        Args:
            func: Callable invoked with a single input item.
            inputs: Non-empty list of input items; they are cycled through in
                order until ``iterations`` timed calls have been made.
            iterations: Exact number of timed calls (must be >= 1).
            warmup: Number of untimed calls made with ``inputs[0]`` first.

        Returns:
            A ``BenchmarkResult`` labelled with the callable's ``__name__``
            (or its ``repr`` when it has none, e.g. ``functools.partial``).

        Raises:
            ValueError: If ``inputs`` is empty or ``iterations`` is below 1.
        """
        if not inputs:
            raise ValueError("inputs must contain at least one item")
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        use_cuda = torch.cuda.is_available()

        for _ in range(warmup):
            func(inputs[0])
        if use_cuda:
            # Drain queued warmup kernels so they are not billed to the
            # first timed call.
            torch.cuda.synchronize()

        latencies = []
        for i in range(iterations):
            input_data = inputs[i % len(inputs)]
            start = time.perf_counter()
            func(input_data)
            if use_cuda:
                # GPU work is asynchronous; wait for it before stopping the clock.
                torch.cuda.synchronize()
            latencies.append((time.perf_counter() - start) * 1000)

        name = getattr(func, "__name__", None) or repr(func)
        return self._compute_stats(name, latencies)

    def _compute_stats(self, name: str, latencies: list[float]) -> BenchmarkResult:
        """Reduce a list of latencies (ms) to a ``BenchmarkResult``.

        Percentiles are read from the sorted samples at index
        ``int(count * fraction)``. With a single sample the standard deviation
        is reported as 0.0 instead of raising.

        Raises:
            statistics.StatisticsError: If ``latencies`` is empty.
        """
        if not latencies:
            raise statistics.StatisticsError(
                "cannot compute statistics without latency samples"
            )

        ordered = sorted(latencies)
        count = len(ordered)
        mean = statistics.mean(latencies)
        # statistics.stdev needs at least two data points.
        std = statistics.stdev(latencies) if count > 1 else 0.0

        return BenchmarkResult(
            name=name,
            mean_ms=mean,
            std_ms=std,
            p50_ms=ordered[count // 2],
            p95_ms=ordered[int(count * 0.95)],
            p99_ms=ordered[int(count * 0.99)],
            # A zero mean can only come from a timer too coarse to resolve
            # the call; report unbounded throughput rather than crash.
            throughput=1000 / mean if mean > 0 else float("inf"),
        )

    def benchmark_memory(self, func: Callable, input_data) -> dict:
        """Run ``func(input_data)`` once and report memory usage.

        Returns:
            A dict with ``system_memory_delta_mb`` (change in
            ``psutil.virtual_memory().used`` across the call, in MiB) and
            ``peak_gpu_memory_mb`` (``torch.cuda.max_memory_allocated()`` in
            MiB, or 0 without CUDA).

        NOTE(review): ``psutil.virtual_memory()`` is a system-wide figure, so
        activity in other processes can shift the delta; treat it as a rough
        indicator.
        """
        use_cuda = torch.cuda.is_available()
        if use_cuda:
            # Start from a clean slate so the peak reflects this call only.
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.empty_cache()

        initial_memory = psutil.virtual_memory().used

        func(input_data)

        if use_cuda:
            peak_gpu_memory = torch.cuda.max_memory_allocated() / 1024**2
        else:
            peak_gpu_memory = 0

        final_memory = psutil.virtual_memory().used

        return {
            "system_memory_delta_mb": (final_memory - initial_memory) / 1024**2,
            "peak_gpu_memory_mb": peak_gpu_memory
        }

ASR Benchmark

class ASRBenchmark:
    """Latency and word-error-rate benchmark for a speech recognition model.

    ``model`` must expose ``transcribe(audio)``.
    NOTE(review): the result of ``transcribe`` is passed to ``str.lower`` in
    ``_compute_wer``, so it is assumed to be a plain string — confirm against
    the model in use.
    """

    def __init__(self, model):
        self.model = model
        self.test_audio = self._load_test_set()

    def _load_test_set(self) -> list[dict]:
        """Return the test items: audio file path plus expected transcript."""
        return [
            {"path": "test/audio/clean_1.wav", "expected": "hello world"},
            {"path": "test/audio/clean_2.wav", "expected": "how are you today"},
            # ... more test files
        ]

    def benchmark(self, iterations: int = 50) -> dict:
        """Transcribe every test file ``iterations`` times and summarise.

        Args:
            iterations: Timed transcriptions per file (must be >= 1). The
                per-file ``time_ms`` is the mean over these runs.

        Returns:
            A dict with ``avg_latency_ms``, ``avg_wer``, ``max_latency_ms``
            and ``detail`` (one entry per file holding the file path, mean
            time, the transcription from the final run, the expected text
            and the WER).

        Raises:
            ValueError: If ``iterations`` is below 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        use_cuda = torch.cuda.is_available()
        results = []

        for sample in self.test_audio:
            # load_audio is expected to be provided elsewhere in the project.
            audio = load_audio(sample["path"])

            timings = []
            transcription = None
            for _ in range(iterations):
                start = time.perf_counter()
                transcription = self.model.transcribe(audio)
                if use_cuda:
                    # Wait for asynchronous GPU work before stopping the clock.
                    torch.cuda.synchronize()
                timings.append((time.perf_counter() - start) * 1000)

            results.append({
                "file": sample["path"],
                "time_ms": statistics.mean(timings),
                "transcription": transcription,
                "expected": sample["expected"],
                "wer": self._compute_wer(transcription, sample["expected"])
            })

        avg_latency = statistics.mean(r["time_ms"] for r in results)
        avg_wer = statistics.mean(r["wer"] for r in results)

        return {
            "avg_latency_ms": avg_latency,
            "avg_wer": avg_wer,  # Word Error Rate
            "max_latency_ms": max(r["time_ms"] for r in results),
            "detail": results
        }

    def _compute_wer(self, hypothesis: str, reference: str) -> float:
        """Return the word error rate of ``hypothesis`` against ``reference``.

        Both strings are lower-cased and split on whitespace; the word-level
        Levenshtein distance is divided by the number of reference words.
        With an empty reference the ratio is undefined, so this returns 0.0
        when the hypothesis is empty too and 1.0 otherwise.
        """
        hyp_tokens = hypothesis.lower().split()
        ref_tokens = reference.lower().split()

        if not ref_tokens:
            return 0.0 if not hyp_tokens else 1.0

        # Rolling two-row edit-distance table: ``previous`` is the row for the
        # hypothesis prefix one word shorter than the current one.
        previous = list(range(len(ref_tokens) + 1))
        for i, hyp_word in enumerate(hyp_tokens, start=1):
            current = [i]
            for j, ref_word in enumerate(ref_tokens, start=1):
                if hyp_word == ref_word:
                    cost = previous[j - 1]
                else:
                    cost = 1 + min(previous[j], current[j - 1], previous[j - 1])
                current.append(cost)
            previous = current

        return previous[-1] / len(ref_tokens)

TTS Benchmark

class TTSBenchmark:
    """Latency and real-time-factor benchmark for a speech synthesis model.

    ``model`` must expose ``synthesize(text)``.
    NOTE(review): the returned audio is measured with ``len()`` and divided
    by ``sample_rate``, so it is assumed to be a 1-D sequence of samples at
    that rate — confirm against the model in use.
    """

    def __init__(self, model, sample_rate: int = 16000):
        """Store the model and the sample rate (Hz) of its output audio.

        Raises:
            ValueError: If ``sample_rate`` is not positive.
        """
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive")
        self.model = model
        self.sample_rate = sample_rate
        self.test_texts = [
            "Short sentence.",
            "This is a medium length sentence with multiple words.",
            "A much longer sentence that contains many more words and should take longer to synthesize, especially when using more complex models with attention mechanisms.",
            " ".join(["word"] * 100)  # Very long text
        ]

    def benchmark(self, iterations: int = 20) -> dict:
        """Synthesize each test text ``iterations`` times and summarise.

        Per text the result holds ``text_length``, ``word_count``,
        ``audio_duration_s``, ``latency_ms`` (mean synthesis time), ``rtf``
        and ``realtime_speedup``. ``rtf`` is the real-time factor: synthesis
        time divided by audio duration, so values below 1 mean faster than
        real time. ``realtime_speedup`` is its inverse.

        Returns:
            A dict with ``results`` (one entry per text), ``avg_rtf`` and
            ``avg_latency_ms``.

        Raises:
            ValueError: If ``iterations`` is below 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        use_cuda = torch.cuda.is_available()
        unbounded = float("inf")
        results = []

        for text in self.test_texts:
            latencies = []
            audio = None

            for _ in range(iterations):
                start = time.perf_counter()
                audio = self.model.synthesize(text)
                if use_cuda:
                    # Wait for asynchronous GPU work before stopping the clock.
                    torch.cuda.synchronize()
                latencies.append((time.perf_counter() - start) * 1000)

            mean_latency_ms = statistics.mean(latencies)
            synthesis_s = mean_latency_ms / 1000
            audio_duration_s = len(audio) / self.sample_rate

            # Guard both ratios so empty audio or an unresolvable timing does
            # not abort the whole benchmark.
            rtf = synthesis_s / audio_duration_s if audio_duration_s > 0 else unbounded
            speedup = audio_duration_s / synthesis_s if synthesis_s > 0 else unbounded

            results.append({
                "text_length": len(text),
                "word_count": len(text.split()),
                "audio_duration_s": audio_duration_s,
                "rtf": rtf,
                "realtime_speedup": speedup,
                "latency_ms": mean_latency_ms
            })

        return {
            "results": results,
            "avg_rtf": statistics.mean(r["rtf"] for r in results),
            "avg_latency_ms": statistics.mean(r["latency_ms"] for r in results)
        }

Full Pipeline Benchmark

class FullPipelineBenchmark:
    """End-to-end latency benchmark for an asynchronous voice pipeline.

    ``pipeline`` must expose an awaitable ``process(audio)`` whose result is
    a mapping containing ``asr_time_ms``, ``llm_time_ms`` and ``tts_time_ms``.
    """

    def __init__(self, pipeline):
        self.pipeline = pipeline
        self.test_cases = self._load_test_cases()

    def _load_test_cases(self) -> list[dict]:
        """Return the test cases: audio file path plus a category label."""
        return [
            {"audio": "test/audio/question.wav", "type": "question"},
            {"audio": "test/audio/statement.wav", "type": "statement"},
            {"audio": "test/audio/complex.wav", "type": "complex"}
        ]

    async def benchmark(self, iterations: int = 30) -> dict:
        """Run every test case ``iterations`` times through the pipeline.

        This is a coroutine because ``pipeline.process`` is awaited; call it
        with ``await bench.benchmark()`` or ``asyncio.run(bench.benchmark())``.

        Args:
            iterations: Timed pipeline runs per test case (must be >= 1).

        Returns:
            A dict with ``average_total_ms`` (mean of the per-type mean
            totals), ``by_type`` (per-type mean total/asr/llm/tts latencies)
            and ``p95_total_ms`` (the sample at index ``int(n * 0.95)`` of all
            individual total latencies, sorted).

        Raises:
            ValueError: If ``iterations`` is below 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        use_cuda = torch.cuda.is_available()
        results = []
        all_totals = []

        for test_case in self.test_cases:
            # load_audio is expected to be provided elsewhere in the project.
            audio = load_audio(test_case["audio"])
            latencies = {"total": [], "asr": [], "llm": [], "tts": []}

            for _ in range(iterations):
                start = time.perf_counter()
                result = await self.pipeline.process(audio)
                if use_cuda:
                    # Wait for asynchronous GPU work before stopping the clock.
                    torch.cuda.synchronize()
                total_time = (time.perf_counter() - start) * 1000

                latencies["total"].append(total_time)
                latencies["asr"].append(result["asr_time_ms"])
                latencies["llm"].append(result["llm_time_ms"])
                latencies["tts"].append(result["tts_time_ms"])

            all_totals.extend(latencies["total"])
            results.append({
                "type": test_case["type"],
                "total": statistics.mean(latencies["total"]),
                "asr": statistics.mean(latencies["asr"]),
                "llm": statistics.mean(latencies["llm"]),
                "tts": statistics.mean(latencies["tts"])
            })

        # The percentile is taken over individual runs; a percentile of the
        # handful of per-type means would say nothing about tail latency.
        all_totals.sort()

        return {
            "average_total_ms": statistics.mean(r["total"] for r in results),
            "by_type": results,
            "p95_total_ms": all_totals[int(len(all_totals) * 0.95)]
        }

Benchmark Report

def generate_benchmark_report(benchmarks: dict) -> str:
    """Render benchmark results as a Markdown report.

    Args:
        benchmarks: Mapping of section title to a result object exposing
            ``mean_ms``, ``std_ms``, ``p95_ms`` and ``throughput`` attributes
            (for example a ``BenchmarkResult``).

    Returns:
        The report as one newline-joined string: a header with the current
        local timestamp, platform, Python and PyTorch versions; GPU name and
        CUDA version when CUDA is available; then one section per benchmark.
    """
    report = [
        "# Voice Pipeline Benchmark Report",
        "",
        f"Date: {datetime.now().isoformat()}",
        f"System: {platform.platform()}",
        f"Python: {platform.python_version()}",
        f"PyTorch: {torch.__version__}",
        "",
    ]

    if torch.cuda.is_available():
        report += [
            f"GPU: {torch.cuda.get_device_name(0)}",
            f"CUDA: {torch.version.cuda}",
            "",
        ]

    for name, result in benchmarks.items():
        report += [
            f"## {name}",
            f"- Mean: {result.mean_ms:.2f}ms",
            f"- Std: {result.std_ms:.2f}ms",
            f"- P95: {result.p95_ms:.2f}ms",
            f"- Throughput: {result.throughput:.2f} req/s",
            "",
        ]

    return "\n".join(report)