Testing Voice Pipelines — Voice AI with Local Models (Chapter 19)

Testing voice AI systems requires audio-specific fixtures, latency assertions, and quality validation beyond standard unit testing.

Audio Test Fixtures

import pytest
import numpy as np
from pathlib import Path

@pytest.fixture
def audio_samples():
    """Load the WAV fixtures shared by the audio tests.

    Returns:
        A dict mapping each sample label to the value ``load_audio`` returns
        for the matching file under ``tests/fixtures`` (paths are relative to
        the current working directory).
    """
    fixture_paths = {
        "clean_speech": "tests/fixtures/clean_speech.wav",
        "noisy_speech": "tests/fixtures/noisy_speech.wav",
        "silence": "tests/fixtures/silence.wav",
        "multi_speaker": "tests/fixtures/multi_speaker.wav",
    }
    # Dict order is preserved, so files are loaded in the order listed above.
    return {
        label: load_audio(wav_path)
        for label, wav_path in fixture_paths.items()
    }

@pytest.fixture
def mock_microphone(audio_samples):
    """Provide a stand-in microphone whose ``read()`` returns canned bytes.

    Every call to ``read`` on the returned mock yields the raw bytes of the
    ``clean_speech`` sample from ``audio_samples``.
    """
    from unittest.mock import MagicMock

    clean_bytes = audio_samples["clean_speech"].tobytes()
    microphone = MagicMock()
    microphone.read.return_value = clean_bytes
    return microphone

def load_audio(path: str) -> np.ndarray:
    """Read an audio file and return only its sample data.

    Args:
        path: Location of the audio file, passed straight to ``soundfile.read``.

    Returns:
        The first element of the ``(data, samplerate)`` pair returned by
        ``soundfile.read``; the sample rate is discarded.
    """
    import soundfile as sf

    samples, _sample_rate = sf.read(path)
    return samples

@pytest.fixture
def temp_audio_file():
    """Yield the path of a throwaway WAV file filled with random samples.

    The file holds 16000 random int16 samples in [-1000, 1000) written with a
    sample rate of 16000. It is removed once the requesting test finishes.

    Yields:
        The filesystem path (``str``) of the temporary ``.wav`` file.
    """
    # Imported locally (as the other helpers in this file do) so the fixture
    # does not depend on module-level imports that may be missing.
    import os
    import tempfile

    import soundfile as sf

    content = np.random.randint(-1000, 1000, size=16000, dtype=np.int16)

    # Only reserve a unique name here; closing the handle before writing means
    # soundfile (and the test) can reopen the path on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        path = handle.name

    try:
        sf.write(path, content, 16000)
        yield path
    finally:
        # Runs even when writing fails, so the file is never leaked.
        os.unlink(path)

Unit Tests for Components

class TestNoiseReduction:
    """Checks for ``NoiseReductionPipeline`` on the noisy and clean samples."""

    def test_reduces_noise_floor(self, audio_samples):
        """Mean absolute amplitude must fall below 80% of its input value."""
        noisy = audio_samples["noisy_speech"]
        cleaned = NoiseReductionPipeline().process(noisy)

        # Mean absolute amplitude is used as the noise-floor measure.
        level_before = np.mean(np.abs(noisy))
        level_after = np.mean(np.abs(cleaned))

        assert level_after < level_before * 0.8

    def test_preserves_speech_quality(self, audio_samples):
        """Processed clean speech must correlate with the input above 0.9."""
        reference = audio_samples["clean_speech"]
        processed = NoiseReductionPipeline().process(reference)

        # Off-diagonal entry of the 2x2 matrix is the input/output correlation.
        corr_matrix = np.corrcoef(
            reference.astype(float),
            processed.astype(float),
        )
        correlation = corr_matrix[0, 1]

        assert correlation > 0.9

class TestLanguageDetection:
    """Checks that ``LanguageDetector.detect`` returns the expected code."""

    @staticmethod
    def _require_sample(audio_samples, name):
        """Return the named sample, skipping the test when it is unavailable.

        The ``audio_samples`` fixture may not ship every language sample;
        skipping with a clear reason is more useful than a bare ``KeyError``.
        """
        if name not in audio_samples:
            pytest.skip(f"audio_samples fixture has no '{name}' sample")
        return audio_samples[name]

    @pytest.mark.asyncio
    async def test_detects_english(self, audio_samples):
        """An English sample should be reported as ``"en"``."""
        sample = self._require_sample(audio_samples, "english_speech")
        detector = LanguageDetector()
        lang = await detector.detect(sample)

        assert lang == "en"

    @pytest.mark.asyncio
    async def test_detects_spanish(self, audio_samples):
        """A Spanish sample should be reported as ``"es"``."""
        sample = self._require_sample(audio_samples, "spanish_speech")
        detector = LanguageDetector()
        lang = await detector.detect(sample)

        assert lang == "es"

class TestTTS:
    """Sanity checks on the waveform returned by ``CoquiTTS.synthesize``."""

    def test_output_shape(self):
        """Output must be float32, longer than 16000 samples, within [-1, 1]."""
        waveform = CoquiTTS().synthesize("Hello world")

        assert waveform.dtype == np.float32
        # NOTE(review): 16000 samples is one second only at a 16 kHz sample
        # rate — confirm the rate CoquiTTS actually produces.
        assert len(waveform) > 16000  # At least 1 second
        for extreme in (waveform.min(), waveform.max()):
            assert -1.0 <= extreme <= 1.0

Latency Testing

import time

class TestLatency:
    """Latency budget checks for ``VoicePipeline``."""

    @pytest.mark.asyncio
    async def test_end_to_end_latency(self, audio_samples):
        """One full ``process`` call must finish in under 500 ms wall time."""
        pipeline = VoicePipeline()
        start = time.perf_counter()

        # Only the elapsed time matters here; the result itself is not used.
        await pipeline.process(audio_samples["clean_speech"])

        latency_ms = (time.perf_counter() - start) * 1000

        assert latency_ms < 500, f"Latency {latency_ms}ms exceeds 500ms threshold"

    @pytest.mark.asyncio
    async def test_component_latencies(self, audio_samples):
        """Each pipeline stage must stay within its own millisecond budget."""
        pipeline = VoicePipeline()

        # NOTE(review): the tracker is not visibly handed to the pipeline, so
        # it is unclear how measurements get recorded — confirm the wiring.
        with LatencyTracker() as tracker:
            await pipeline.process(audio_samples["clean_speech"])

        report = tracker.get_report()

        # Budgets in milliseconds, keyed by the component name in the report.
        budgets_ms = {"asr": 200, "llm": 300, "tts": 150}
        for component, budget in budgets_ms.items():
            # Fail with a readable message instead of a KeyError when a stage
            # was never measured.
            assert component in report, f"No latency recorded for '{component}'"
            assert report[component] < budget, (
                f"{component.upper()} took {report[component]}ms"
            )

@pytest.fixture
def latency_tracker():
    """Provide a fresh, empty ``LatencyTracker`` for each requesting test."""
    tracker = LatencyTracker()
    return tracker

class LatencyTracker:
    """Record per-component wall-clock durations in milliseconds.

    Call ``start(name)`` and later ``end(name)`` around the work to measure.
    The tracker also works as a context manager (``with LatencyTracker() as
    t:``); entering returns the tracker itself and exiting does nothing
    beyond letting exceptions propagate.
    """

    def __init__(self):
        # component name -> last measured duration in milliseconds
        self.measurements = {}
        # component name -> ``time.perf_counter()`` value captured by start()
        self.start_times = {}

    def __enter__(self):
        """Return the tracker so it can be bound with ``as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the ``with`` block without suppressing any exception."""
        return False

    def start(self, component: str):
        """Mark the start time for ``component``, replacing any earlier mark."""
        self.start_times[component] = time.perf_counter()

    def end(self, component: str):
        """Store the elapsed milliseconds since ``start(component)``.

        Does nothing when ``start`` was never called for ``component``.
        """
        started_at = self.start_times.get(component)
        if started_at is None:
            return
        self.measurements[component] = (time.perf_counter() - started_at) * 1000

    def get_report(self) -> dict:
        """Return a copy of the measurements so callers cannot mutate state."""
        return self.measurements.copy()

Integration Tests

@pytest.mark.asyncio
async def test_full_pipeline(audio_samples):
    """A clean-speech run must populate every stage of the result dict."""
    outputs = await VoicePipeline().process(audio_samples["clean_speech"])

    # Transcription, generated response and synthesized audio must all exist.
    for stage in ("transcription", "response", "audio"):
        assert outputs[stage] is not None
    assert len(outputs["audio"]) > 0

@pytest.mark.asyncio
async def test_handles_noisy_input(audio_samples):
    """With noise reduction on, noisy input must still yield a transcription."""
    noise_aware = VoicePipeline(config={"noise_reduction": True})

    outputs = await noise_aware.process(audio_samples["noisy_speech"])

    # A transcription must be produced even though the input is noisy.
    transcription = outputs["transcription"]
    assert transcription is not None

@pytest.mark.asyncio
async def test_conversation_continuity():
    """A second turn on the same pipeline should refer back to the first."""
    conversation = VoicePipeline()

    # NOTE(review): the other tests pass audio arrays to ``process``; these
    # calls pass plain text — confirm the pipeline accepts both.
    await conversation.process("Hello")
    follow_up = await conversation.process("What was my previous message?")

    # The reply should mention the earlier turn, checked case-insensitively.
    reply = follow_up["response"].lower()
    assert "hello" in reply or "previous" in reply

Property-Based Testing

from hypothesis import given, strategies as st
from hypothesis.extra import numpy as hnp

class TestAudioProperties:
    """Property-based checks on ``CoquiTTS.synthesize`` output."""

    # ``hypothesis.strategies`` has no ``audio`` strategy, so the waveform is
    # generated as a 1-D float32 array of 1600-48000 samples in [-1.0, 1.0].
    @given(
        audio=hnp.arrays(
            dtype=np.float32,
            shape=st.integers(min_value=1600, max_value=48000),
            elements=st.floats(min_value=-1.0, max_value=1.0, width=32),
        )
    )
    def test_normalizes_output(self, audio):
        """Synthesized samples must stay within [-1.0, 1.0]."""
        # NOTE(review): the generated ``audio`` is never consumed by the call
        # below — confirm whether this test was meant to exercise it.
        tts = CoquiTTS()
        output = tts.synthesize("Test")

        assert -1.0 <= output.min() <= 1.0
        assert -1.0 <= output.max() <= 1.0

    @given(text=st.text(min_length=1, max_length=500))
    def test_handles_various_text_lengths(self, text):
        """Any 1-500 character text must yield a non-empty NumPy array."""
        tts = CoquiTTS()
        output = tts.synthesize(text)

        # Only non-emptiness and type are checked; output length is not
        # compared against the input length.
        assert len(output) > 0
        assert isinstance(output, np.ndarray)