19. Testing Voice Pipelines
Chapter 19 of 22 · 25 min
Testing voice AI systems requires audio-specific fixtures, latency assertions, and quality validation beyond standard unit testing.
Audio Test Fixtures
import pytest
import numpy as np
from pathlib import Path
@pytest.fixture
def audio_samples():
    """Load every WAV fixture that the tests in this chapter index by name.

    Returns:
        dict: maps a clip name to whatever ``load_audio`` returns for
        ``tests/fixtures/<name>.wav``.
    """
    clip_names = (
        "clean_speech",
        "noisy_speech",
        "silence",
        "multi_speaker",
        # Read by TestLanguageDetection; without these two entries those
        # tests fail with KeyError before the detector ever runs.
        # NOTE(review): the matching WAV files must exist under
        # tests/fixtures/ -- confirm they are checked in.
        "english_speech",
        "spanish_speech",
    )
    return {
        name: load_audio(f"tests/fixtures/{name}.wav")
        for name in clip_names
    }
@pytest.fixture
def mock_microphone(audio_samples):
    """Provide a stand-in microphone whose ``read()`` returns canned audio.

    Every call to ``read()`` returns the raw bytes of the ``"clean_speech"``
    clip from the ``audio_samples`` fixture.
    """
    from unittest.mock import MagicMock

    canned_bytes = audio_samples["clean_speech"].tobytes()
    # Dotted keyword names configure attributes of child mocks at creation.
    return MagicMock(**{"read.return_value": canned_bytes})
def load_audio(path: str) -> np.ndarray:
    """Read the audio file at *path* and return only its sample data.

    ``soundfile.read`` returns a ``(data, samplerate)`` pair; the sample rate
    is discarded here.
    """
    import soundfile as sf

    samples, _sample_rate = sf.read(path)
    return samples
@pytest.fixture
def temp_audio_file():
    """Yield the path of a throwaway WAV file filled with random samples.

    The file holds 16000 random int16 samples in [-1000, 1000) written with
    a sample rate of 16000, and is deleted once the requesting test finishes.

    Yields:
        str: filesystem path of the temporary ``.wav`` file.
    """
    # Imported locally (as load_audio does) so the fixture does not depend on
    # module-level imports that are not present in this file.
    import os
    import tempfile

    import soundfile as sf

    content = np.random.randint(-1000, 1000, size=16000, dtype=np.int16)

    # Only reserve a unique name here and close the handle straight away;
    # writing by path while the handle is still open fails on platforms that
    # lock open files.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        path = handle.name

    try:
        sf.write(path, content, 16000)
        yield path
    finally:
        # Runs even when writing fails or the generator is closed early, so
        # the file is never leaked.
        try:
            os.unlink(path)
        except FileNotFoundError:
            # The test already removed the file itself.
            pass
Unit Tests for Components
class TestNoiseReduction:
    """Checks on what NoiseReductionPipeline does to noisy and clean clips."""

    def test_reduces_noise_floor(self, audio_samples):
        """Mean absolute amplitude of the noisy clip must drop below 80%."""
        denoised = NoiseReductionPipeline().process(audio_samples["noisy_speech"])

        # Measured after processing, exactly as the check is defined: the
        # input level is read from the fixture entry once process() returned.
        level_before = np.abs(audio_samples["noisy_speech"]).mean()
        level_after = np.abs(denoised).mean()

        assert level_after < level_before * 0.8

    def test_preserves_speech_quality(self, audio_samples):
        """Processing the clean clip must leave it highly correlated (> 0.9)."""
        denoised = NoiseReductionPipeline().process(audio_samples["clean_speech"])

        reference = audio_samples["clean_speech"].astype(float)
        candidate = denoised.astype(float)

        # corrcoef returns the correlation matrix; [0, 1] is the
        # reference-vs-candidate entry.
        correlation_matrix = np.corrcoef(reference, candidate)
        assert correlation_matrix[0, 1] > 0.9
class TestLanguageDetection:
    """Checks that LanguageDetector reports the expected language code.

    Both tests read clips from the ``audio_samples`` fixture, which must
    therefore contain ``"english_speech"`` and ``"spanish_speech"`` entries.
    """

    @pytest.mark.asyncio
    async def test_detects_english(self, audio_samples):
        """The English clip must be reported as ``"en"``."""
        clip = audio_samples["english_speech"]
        detected = await LanguageDetector().detect(clip)
        assert detected == "en"

    @pytest.mark.asyncio
    async def test_detects_spanish(self, audio_samples):
        """The Spanish clip must be reported as ``"es"``."""
        clip = audio_samples["spanish_speech"]
        detected = await LanguageDetector().detect(clip)
        assert detected == "es"
class TestTTS:
    """Basic sanity checks on the array CoquiTTS produces."""

    def test_output_shape(self):
        """Output is float32, longer than 16000 samples, within [-1, 1]."""
        waveform = CoquiTTS().synthesize("Hello world")

        assert waveform.dtype == np.float32
        # More than 16000 samples; that is one second only if the engine
        # emits 16 kHz audio -- TODO confirm the output sample rate.
        assert len(waveform) > 16000

        lowest = waveform.min()
        assert -1.0 <= lowest <= 1.0
        highest = waveform.max()
        assert -1.0 <= highest <= 1.0
Latency Testing
import time
class TestLatency:
    """Wall-clock budgets for the whole pipeline and for its stages."""

    @pytest.mark.asyncio
    async def test_end_to_end_latency(self, audio_samples):
        """One full pass over the clean clip must finish in under 500 ms."""
        pipeline = VoicePipeline()

        # Pipeline construction is deliberately outside the timed region.
        started_at = time.perf_counter()
        await pipeline.process(audio_samples["clean_speech"])
        finished_at = time.perf_counter()

        latency_ms = (finished_at - started_at) * 1000
        assert latency_ms < 500, f"Latency {latency_ms}ms exceeds 500ms threshold"

    @pytest.mark.asyncio
    async def test_component_latencies(self, audio_samples):
        """Each stage must stay inside its own budget (values in ms)."""
        pipeline = VoicePipeline()

        # NOTE(review): nothing visible in this test hands the tracker to the
        # pipeline, so it is unclear how "asr"/"llm"/"tts" get recorded --
        # confirm how VoicePipeline reports per-stage timings.
        with LatencyTracker() as recorder:
            await pipeline.process(audio_samples["clean_speech"])

        report = recorder.get_report()
        assert report["asr"] < 200, f"ASR took {report['asr']}ms"
        assert report["llm"] < 300, f"LLM took {report['llm']}ms"
        assert report["tts"] < 150, f"TTS took {report['tts']}ms"
@pytest.fixture
def latency_tracker():
    """Provide a new, empty LatencyTracker to each test that requests it."""
    tracker = LatencyTracker()
    return tracker
class LatencyTracker:
    """Record how long named pipeline components take, in milliseconds.

    Call ``start(name)`` before a component runs and ``end(name)`` after it;
    ``get_report()`` then returns the elapsed time per component. The tracker
    can also be used as a context manager (``with LatencyTracker() as t:``).
    """

    def __init__(self) -> None:
        # component name -> elapsed milliseconds of its last completed span
        self.measurements: dict = {}
        # component name -> perf_counter() value captured by start()
        self.start_times: dict = {}

    def __enter__(self) -> "LatencyTracker":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> bool:
        """Leave the ``with`` block without suppressing any exception.

        Without this method the ``with`` statement cannot be entered at all,
        because the context-manager protocol needs both ``__enter__`` and
        ``__exit__``.
        """
        return False

    def start(self, component: str) -> None:
        """Mark the moment *component* begins running."""
        self.start_times[component] = time.perf_counter()

    def end(self, component: str) -> None:
        """Store the milliseconds elapsed since ``start(component)``.

        A component that was never started is ignored rather than raising.
        """
        if component in self.start_times:
            elapsed = (time.perf_counter() - self.start_times[component]) * 1000
            self.measurements[component] = elapsed

    def get_report(self) -> dict:
        """Return a copy of the measurements so callers cannot mutate them."""
        return self.measurements.copy()
Integration Tests
@pytest.mark.asyncio
async def test_full_pipeline(audio_samples):
    """A clean clip must yield a transcription, a response and audio."""
    outcome = await VoicePipeline().process(audio_samples["clean_speech"])

    # Every stage of the pipeline has to contribute its part of the result.
    for field in ("transcription", "response", "audio"):
        assert outcome[field] is not None

    assert len(outcome["audio"]) > 0
@pytest.mark.asyncio
async def test_handles_noisy_input(audio_samples):
    """With noise reduction enabled, a noisy clip is still transcribed."""
    settings = {"noise_reduction": True}
    noisy_clip = audio_samples["noisy_speech"]

    outcome = await VoicePipeline(config=settings).process(noisy_clip)

    # Only the presence of a transcription is checked, not its accuracy.
    assert outcome["transcription"] is not None
@pytest.mark.asyncio
async def test_conversation_continuity():
    """A second turn on the same pipeline should refer back to the first."""
    pipeline = VoicePipeline()

    # NOTE(review): these calls pass text, whereas the other tests pass audio
    # arrays to process() -- confirm that the pipeline accepts both.
    await pipeline.process("Hello")
    follow_up = await pipeline.process("What was my previous message?")

    reply = follow_up["response"].lower()
    # NOTE(review): "previous" also appears in the question itself, so a
    # reply that merely echoes it would pass -- consider a stricter check.
    assert any(marker in reply for marker in ("hello", "previous"))
Property-Based Testing
from hypothesis import given, strategies as st
class TestAudioProperties:
    """Property-based checks on CoquiTTS output, driven by Hypothesis.

    NOTE(review): Hypothesis applies a per-example deadline (200 ms by
    default). If synthesis is slower than that, these tests need
    ``settings(deadline=None)`` -- confirm against real synthesis times.
    """

    # Hypothesis ships no ``st.audio`` strategy, so the previous decorator
    # raised AttributeError while this class body was being executed. The
    # generated value was never used either; text is what synthesize()
    # consumes, so text is the input worth varying.
    @given(text=st.text(min_length=1, max_length=500))
    def test_normalizes_output(self, text):
        """Synthesized samples stay inside [-1.0, 1.0] for any input text."""
        tts = CoquiTTS()
        output = tts.synthesize(text)
        assert -1.0 <= output.min() <= 1.0
        assert -1.0 <= output.max() <= 1.0

    @given(text=st.text(min_length=1, max_length=500))
    def test_handles_various_text_lengths(self, text):
        """Any text of 1-500 characters yields a non-empty NumPy array."""
        tts = CoquiTTS()
        output = tts.synthesize(text)
        # Only non-emptiness and the return type are verified here; how the
        # output length scales with the text length is not checked.
        assert len(output) > 0
        assert isinstance(output, np.ndarray)
EXERCISE
Write a test suite for a TTS model that verifies: (1) output is normalized between -1 and 1, (2) synthesis latency is under 500ms, and (3) output duration scales linearly with input text length. Time: 15 minutes.