Noise Reduction — Voice AI with Local Models (Chapter 14)

Voice pipelines must extract clean speech from noisy environments—background music, HVAC hum, competing speakers. Real-time noise reduction typically combines classical spectral filtering with deep-learning approaches.

Spectral Subtraction

The classic frequency-domain approach estimates the noise power spectrum during silence segments and subtracts it from the power spectrum of subsequent frames, reusing the noisy phase to reconstruct the signal.

import numpy as np

class SpectralSubtraction:
    """Power spectral subtraction over half-overlapping Hann-windowed frames.

    A noise power spectrum is averaged from a set of frames and subtracted
    from the power spectrum of every processed frame; the noisy phase is
    reused when the frames are transformed back and overlap-added.

    Args:
        frame_size: Number of samples per analysis frame. The hop between
            frames is ``frame_size // 2``.
        noise_est_frames: How many leading frames of the first processed
            buffer are averaged when no noise estimate exists yet.
    """

    def __init__(self, frame_size: int = 512, noise_est_frames: int = 10):
        self.frame_size = frame_size
        self.noise_est_frames = noise_est_frames
        # Mean noise power per frequency bin; None until first estimated.
        self.noise_spectrum = None
        self.frame_count = 0

    def estimate_noise(self, frames: np.ndarray):
        """Store the mean power spectrum of ``frames`` as the noise estimate.

        Args:
            frames: 2-D array of windowed frames, one frame per row (the
                layout produced by :meth:`frame_audio`). Expected to hold
                at least one frame.
        """
        power = np.abs(np.fft.rfft(frames, axis=1)) ** 2
        self.noise_spectrum = np.mean(power, axis=0)

    def process(self, audio: np.ndarray) -> np.ndarray:
        """Return ``audio`` with the estimated noise power subtracted.

        If no noise estimate exists yet, it is taken from the first
        ``noise_est_frames`` frames of this buffer.

        Args:
            audio: 1-D sample array.

        Returns:
            1-D array of length ``(n_frames - 1) * hop + frame_size``, where
            ``n_frames`` is the number of whole frames in ``audio``. Samples
            past the last whole frame are not emitted. An empty array is
            returned when ``audio`` is shorter than one frame.
        """
        frames = self.frame_audio(audio)
        if frames.shape[0] == 0:
            # Nothing to transform; avoids a zero-length FFT error.
            return np.zeros(0)

        if self.noise_spectrum is None:
            self.estimate_noise(frames[:self.noise_est_frames])

        # One FFT provides both magnitude and phase.
        spectrum = np.fft.rfft(frames, axis=1)
        magnitude = np.abs(spectrum)

        # Subtract in the power domain and floor at zero so the square root
        # stays real.
        subtracted = np.sqrt(np.maximum(magnitude ** 2 - self.noise_spectrum, 0))
        cleaned = subtracted * np.exp(1j * np.angle(spectrum))

        # n= keeps the frame length correct for odd frame sizes as well.
        result = np.fft.irfft(cleaned, n=self.frame_size, axis=1)
        return self.overlap_add(result)

    def frame_audio(self, audio: np.ndarray) -> np.ndarray:
        """Slice ``audio`` into half-overlapping, Hann-windowed frames.

        Args:
            audio: 1-D sample array.

        Returns:
            Array of shape ``(n_frames, frame_size)``; ``n_frames`` is 0 when
            ``audio`` holds fewer than ``frame_size`` samples.
        """
        audio = np.asarray(audio)
        hop = self.frame_size // 2
        window = np.hanning(self.frame_size)  # built once, not per frame
        # "+ 1" keeps the final frame that ends exactly at the last sample.
        starts = range(0, len(audio) - self.frame_size + 1, hop)
        frames = [audio[start:start + self.frame_size] * window for start in starts]
        if not frames:
            return np.empty((0, self.frame_size))
        return np.array(frames)

    def overlap_add(self, frames: np.ndarray) -> np.ndarray:
        """Sum frames back into one signal at a hop of ``frame_size // 2``.

        Args:
            frames: Array of shape ``(n_frames, frame_size)``.

        Returns:
            1-D array of length ``(n_frames - 1) * hop + frame_size``, or an
            empty array when there are no frames.
        """
        n_frames = frames.shape[0]
        if n_frames == 0:
            return np.zeros(0)
        hop = self.frame_size // 2
        output = np.zeros((n_frames - 1) * hop + self.frame_size)
        for index, frame in enumerate(frames):
            start = index * hop
            output[start:start + self.frame_size] += frame
        return output

Deep Noise Suppression (DNS) Model

A Deep Noise Suppression (DNS) model, in the style of the baselines from Microsoft's DNS Challenge, predicts a spectral mask to suppress noise with low latency.

import torch
import torch.nn as nn

class DNSModel(nn.Module):
    """Convolution → bidirectional LSTM → convolution spectral mask network.

    The input has 161 channels (frequency bins) along dimension 1 and a
    sequence (time) axis along dimension 2. The network predicts a sigmoid
    mask of the same shape and returns the input multiplied by that mask.
    Because the LSTM is bidirectional, each output frame depends on the
    whole input sequence.

    The layer attribute names are the keys of the model's state dict.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(
            in_channels=161, out_channels=256, kernel_size=5, padding=2
        )
        # 128 hidden units per direction -> 256 features out of the LSTM.
        self.lstm = nn.LSTM(
            input_size=256, hidden_size=128, batch_first=True, bidirectional=True
        )
        self.conv2 = nn.Conv1d(
            in_channels=256, out_channels=161, kernel_size=5, padding=2
        )
        self.relu = nn.ReLU()

    def forward(self, spectrogram):
        """Return ``spectrogram`` scaled element-wise by the predicted mask."""
        features = self.conv1(spectrogram)
        features = self.relu(features)
        # The LSTM is batch_first, so move the sequence axis ahead of channels.
        sequence, _state = self.lstm(torch.transpose(features, 1, 2))
        # Back to channels-first for the output convolution.
        logits = self.conv2(torch.transpose(sequence, 1, 2))
        return spectrogram * logits.sigmoid()

class RealTimeNoiseSuppressor:
    """Run a ``DNSModel`` checkpoint over an audio buffer and resynthesise it.

    The audio is cut into half-overlapping Hann-windowed frames, the model
    produces a masked magnitude spectrogram, and the result is combined with
    the noisy phase and overlap-added back into a waveform.

    Args:
        model_path: Path to a checkpoint whose ``"model"`` entry is a
            ``DNSModel`` state dict.
        device: Torch device string. A CUDA device falls back to the CPU when
            CUDA is not available.
    """

    # DNSModel's first convolution takes 161 input channels, which is the
    # one-sided spectrum size of a 320-sample frame (320 // 2 + 1 == 161).
    N_FFT = 320
    HOP = N_FFT // 2

    def __init__(self, model_path: str, device: str = "cuda"):
        # Keep the default usable on machines without a GPU.
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model = DNSModel().to(self.device)
        # NOTE(security): torch.load unpickles the file. Only load checkpoints
        # from a trusted source, or pass weights_only=True where the installed
        # PyTorch supports it.
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint["model"])
        self.model.eval()

    @torch.no_grad()
    def suppress(self, audio: np.ndarray) -> np.ndarray:
        """Return a noise-suppressed version of ``audio``.

        Args:
            audio: 1-D sample array.

        Returns:
            1-D array of length ``(n_frames - 1) * HOP + N_FFT``; samples past
            the last whole frame are not emitted. Empty when ``audio`` is
            shorter than one frame.
        """
        stft = self._stft(audio)
        if stft.shape[0] == 0:
            return np.zeros(0)

        magnitude = np.abs(stft)
        # The model's Conv1d layers want (batch, bins, frames).
        model_input = (
            torch.from_numpy(np.ascontiguousarray(magnitude.T))
            .float()
            .unsqueeze(0)
            .to(self.device)
        )
        # DNSModel.forward already multiplies its input by the mask, so its
        # output is the cleaned magnitude; it must not be multiplied by the
        # input spectrogram a second time.
        cleaned_magnitude = self.model(model_input)[0].cpu().numpy().T

        # Reuse the noisy phase; the model only estimates magnitudes.
        cleaned = cleaned_magnitude * np.exp(1j * np.angle(stft))
        return self.istft(cleaned)

    def compute_spectrogram(self, audio: np.ndarray) -> np.ndarray:
        """Return the magnitude spectrogram of ``audio``.

        Returns:
            Array of shape ``(n_frames, N_FFT // 2 + 1)``; ``n_frames`` is 0
            when ``audio`` is shorter than ``N_FFT`` samples.
        """
        return np.abs(self._stft(audio))

    def istft(self, spectrogram: np.ndarray) -> np.ndarray:
        """Invert a one-sided spectrogram by overlap-adding its frames.

        Args:
            spectrogram: Array of shape ``(n_frames, N_FFT // 2 + 1)``. Pass
                complex values to preserve phase; a real-valued array is
                treated as having zero phase.

        Returns:
            1-D array of length ``(n_frames - 1) * HOP + N_FFT``, or an empty
            array when there are no frames.
        """
        spectrogram = np.atleast_2d(np.asarray(spectrogram))
        n_frames = spectrogram.shape[0]
        if n_frames == 0 or spectrogram.shape[1] == 0:
            return np.zeros(0)
        frames = np.fft.irfft(spectrogram, n=self.N_FFT, axis=1)
        output = np.zeros((n_frames - 1) * self.HOP + self.N_FFT)
        for index, frame in enumerate(frames):
            start = index * self.HOP
            output[start:start + self.N_FFT] += frame
        return output

    def _window(self) -> np.ndarray:
        """Return a periodic Hann window of ``N_FFT`` samples."""
        # The periodic form sums to exactly 1 at a half-frame hop, so
        # overlap-adding analysis-windowed frames rebuilds the signal.
        return np.hanning(self.N_FFT + 1)[:-1]

    def _stft(self, audio: np.ndarray) -> np.ndarray:
        """Return the complex one-sided STFT of ``audio``, one frame per row."""
        audio = np.asarray(audio)
        n_bins = self.N_FFT // 2 + 1
        if len(audio) < self.N_FFT:
            return np.zeros((0, n_bins), dtype=complex)
        # Count every whole frame, including one ending on the last sample.
        n_frames = (len(audio) - self.N_FFT) // self.HOP + 1
        starts = np.arange(n_frames) * self.HOP
        frames = audio[starts[:, None] + np.arange(self.N_FFT)] * self._window()
        return np.fft.rfft(frames, axis=1)

Voice Activity Detection (VAD)

Before noise reduction, VAD determines whether frames contain speech requiring processing.

import silero_vad

class SileroVAD:
    """Speech / non-speech decision backed by the Silero VAD model.

    Args:
        threshold: Speech probability above which audio counts as speech.
    """

    def __init__(self, threshold: float = 0.5):
        # The silero-vad package exposes its loader as ``load_silero_vad``.
        self.model = silero_vad.load_silero_vad()
        self.threshold = threshold

    def is_speech(self, audio: np.ndarray, sample_rate: int = 16000) -> bool:
        """Return True if any window of ``audio`` scores above the threshold.

        The buffer is scored window by window because the Silero model takes
        fixed-size inputs (512 samples at 16 kHz, 256 at 8 kHz — per the
        silero-vad documentation; confirm for the installed version). The
        final partial window is zero-padded.

        Args:
            audio: 1-D sample array.
            sample_rate: Sampling rate of ``audio`` in Hz.

        Returns:
            ``False`` for empty input; otherwise whether the highest window
            probability exceeds ``self.threshold``.
        """
        window = 256 if sample_rate == 8000 else 512
        samples = torch.from_numpy(np.ascontiguousarray(audio)).float()
        total = samples.shape[0]
        if total == 0:
            return False

        remainder = total % window
        if remainder:
            samples = torch.nn.functional.pad(samples, (0, window - remainder))

        # Score every window (no early exit) so the model sees the whole
        # buffer in order.
        probabilities = [
            self.model(chunk, sample_rate).item()
            for chunk in samples.split(window)
        ]
        return max(probabilities) > self.threshold

    async def stream_detect(self, audio_chunks: list[np.ndarray]) -> list[bool]:
        """Return the :meth:`is_speech` decision for each chunk, in order.

        NOTE: this coroutine never awaits; inference runs synchronously and
        occupies the event loop until every chunk has been scored.
        """
        return [self.is_speech(chunk) for chunk in audio_chunks]

Pipeline Integration

class NoiseReductionPipeline:
    """Gate audio with VAD and clean speech chunks by spectral subtraction.

    Args:
        vad_threshold: Speech probability threshold passed to ``SileroVAD``.
        noise_reduce: When False, speech chunks are returned unmodified.
    """

    def __init__(self, vad_threshold: float = 0.5, noise_reduce: bool = True):
        self.vad = SileroVAD(threshold=vad_threshold)
        self.spectral_sub = SpectralSubtraction()
        self.noise_reduce = noise_reduce

    def process(self, audio: np.ndarray) -> np.ndarray:
        """Return the processed chunk, always ``len(audio)`` samples long.

        Non-speech chunks are replaced by silence and, when noise reduction
        is enabled, used to refresh the subtractor's noise estimate. Speech
        chunks are passed through spectral subtraction when noise reduction
        is enabled, otherwise returned unchanged.

        Args:
            audio: 1-D sample array.
        """
        if not self.vad.is_speech(audio):
            if self.noise_reduce:
                # Learn the noise profile from silence; otherwise the
                # subtractor would estimate "noise" from the first speech
                # chunk it receives.
                frames = self.spectral_sub.frame_audio(audio)
                if frames.size:
                    self.spectral_sub.estimate_noise(frames)
            return np.zeros_like(audio)

        if not self.noise_reduce:
            return audio

        cleaned = self.spectral_sub.process(audio)
        return self._fit_length(cleaned, len(audio))

    @staticmethod
    def _fit_length(signal: np.ndarray, length: int) -> np.ndarray:
        """Trim or zero-pad ``signal`` to exactly ``length`` samples."""
        if len(signal) == length:
            return signal
        fitted = np.zeros(length, dtype=signal.dtype)
        keep = min(length, len(signal))
        fitted[:keep] = signal[:keep]
        return fitted

Processing every frame adds ~5-10ms latency. Batch processing multiple frames reduces per-frame overhead at the cost of slightly higher latency.