14. Noise Reduction
Voice pipelines must extract clean speech from noisy environments—background music, HVAC hum, competing speakers. Real-time noise reduction requires spectral filtering and deep learning approaches.
Spectral Subtraction
The classic frequency-domain approach estimates noise during silence segments and subtracts its magnitude from subsequent frames.
import numpy as np
class SpectralSubtraction:
    """Power-spectral-subtraction noise reducer for 1-D audio.

    Audio is cut into Hann-windowed frames with 50% overlap. A noise power
    spectrum is subtracted from each frame's power spectrum (floored at
    zero), and the frames are resynthesised with the original phase by
    overlap-add.

    Attributes:
        frame_size: Samples per analysis frame; the hop is ``frame_size // 2``.
        noise_est_frames: Number of leading frames used for the automatic
            noise estimate on the first ``process`` call.
        noise_spectrum: Mean noise power per frequency bin, or ``None`` until
            a noise estimate has been made.
        frame_count: Initialised to 0; no method of this class updates it.
    """

    def __init__(self, frame_size: int = 512, noise_est_frames: int = 10):
        """Store the framing parameters.

        Raises:
            ValueError: If ``frame_size`` is below 2 (the hop would be 0) or
                ``noise_est_frames`` is below 1.
        """
        if frame_size < 2:
            raise ValueError("frame_size must be at least 2")
        if noise_est_frames < 1:
            raise ValueError("noise_est_frames must be at least 1")
        self.frame_size = frame_size
        self.noise_est_frames = noise_est_frames
        self.noise_spectrum = None
        self.frame_count = 0

    def estimate_noise(self, frames: np.ndarray):
        """Set ``noise_spectrum`` to the mean power spectrum of ``frames``.

        Args:
            frames: 2-D array of windowed frames, one frame per row.

        Raises:
            ValueError: If ``frames`` is not 2-D or contains no samples.
        """
        frames = np.asarray(frames)
        if frames.ndim != 2 or frames.size == 0:
            raise ValueError("estimate_noise needs at least one non-empty frame")
        power = np.abs(np.fft.rfft(frames, axis=1)) ** 2
        self.noise_spectrum = np.mean(power, axis=0)

    def process(self, audio: np.ndarray) -> np.ndarray:
        """Return ``audio`` with the noise power spectrum subtracted.

        If no noise estimate exists yet, it is taken from the first
        ``noise_est_frames`` frames of this call.

        Args:
            audio: 1-D array of samples.

        Returns:
            The overlap-added, denoised signal. Audio shorter than one frame
            cannot be analysed and is returned unchanged as a float copy.
        """
        frames = self.frame_audio(audio)
        if frames.shape[0] == 0:
            return np.array(audio, dtype=float)
        if self.noise_spectrum is None:
            self.estimate_noise(frames[: self.noise_est_frames])

        # One FFT supplies both the power (for subtraction) and the phase.
        spectrum = np.fft.rfft(frames, axis=1)
        power = np.abs(spectrum) ** 2
        cleaned_magnitude = np.sqrt(np.maximum(power - self.noise_spectrum, 0.0))
        cleaned = cleaned_magnitude * np.exp(1j * np.angle(spectrum))

        # Pass n explicitly: irfft otherwise assumes an even length and would
        # return frames one sample short for an odd frame_size.
        result = np.fft.irfft(cleaned, n=self.frame_size, axis=1)
        return self.overlap_add(result)

    def frame_audio(self, audio: np.ndarray) -> np.ndarray:
        """Split ``audio`` into Hann-windowed frames with 50% overlap.

        Returns:
            Array of shape ``(n_frames, frame_size)``. ``n_frames`` is 0 when
            the audio is shorter than one frame.
        """
        audio = np.asarray(audio)
        hop = self.frame_size // 2
        # "+ 1" keeps the last frame that ends exactly at the end of the audio.
        starts = np.arange(0, len(audio) - self.frame_size + 1, hop)
        if starts.size == 0:
            return np.empty((0, self.frame_size))
        sample_index = starts[:, None] + np.arange(self.frame_size)
        return audio[sample_index] * np.hanning(self.frame_size)

    def overlap_add(self, frames: np.ndarray) -> np.ndarray:
        """Sum ``frames`` back into one signal at a hop of ``frame_size // 2``.

        Returns:
            Array of length ``(n_frames - 1) * hop + frame_size``, or an empty
            array when there are no frames.
        """
        hop = self.frame_size // 2
        n_frames = frames.shape[0]
        if n_frames == 0:
            return np.zeros(0)
        output = np.zeros((n_frames - 1) * hop + self.frame_size)
        for i, frame in enumerate(frames):
            output[i * hop:i * hop + self.frame_size] += frame
        return output
Deep Noise Suppression (DNS) Model
Mozilla's DNS model provides real-time noise suppression with minimal latency.
import torch
import torch.nn as nn
class DNSModel(nn.Module):
    """Conv -> BiLSTM -> conv network that masks a 161-bin spectrogram.

    The first convolution lifts the 161 input channels to 256 features, a
    bidirectional LSTM (2 x 128 hidden units) runs along the last input axis,
    and the second convolution maps back to 161 channels. A sigmoid turns
    that output into a mask in (0, 1).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(
            in_channels=161, out_channels=256, kernel_size=5, padding=2
        )
        self.lstm = nn.LSTM(
            input_size=256, hidden_size=128, batch_first=True, bidirectional=True
        )
        self.conv2 = nn.Conv1d(
            in_channels=256, out_channels=161, kernel_size=5, padding=2
        )
        self.relu = nn.ReLU()

    def forward(self, spectrogram):
        """Return ``spectrogram`` multiplied element-wise by the predicted mask.

        Args:
            spectrogram: Tensor with 161 channels on dim 1; dims 1 and 2 are
                swapped around the LSTM, so a 3-D input is expected.

        Returns:
            Tensor of the same shape as ``spectrogram``.
        """
        features = self.conv1(spectrogram)
        features = self.relu(features)

        # The LSTM is batch_first, so move the channel axis last for it and
        # restore the convolution layout afterwards.
        sequence = torch.transpose(features, 1, 2)
        recurrent_out = self.lstm(sequence)[0]
        conv_ready = torch.transpose(recurrent_out, 1, 2)

        gain = torch.sigmoid(self.conv2(conv_ready))
        return spectrogram * gain
class RealTimeNoiseSuppressor:
    """Run a mask-based suppression model over 1-D audio.

    The audio is analysed with a 320-point Hann-windowed STFT at a hop of 160
    samples, giving the 161 frequency bins the model's first convolution
    takes as input channels. The model's output magnitude is recombined with
    the noisy phase and resynthesised by overlap-add.
    """

    # 320 // 2 + 1 == 161 bins, matching the model's input channel count.
    N_FFT = 320
    HOP = 160

    def __init__(self, model_path: str, device: str = "cuda"):
        """Build the model on ``device`` and load weights from ``model_path``.

        Args:
            model_path: Checkpoint file holding the state dict under the
                ``"model"`` key.
            device: Torch device string.
        """
        self.device = torch.device(device)
        self.model = DNSModel().to(self.device)
        # NOTE(security): torch.load unpickles the file, so only load
        # checkpoints from a trusted source.
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint["model"])
        self.model.eval()

    @torch.no_grad()
    def suppress(self, audio: np.ndarray) -> np.ndarray:
        """Return a noise-suppressed version of ``audio``.

        Args:
            audio: 1-D array of samples.

        Returns:
            The resynthesised signal of length
            ``(n_frames - 1) * HOP + N_FFT``. Audio shorter than one frame
            cannot be analysed and is returned unchanged as a float copy.
        """
        stft = self._stft(audio)
        if stft.shape[0] == 0:
            return np.array(audio, dtype=float)
        magnitude = np.abs(stft)
        phase = np.angle(stft)

        # The model takes (batch, bins, frames); the STFT is (frames, bins).
        model_input = (
            torch.from_numpy(np.ascontiguousarray(magnitude.T))
            .float()
            .unsqueeze(0)
            .to(self.device)
        )
        # The model output is used directly as the cleaned magnitude; it is
        # not multiplied by the input spectrogram again here.
        cleaned = self.model(model_input).squeeze(0).transpose(0, 1)
        return self.istft(cleaned.cpu().numpy(), phase)

    def compute_spectrogram(self, audio: np.ndarray) -> np.ndarray:
        """Return the magnitude spectrogram, shape ``(n_frames, 161)``."""
        return np.abs(self._stft(audio))

    def istft(self, spectrogram: np.ndarray, phase: np.ndarray = None) -> np.ndarray:
        """Resynthesise a signal from a magnitude spectrogram by overlap-add.

        Args:
            spectrogram: Magnitudes of shape ``(n_frames, n_bins)``.
            phase: Optional phase in radians with the same shape. When
                omitted, zero phase is used.

        Returns:
            1-D array of length ``(n_frames - 1) * HOP + N_FFT``, or an empty
            array when there are no frames.
        """
        spectrogram = np.asarray(spectrogram)
        if spectrogram.ndim != 2 or spectrogram.shape[0] == 0:
            return np.zeros(0)
        if phase is not None:
            spectrogram = spectrogram * np.exp(1j * np.asarray(phase))
        frames = np.fft.irfft(spectrogram, n=self.N_FFT, axis=1)

        output = np.zeros((frames.shape[0] - 1) * self.HOP + self.N_FFT)
        for i, frame in enumerate(frames):
            output[i * self.HOP:i * self.HOP + self.N_FFT] += frame
        return output

    def _stft(self, audio: np.ndarray) -> np.ndarray:
        """Return the complex STFT, shape ``(n_frames, N_FFT // 2 + 1)``."""
        audio = np.asarray(audio)
        # "+ 1" keeps the last frame that ends exactly at the end of the audio.
        starts = np.arange(0, len(audio) - self.N_FFT + 1, self.HOP)
        if starts.size == 0:
            return np.empty((0, self.N_FFT // 2 + 1), dtype=complex)
        sample_index = starts[:, None] + np.arange(self.N_FFT)
        frames = audio[sample_index] * np.hanning(self.N_FFT)
        return np.fft.rfft(frames, axis=1)
Voice Activity Detection (VAD)
Before noise reduction, VAD determines whether frames contain speech requiring processing.
import silero_vad
class SileroVAD:
    """Speech/non-speech classifier backed by the Silero VAD model."""

    def __init__(self, threshold: float = 0.5):
        """Load the model and store the decision threshold.

        Args:
            threshold: A window counts as speech when its probability is
                strictly greater than this value.
        """
        # The silero_vad package names its loader load_silero_vad.
        self.model = silero_vad.load_silero_vad()
        self.threshold = threshold

    def is_speech(self, audio: np.ndarray, sample_rate: int = 16000) -> bool:
        """Return True if any window of ``audio`` is classified as speech.

        The audio is split into fixed-size windows, with the last one
        zero-padded. Every window is passed to the model, and the highest
        probability is compared with ``threshold``.

        Args:
            audio: 1-D array of samples.
            sample_rate: 8000, or a positive multiple of 16000.

        Returns:
            False for empty audio; otherwise whether the highest window
            probability exceeds ``threshold``.

        Raises:
            ValueError: If ``sample_rate`` is not 8000 or a multiple of 16000.
        """
        # NOTE(review): window sizes follow the silero-vad docs (512 samples
        # at 16 kHz, 256 at 8 kHz; multiples of 16 kHz are decimated by the
        # model wrapper) -- confirm against the installed version.
        if sample_rate == 8000:
            window = 256
        elif sample_rate > 0 and sample_rate % 16000 == 0:
            window = 512 * (sample_rate // 16000)
        else:
            raise ValueError(f"unsupported sample rate: {sample_rate}")

        audio_tensor = torch.from_numpy(np.ascontiguousarray(audio)).float()
        probabilities = []
        for start in range(0, audio_tensor.shape[-1], window):
            chunk = audio_tensor[start:start + window]
            missing = window - chunk.shape[-1]
            if missing:
                chunk = torch.nn.functional.pad(chunk, (0, missing))
            # Every window is evaluated (no early exit) so the model sees the
            # full audio in order.
            probabilities.append(self.model(chunk, sample_rate).item())

        if not probabilities:
            return False
        return max(probabilities) > self.threshold

    async def stream_detect(self, audio_chunks: list[np.ndarray]) -> list[bool]:
        """Classify each chunk in order and return one flag per chunk.

        This coroutine contains no ``await``: each chunk is classified
        synchronously via ``is_speech`` at the default sample rate.
        """
        return [self.is_speech(chunk) for chunk in audio_chunks]
Pipeline Integration
class NoiseReductionPipeline:
    """Gate audio with VAD and denoise speech with spectral subtraction.

    Non-speech chunks are muted and, when noise reduction is enabled, used
    to refresh the noise profile. Speech chunks are denoised against that
    profile.
    """

    def __init__(self, vad_threshold: float = 0.5, noise_reduce: bool = True):
        """Create the VAD and spectral-subtraction stages.

        Args:
            vad_threshold: Threshold passed to the VAD.
            noise_reduce: When False, speech chunks are returned untouched
                and no noise profiling is done.
        """
        self.vad = SileroVAD(threshold=vad_threshold)
        self.spectral_sub = SpectralSubtraction()
        self.noise_reduce = noise_reduce

    def process(self, audio: np.ndarray) -> np.ndarray:
        """Return the processed chunk, always ``len(audio)`` samples long.

        Args:
            audio: 1-D array of samples for one chunk.

        Returns:
            Zeros for a non-speech chunk; the input itself for speech when
            ``noise_reduce`` is False; otherwise the denoised chunk,
            zero-padded or trimmed to the input length.
        """
        if not self.vad.is_speech(audio):
            if self.noise_reduce:
                self._update_noise_profile(audio)
            return np.zeros_like(audio)
        if not self.noise_reduce:
            return audio
        cleaned = self.spectral_sub.process(audio)
        return self._match_length(cleaned, len(audio))

    def _update_noise_profile(self, audio: np.ndarray) -> None:
        """Re-estimate the noise spectrum from a non-speech chunk."""
        frames = self.spectral_sub.frame_audio(audio)
        # Skip chunks too short to yield a frame rather than estimating
        # from nothing.
        if frames.ndim == 2 and frames.size > 0:
            self.spectral_sub.estimate_noise(frames)

    @staticmethod
    def _match_length(signal: np.ndarray, length: int) -> np.ndarray:
        """Trim or zero-pad ``signal`` at the end to exactly ``length``."""
        if len(signal) >= length:
            return signal[:length]
        return np.pad(signal, (0, length - len(signal)))
Processing every frame adds ~5-10ms latency. Batch processing multiple frames reduces per-frame overhead at the cost of slightly higher latency.
Implement a pipeline that uses Silero VAD to detect speech segments, applies spectral subtraction to non-speech portions for noise profiling, and outputs only segments classified as speech with >300ms duration. Time: 15 minutes.