06. Audio-Visual Integration
Sound and vision provide complementary perspectives on the same events. Audio-visual models learn from the natural synchronization between what we see and hear.
Why Audio and Vision Complement Each Other
Audio captures information invisible to cameras: off-screen sounds, speaker identity, emotional tone, and the acoustic environment. Vision captures what microphones cannot: spatial layout, silent objects, and visual context. Combined, they resolve ambiguities that neither modality can resolve alone.
import librosa
import torch
class AudioVisualEncoder(torch.nn.Module):
    """Fuse a spectrogram with per-frame vision features via cross-attention.

    The audio branch is a small CNN whose output is pooled to a fixed 4x4
    grid; each of the 16 grid cells becomes one 64-channel audio token.
    Both modalities are linearly projected to ``fusion_dim`` and the vision
    tokens then attend over the audio tokens.

    Args:
        audio_dim: Accepted for backward compatibility only. The original
            code overwrote the layer that used it, so it has never affected
            the model; it is intentionally unused here.
        vision_dim: Size of the last axis of ``vision_features``.
        fusion_dim: Shared width both modalities are projected to before
            cross-attention.

    NOTE(review): ``CrossAttention`` is not defined in this block; it must be
    provided elsewhere in the module and accept ``query_dim``, ``key_dim`` and
    ``num_heads`` at construction and ``query``/``key``/``value`` keyword
    arguments when called -- confirm against its definition.
    """

    def __init__(self, audio_dim=128, vision_dim=768, fusion_dim=512):
        super().__init__()
        # Audio encoder: spectrogram-based CNN, pooled to a fixed 4x4 grid so
        # the number of audio tokens does not depend on the clip length.
        self.audio_conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, 32, kernel_size=3, stride=2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(32, 64, kernel_size=3, stride=2),
            torch.nn.ReLU(),
            torch.nn.AdaptiveAvgPool2d((4, 4)),
        )
        # forward() hands this layer one 64-channel vector per grid cell, so
        # in_features must be 64. The previous 64 * 16 only fits a fully
        # flattened (B, 1024) tensor and raised a shape error at run time.
        self.audio_proj = torch.nn.Linear(64, fusion_dim)
        # Vision encoder: frame-based features projected to the shared width.
        self.vision_proj = torch.nn.Linear(vision_dim, fusion_dim)
        # Cross-modal attention
        self.cross_attention = CrossAttention(
            query_dim=fusion_dim,
            key_dim=fusion_dim,
            num_heads=8,
        )

    def forward(self, spectrogram, vision_features):
        """Return vision tokens enriched with audio context.

        Args:
            spectrogram: ``(B, 1, Freq, Time)`` tensor. Both spatial sizes
                must be large enough to survive two unpadded 3x3 stride-2
                convolutions (at least 7).
            vision_features: ``(B, T, vision_dim)`` tensor.

        Returns:
            Whatever ``self.cross_attention`` returns for vision queries over
            audio keys/values.
        """
        audio_tokens = self.audio_conv(spectrogram)      # (B, 64, 4, 4)
        audio_tokens = audio_tokens.flatten(2)           # (B, 64, 16)
        audio_tokens = audio_tokens.permute(0, 2, 1)     # (B, 16, 64)
        audio_tokens = self.audio_proj(audio_tokens)     # (B, 16, fusion_dim)

        # Project vision into the shared space; the attention layer is built
        # with query_dim=fusion_dim, not vision_dim.
        vision_tokens = self.vision_proj(vision_features)  # (B, T, fusion_dim)

        # Cross-attend: vision queries audio
        return self.cross_attention(
            query=vision_tokens,
            key=audio_tokens,
            value=audio_tokens,
        )
Lip Reading and Speech Recognition
The mouth movements visible in video carry information about speech that overlaps with, and complements, the audio signal. Audio-visual models exploit this to make speech recognition more robust in noisy environments.
def extract_lip_region(frame, face_bbox, mouth_fraction=0.3, output_size=(88, 88)):
    """Crop the mouth area out of a detected face and resize it.

    The mouth is approximated as the bottom ``mouth_fraction`` of the face
    bounding box, spanning the box's full width.

    Args:
        frame: Image array indexed as ``frame[row, col]`` (height first).
        face_bbox: ``(x1, y1, x2, y2)`` face box in pixel coordinates.
            Float coordinates are rounded to the nearest integer.
        mouth_fraction: Share of the face height, measured up from the
            bottom edge, treated as the mouth band. Must be in ``(0, 1]``.
        output_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        The resized mouth crop.

    Raises:
        ValueError: If ``mouth_fraction`` is out of range, or the mouth band
            has no pixels inside ``frame`` (box outside the frame, or face
            too small for the band to be at least one row tall).
    """
    if not 0 < mouth_fraction <= 1:
        raise ValueError(f"mouth_fraction must be in (0, 1], got {mouth_fraction!r}")

    x1, y1, x2, y2 = (int(round(coord)) for coord in face_bbox)

    # Size the band from the detector's box *before* clamping, so a face that
    # spills past the frame edge still gets a band proportional to its real
    # height.
    face_height = y2 - y1
    mouth_y1 = y2 - int(face_height * mouth_fraction)

    # Clamp to the frame: negative indices would otherwise wrap around to the
    # opposite edge under NumPy slicing and yield a wrong or empty crop.
    frame_height, frame_width = frame.shape[:2]
    top, bottom = max(mouth_y1, 0), min(y2, frame_height)
    left, right = max(x1, 0), min(x2, frame_width)
    if bottom <= top or right <= left:
        raise ValueError(
            f"mouth region is empty for face_bbox={tuple(face_bbox)!r} "
            f"in a {frame_width}x{frame_height} frame"
        )
    mouth_region = frame[top:bottom, left:right]

    # Resize to fixed dimensions for model input. Imported here so the module
    # loads without OpenCV installed.
    import cv2
    return cv2.resize(mouth_region, output_size)
def lip_read_video(frames, face_detections):
    """Extract a stacked sequence of lip crops from video frames.

    Args:
        frames: Iterable of image arrays, one per video frame.
        face_detections: Iterable of face boxes ``(x1, y1, x2, y2)`` aligned
            with ``frames``; ``None`` marks a frame with no detected face.

    Returns:
        Array of shape ``(T, 88, 88, 3)`` where ``T`` is the number of frames
        that had a detection. Frames without a face are skipped, so ``T`` may
        be smaller than ``len(frames)`` and row ``i`` is not necessarily
        frame ``i``. Pairing stops at the shorter of the two inputs.
        With no detections at all, an empty ``(0, 88, 88, 3)`` array is
        returned.
    """
    # Local import: numpy is not among the imports visible at the top of this
    # file, and the original body used `np` without bringing it into scope.
    import numpy as np

    lip_sequence = [
        extract_lip_region(frame, bbox)
        for frame, bbox in zip(frames, face_detections)
        if bbox is not None
    ]
    if not lip_sequence:
        # np.stack raises ValueError on an empty list; return an empty batch
        # with the documented trailing shape instead.
        # NOTE(review): uint8 assumes 8-bit colour frames -- confirm.
        return np.empty((0, 88, 88, 3), dtype=np.uint8)
    return np.stack(lip_sequence)  # (T, 88, 88, 3)
Sound Source Localization
Given video, where is the sound coming from? Models learn to localize sound sources by correlating audio spectrograms with visual regions.
def localize_sound(audio_spectrogram, visual_features, visual_spatial):
    """Score every visual location by its similarity to the audio.

    The spectrogram is averaged over time into one vector per clip, and that
    vector is dot-multiplied with the visual feature at each spatial
    location. Higher scores mark locations whose features correlate more
    strongly with the sound.

    Args:
        audio_spectrogram: ``(B, Freq, Time)`` tensor. ``Freq`` must equal
            ``vision_dim``, i.e. the audio must already live in the same
            feature space as the visual features.
        visual_features: ``(B, H*W, vision_dim)`` tensor.
        visual_spatial: ``(B, H, W, 2)`` spatial coordinates. Not used by the
            computation; kept so existing callers continue to work.

    Returns:
        ``(B, H*W)`` tensor of similarity scores in the same location order
        as ``visual_features``; reshape to ``(B, H, W)`` to view as a heatmap.

    Raises:
        ValueError: If ``Freq`` differs from ``vision_dim``.
    """
    freq_bins = audio_spectrogram.size(1)
    vision_dim = visual_features.size(-1)
    if freq_bins != vision_dim:
        raise ValueError(
            "audio and visual features must share a feature dimension: "
            f"got Freq={freq_bins}, vision_dim={vision_dim}"
        )

    # Collapse time so each clip is described by a single (Freq,) vector.
    audio_embedding = audio_spectrogram.mean(dim=-1)  # (B, Freq)

    # Dot product per location. einsum broadcasts the audio vector over the
    # H*W axis, so there is no need to materialise a copy per position. The
    # previous subscripts ('bfv,bfwt->bwv') named four axes for a 3-D operand
    # and raised at run time.
    similarity_map = torch.einsum(
        'bnd,bd->bn',
        visual_features,
        audio_embedding,
    )
    return similarity_map  # One score per spatial location
Record a 10-second video with clear speech in a quiet environment. Process it to extract mel spectrograms and sampled video frames. Experiment with aligning them temporally: what offset would you expect between an audio sample's time and a video frame's timestamp as a result of capture-hardware latency?