03. Frame Sampling Strategies
How you sample frames from video dramatically affects what your model sees. The right strategy depends on your task, video length, and computational budget.
Uniform Sampling
The simplest approach: grab frames at fixed intervals. This preserves temporal coverage but may miss fast action.
def uniform_sample(video_path, fps_target=1):
    """Decode a video and keep frames at a fixed interval.

    Args:
        video_path: Source passed straight to ``av.open``.
        fps_target: Desired number of kept frames per second of video.
            Targets at or above the stream's own frame rate keep every
            frame.

    Returns:
        The kept frames stacked along a new leading axis: (T, H, W, 3),
        decoded as ``rgb24``.

    Raises:
        ValueError: If ``fps_target`` is not positive, or (from
            ``np.stack``) if the video yields no frames.
    """
    if fps_target <= 0:
        raise ValueError(f"fps_target must be positive, got {fps_target!r}")

    # The context manager closes the container even if decoding fails.
    with av.open(video_path) as container:
        video_stream = container.streams.video[0]
        video_fps = float(video_stream.average_rate)

        # Round rather than truncate so rates like 29.97 map to the nearest
        # interval, and clamp to 1 so a target above the source rate cannot
        # produce a zero interval (modulo by zero).
        frame_interval = max(1, round(video_fps / fps_target))

        frames = [
            frame.to_ndarray(format="rgb24")
            for i, frame in enumerate(container.decode(video=0))
            if i % frame_interval == 0
        ]

    return np.stack(frames)  # (T, H, W, 3)
Scene-Aware Sampling
Videos contain shots—continuous sequences from a single camera. Uniform sampling may oversample slow scenes and undersample fast cuts.
def scene_detect_sample(video_path, frames_per_scene=2, cut_threshold=0.4):
    """Keep the first ``frames_per_scene`` frames of every detected scene.

    A scene cut is declared when the Bhattacharyya distance between the
    channel-0 histograms of two consecutive frames exceeds
    ``cut_threshold``. The first frame of the video opens the first scene.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        frames_per_scene: Maximum number of frames kept from each scene.
        cut_threshold: Histogram distance above which consecutive frames
            are treated as belonging to different scenes.

    Returns:
        ``np.array`` of the kept frames, exactly as ``cap.read()`` returned
        them (no colour conversion is applied here). Empty if the video
        could not be opened or yields no frames.
    """
    import cv2

    cap = cv2.VideoCapture(video_path)
    frames = []
    prev_hist = None
    taken_in_scene = 0  # frames already kept from the current scene

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            hist = cv2.calcHist([frame], [0], None, [256], [0, 256])
            hist = cv2.normalize(hist, hist).flatten()

            if prev_hist is not None:
                # Bhattacharyya is a *distance*: a larger value means the
                # two histograms are less alike.
                distance = cv2.compareHist(
                    prev_hist, hist, cv2.HISTCMP_BHATTACHARYYA
                )
                if distance > cut_threshold:
                    # New scene: restart the per-scene quota.
                    taken_in_scene = 0

            if taken_in_scene < frames_per_scene:
                frames.append(frame)
                taken_in_scene += 1

            prev_hist = hist
    finally:
        # Always free the capture handle, even if histogram code raises.
        cap.release()

    return np.array(frames)
Adaptive Sampling with Importance Weighting
Some frames matter more than others. Action-heavy moments deserve more frames. Dense sampling followed by learned importance scoring addresses this.
def importance_weighted_sample(frames, model, max_frames=16):
    """Keep the ``max_frames`` frames whose encoder features vary the most.

    Args:
        frames: Batch of frames indexed along its first axis; it is passed
            to ``model.forward_features`` and indexed with a list of ints,
            so it is presumably a tensor — confirm against the caller.
        model: Object exposing ``forward_features(frames)`` that returns a
            tensor with one entry per frame along dim 0.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        ``frames`` restricted to the selected indices, in their original
        (ascending index) order.

    Raises:
        ValueError: If the features have fewer than two dimensions, so no
            per-frame variance can be computed.
    """
    # Extract features without tracking gradients.
    with torch.no_grad():
        features = model.forward_features(frames)

    if features.dim() < 2:
        raise ValueError(
            "forward_features must return at least 2 dimensions, "
            f"got {features.dim()}"
        )

    # Score by variance (high variance = more action). Variance is taken
    # over dims 1-2 where present; any dims left over are averaged away so
    # the result is always one score per frame, whatever the feature rank.
    reduce_dims = (1, 2) if features.dim() >= 3 else (1,)
    frame_importance = features.var(dim=reduce_dims)
    if frame_importance.dim() > 1:
        frame_importance = frame_importance.flatten(1).mean(dim=1)

    # Select the top-k frames, then restore temporal order.
    k = min(max_frames, len(frames))
    _, top_indices = torch.topk(frame_importance, k)
    top_indices = sorted(top_indices.tolist())
    return frames[top_indices]
Profile the memory usage of loading a 10-minute video at 30 FPS (18,000 frames) versus sampling 32 uniform frames. Calculate the reduction factor and identify where memory savings come from.