11. Video Agent
Video agents extend vision agents to temporal sequences. They perceive change over time, anticipate future states, and execute multi-step plans that span multiple frames.
Temporal Planning
Video agents must reason about sequence and timing. An action may take multiple frames to complete, and the world state changes continuously.
class VideoAgent:
    """Agent that keeps a rolling window of frames and plans over it.

    The agent buffers incoming frames, turns the most recent ones into a
    state dictionary via the supplied video encoder, and asks the LLM for an
    action given that state.

    Args:
        video_encoder: Object providing ``encode_frame``, ``model_temporal``
            and ``summarize``.
        llm: Object providing ``generate(prompt)``.
        action_space: Stored on the instance; not used by the methods in
            this class.
        history_size: Maximum number of frames retained in ``state_history``.
        segment_length: Number of most recent frames passed to
            ``perceive_video_segment`` by ``update_state``.

    Raises:
        ValueError: If ``segment_length`` is not in ``1..history_size``; the
            buffer could otherwise never hold a full segment.
    """

    def __init__(self, video_encoder, llm, action_space,
                 history_size=30, segment_length=8):
        if not 1 <= segment_length <= history_size:
            raise ValueError(
                "segment_length must be between 1 and history_size"
            )
        self.video_encoder = video_encoder
        self.llm = llm
        self.action_space = action_space
        self.segment_length = segment_length
        # Bounded buffer: the oldest frame is dropped once it is full.
        self.state_history = deque(maxlen=history_size)

    def perceive_video_segment(self, frames):
        """Process a segment of video into a state representation.

        Args:
            frames: Sequence of frames; the original listing annotates this
                as a (T, H, W, C) numpy array.

        Returns:
            Dict with keys ``"temporal"``, ``"summary"`` and ``"motion"``.
        """
        # Encode each frame independently, then model how they relate.
        frame_embeddings = [
            self.video_encoder.encode_frame(frame) for frame in frames
        ]
        temporal_repr = self.video_encoder.model_temporal(frame_embeddings)
        # The semantic summary is computed from the raw frames.
        summary = self.video_encoder.summarize(frames)
        return {
            "temporal": temporal_repr,
            "summary": summary,
            "motion": self._detect_motion(frames),
        }

    def update_state(self, new_frame):
        """Append a frame and re-estimate the world state.

        Returns:
            The state dict from ``perceive_video_segment`` once at least
            ``segment_length`` frames are buffered, otherwise ``None``.
        """
        self.state_history.append(new_frame)
        if len(self.state_history) < self.segment_length:
            return None  # Not enough history
        recent = list(self.state_history)[-self.segment_length:]
        return self.perceive_video_segment(np.stack(recent))

    def plan_with_temporal_context(self, goal, current_state, history):
        """Generate a plan that accounts for temporal dynamics.

        Args:
            goal: Goal description interpolated into the prompt.
            current_state: Mapping with ``"summary"`` and ``"motion"`` keys,
                as returned by ``perceive_video_segment``.
            history: Recent observations, interpolated into the prompt as-is.

        Returns:
            Whatever ``self._parse_temporal_action`` returns for the LLM
            response.
        """
        # The prompt body is deliberately flush-left: any indentation here
        # would become part of the text sent to the model.
        prompt = f"""
You are controlling an agent in a dynamic environment.
Recent history (last few observations):
{history}
Current state: {current_state['summary']}
Motion detected: {current_state['motion']}
Goal: {goal}
IMPORTANT: Consider that the environment is changing over time.
- Actions take time to execute
- Moving objects may affect your plan
- You should anticipate future states
What should you do? Respond with action and expected outcome.
"""
        response = self.llm.generate(prompt)
        # NOTE(review): _parse_temporal_action is not defined in this
        # listing; presumably supplied elsewhere or by a subclass - confirm.
        return self._parse_temporal_action(response)

    def _detect_motion(self, frames):
        """Classify motion in a segment by mean absolute frame difference.

        Returns:
            ``"static"`` for fewer than two frames or a mean difference
            below 1.0, ``"slow motion"`` below 10.0, else ``"fast motion"``.
        """
        if len(frames) < 2:
            return "static"
        # Cast before subtracting: unsigned integer frames (e.g. uint8) wrap
        # around on negative differences, which would inflate the magnitude.
        signed = np.asarray(frames).astype(np.float64)
        diff = np.abs(signed[1:] - signed[:-1]).mean()
        if diff < 1.0:
            return "static"
        if diff < 10.0:
            return "slow motion"
        return "fast motion"
Anticipating Future States
Video agents benefit from predicting what will happen next. This enables proactive planning rather than reactive responses.
class FuturePredictor:
    """Roll a video forward in time under a proposed action."""

    def __init__(self, model):
        # The model supplies encode_video, encode_action, predict_future
        # and decode_tokens, all of which predict_outcome calls.
        self.model = model

    def predict_outcome(self, current_frames, planned_action, steps_ahead=5):
        """Forecast the result of executing ``planned_action``.

        Args:
            current_frames: Frames describing the present situation.
            planned_action: The action whose consequences are predicted.
            steps_ahead: Passed to the model as ``num_steps``.

        Returns:
            Dict with ``"frames"`` (the decoded prediction) and
            ``"state_change"`` (the result of ``self._summarize_change`` on
            the current and predicted frames).
        """
        model = self.model

        # Tokenise the observed video first, then the action.
        observed = model.encode_video(current_frames)
        action = model.encode_action(planned_action)

        # Predict future tokens and decode them back to frames.
        predicted = model.predict_future(
            observed, action, num_steps=steps_ahead
        )
        decoded = model.decode_tokens(predicted)

        # NOTE(review): _summarize_change is not defined in this listing;
        # presumably provided elsewhere - confirm.
        change = self._summarize_change(current_frames, decoded)
        return {"frames": decoded, "state_change": change}
Failure Mode: Temporal Credit Assignment
When a sequence fails, which action caused the failure? Video agents struggle with this because multiple actions contribute to outcomes, and effects may be delayed.
def diagnose_temporal_credit_assignment(agent, failed_episodes, lookback=3):
    """Print which recent action may have caused each failure.

    For every failing step in an episode, the failing step and up to
    ``lookback`` steps before it are re-simulated, most recent first. The
    first step whose counterfactual does not fail is reported, and the
    search for that failure stops.

    Args:
        agent: Object providing
            ``simulate(initial_state, prefix, replace_action=...)`` that
            returns a mapping with a ``"failure"`` key.
        failed_episodes: Iterable of dicts with keys ``"id"``, ``"goal"``,
            ``"initial_state"`` and ``"trajectory"``; the trajectory is a
            sequence of ``(state, action, outcome)`` tuples where
            ``outcome["failure"]`` flags a failing step.
        lookback: How many steps before the failing one are also candidates.

    Returns:
        None. Findings are written to standard output.
    """
    for episode in failed_episodes:
        print(f"\nEpisode {episode['id']}: {episode['goal']}")
        trajectory = episode["trajectory"]
        for i, (_state, _action, outcome) in enumerate(trajectory):
            if not outcome["failure"]:
                continue
            # Attribute the failure to nearby actions, latest first.
            # Entries are tuples, so each candidate is addressed by its
            # step index rather than by "step"/"action" keys.
            earliest = max(0, i - lookback)
            for step in range(i, earliest - 1, -1):
                candidate_action = trajectory[step][1]
                # Counterfactual: replay up to this step with the
                # candidate action replaced.
                counterfactual = agent.simulate(
                    episode["initial_state"],
                    trajectory[:step],
                    replace_action=candidate_action,
                )
                if not counterfactual["failure"]:
                    print(f" Step {step}: Action '{candidate_action}' "
                          f"may have caused failure")
                    break
Exercise: Record a 30-second video of a simple task (e.g., pouring water or folding paper). Annotate key moments. Build a video agent that processes the video and answers questions like "What happened at the 10-second mark?" and "What will happen next?" Evaluate temporal reasoning accuracy.