Video Agent — Advanced Multi-Modal Systems (Chapter 11)

Video agents extend vision agents to temporal sequences. They perceive change over time, anticipate future states, and execute multi-step plans that span multiple frames.

Temporal Planning

Video agents must reason about sequence and timing. An action may take multiple frames to complete, and the world state changes continuously.

class VideoAgent:
    """Agent that keeps a rolling window of frames and plans with an LLM.

    The agent buffers the most recent frames, turns the latest segment into a
    state description through ``video_encoder``, and asks ``llm`` for the next
    action given that state.
    """

    # Number of most recent frames retained in ``state_history``.
    HISTORY_SIZE = 30
    # Number of buffered frames required (and used) to build one state.
    SEGMENT_LENGTH = 8

    def __init__(self, video_encoder, llm, action_space):
        """Store the collaborators and create an empty frame buffer.

        Args:
            video_encoder: Object providing ``encode_frame``,
                ``model_temporal`` and ``summarize``.
            llm: Object providing ``generate(prompt)``.
            action_space: Stored as-is; not used by the methods shown here.
        """
        self.video_encoder = video_encoder
        self.llm = llm
        self.action_space = action_space
        self.state_history = deque(maxlen=self.HISTORY_SIZE)  # Last 30 frames

    def perceive_video_segment(self, frames):
        """Process a segment of video into a state representation.

        Args:
            frames: ``(T, H, W, C)`` numpy array of consecutive frames.

        Returns:
            Dict with keys ``"temporal"`` (output of
            ``video_encoder.model_temporal`` over per-frame embeddings),
            ``"summary"`` (output of ``video_encoder.summarize``) and
            ``"motion"`` (label from ``_detect_motion``).
        """
        # Encode every frame independently, then relate them over time.
        frame_embeddings = [
            self.video_encoder.encode_frame(frame) for frame in frames
        ]
        temporal_repr = self.video_encoder.model_temporal(frame_embeddings)

        # The semantic summary is computed from the raw frames, not embeddings.
        summary = self.video_encoder.summarize(frames)

        return {
            "temporal": temporal_repr,
            "summary": summary,
            "motion": self._detect_motion(frames),
        }

    def update_state(self, new_frame):
        """Append a frame and refresh the agent's belief about the world.

        Args:
            new_frame: Latest frame; stacked with earlier frames via
                ``np.stack``, so all buffered frames must share one shape.

        Returns:
            The state dict from ``perceive_video_segment`` built from the
            most recent ``SEGMENT_LENGTH`` frames, or ``None`` while fewer
            than ``SEGMENT_LENGTH`` frames have been buffered.
        """
        self.state_history.append(new_frame)

        if len(self.state_history) < self.SEGMENT_LENGTH:
            return None  # Not enough history

        segment = np.stack(list(self.state_history)[-self.SEGMENT_LENGTH:])
        return self.perceive_video_segment(segment)

    def plan_with_temporal_context(self, goal, current_state, history):
        """Generate a plan that accounts for temporal dynamics.

        Args:
            goal: Goal description interpolated into the prompt.
            current_state: State dict; its ``"summary"`` and ``"motion"``
                entries are interpolated into the prompt.
            history: Recent observations, interpolated into the prompt as-is.

        Returns:
            Whatever ``self._parse_temporal_action`` returns for the LLM
            response. That helper is not defined in this excerpt; it is
            presumably provided elsewhere — confirm before use.
        """

        prompt = f"""
You are controlling an agent in a dynamic environment.

Recent history (last few observations):
{history}

Current state: {current_state['summary']}
Motion detected: {current_state['motion']}

Goal: {goal}

IMPORTANT: Consider that the environment is changing over time.
- Actions take time to execute
- Moving objects may affect your plan
- You should anticipate future states

What should you do? Respond with action and expected outcome.
"""

        response = self.llm.generate(prompt)
        return self._parse_temporal_action(response)

    def _detect_motion(self, frames):
        """Classify motion in a video segment from frame differences.

        Args:
            frames: Array-like of frames stacked along the first axis.

        Returns:
            ``"static"`` for fewer than two frames or a mean absolute
            frame-to-frame difference below 1.0, ``"slow motion"`` below
            10.0, otherwise ``"fast motion"``.
        """
        frames = np.asarray(frames)
        if len(frames) < 2:
            return "static"

        # Cast before subtracting: unsigned integer frames (e.g. uint8) wrap
        # around on negative differences, which would turn a small darkening
        # into a huge apparent change.
        signed = frames.astype(np.float64)

        # Frame difference magnitude
        diff = np.abs(signed[1:] - signed[:-1]).mean()

        if diff < 1.0:
            return "static"
        if diff < 10.0:
            return "slow motion"
        return "fast motion"

Anticipating Future States

Video agents benefit from predicting what will happen next. This enables proactive planning rather than reactive responses.

class FuturePredictor:
    """Roll a predictive model forward to preview the effect of an action."""

    def __init__(self, model):
        # The model must provide encode_video, encode_action, predict_future
        # and decode_tokens; it is stored untouched.
        self.model = model

    def predict_outcome(self, current_frames, planned_action, steps_ahead=5):
        """Forecast what follows if ``planned_action`` is executed.

        Args:
            current_frames: Current video, passed to ``model.encode_video``.
            planned_action: Candidate action, passed to
                ``model.encode_action``.
            steps_ahead: Forwarded to ``model.predict_future`` as
                ``num_steps``.

        Returns:
            Dict with ``"frames"`` (the decoded predicted frames) and
            ``"state_change"`` (result of ``self._summarize_change`` comparing
            the current frames with the predicted ones).
        """
        # Tokenize both inputs: the observed video first, then the action.
        observed = self.model.encode_video(current_frames)
        intended = self.model.encode_action(planned_action)

        # Roll forward in token space and decode back to frames.
        rollout = self.model.predict_future(
            observed, intended, num_steps=steps_ahead
        )
        predicted_frames = self.model.decode_tokens(rollout)

        change = self._summarize_change(current_frames, predicted_frames)
        return dict(frames=predicted_frames, state_change=change)

Failure Mode: Temporal Credit Assignment

When a sequence fails, which action caused the failure? Video agents struggle with this because multiple actions contribute to outcomes, and effects may be delayed.

def diagnose_temporal_credit_assignment(agent, failed_episodes):
    """Print, per failed episode, which recent action may have caused failure.

    For every trajectory step whose outcome is flagged as a failure, the
    failing step and up to three steps before it are examined, most recent
    first. Each candidate is replayed through ``agent.simulate``; the first
    candidate whose counterfactual does not fail is reported and the search
    for that failure stops.

    Args:
        agent: Object providing ``simulate(initial_state, prefix,
            replace_action=...)`` that returns a mapping with a ``"failure"``
            entry. The exact meaning of ``replace_action`` is defined by the
            agent and is not visible here — confirm against its
            implementation.
        failed_episodes: Iterable of dicts with keys ``"id"``, ``"goal"``,
            ``"initial_state"`` and ``"trajectory"``, where the trajectory is
            a sequence of ``(state, action, outcome)`` tuples and each
            ``outcome`` has a ``"failure"`` entry.

    Returns:
        None. Findings are written to standard output.
    """
    for episode in failed_episodes:
        print(f"\nEpisode {episode['id']}: {episode['goal']}")
        trajectory = episode["trajectory"]

        for i, (_state, _action, outcome) in enumerate(trajectory):
            if not outcome["failure"]:
                continue

            # Attribute failure to nearby actions: the failing step itself
            # and up to three steps before it, most recent first.
            earliest = max(0, i - 3)
            for step in range(i, earliest - 1, -1):
                # Entries are (state, action, outcome) tuples, so the action
                # is read positionally and the step is its trajectory index.
                candidate_action = trajectory[step][1]

                # Use counterfactual: what if we had taken different actions?
                counterfactual = agent.simulate(
                    episode["initial_state"],
                    trajectory[:step],
                    replace_action=candidate_action,
                )

                if not counterfactual["failure"]:
                    print(f"  Step {step}: Action '{candidate_action}' "
                          f"may have caused failure")
                    break