Observability — Custom Agent Frameworks (Chapter 16)

If you can't measure it, you can't improve it. Agent systems are notorious for being opaque—requests enter and responses exit, but the middle is a black box.

The Three Pillars

Log aggregation, metrics collection, and distributed tracing work together. Missing any one creates blind spots.

Structured Logging

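Unstructured logs are close to useless for debugging production systems, because free-form text is hard to search, filter, and correlate across requests. Emit each record as a JSON object that carries a correlation ID instead: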

import json
import logging
from contextlib import asynccontextmanager
from contextvars import ContextVar
from datetime import datetime, timezone

# Context-local correlation ID. It defaults to an empty string, so records
# logged outside any request still serialize cleanly.
correlation_id: ContextVar[str] = ContextVar('correlation_id', default='')


class StructuredFormatter(logging.Formatter):
    """Render each log record as a single-line JSON object.

    Every record carries ``timestamp``, ``level``, ``message``, ``logger``
    and ``correlation_id``. The optional ``agent_id`` and ``duration_ms``
    fields are copied over when the caller supplied them via ``extra=``.
    When the record has exception info attached, the formatted traceback
    is emitted under ``exception``.
    """

    # Attributes set through ``extra=`` that are copied into the output.
    _EXTRA_FIELDS = ("agent_id", "duration_ms")

    def format(self, record: logging.LogRecord) -> str:
        """Return *record* serialized as a JSON string."""
        # Use the time the record was created, not the time it is being
        # formatted: with buffering or queue handlers the two can differ.
        created = datetime.fromtimestamp(record.created, tz=timezone.utc)
        log_data = {
            # Drop tzinfo before isoformat() so the value ends in "Z"
            # rather than "+00:00".
            "timestamp": created.replace(tzinfo=None).isoformat() + "Z",
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
            "correlation_id": correlation_id.get(),
        }
        for field in self._EXTRA_FIELDS:
            if hasattr(record, field):
                log_data[field] = getattr(record, field)

        # Without this, logger.exception(...) would lose its traceback.
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # default=str is deliberate: a formatter must not raise because an
        # extra field (e.g. a UUID agent_id) is not JSON-serializable.
        return json.dumps(log_data, default=str)

# Usage: build a stream handler that emits JSON, then attach it to the
# pipeline logger and enable INFO-level output.
handler = logging.StreamHandler()
handler.setFormatter(StructuredFormatter())

logger = logging.getLogger("agent.pipeline")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

Metrics That Matter

Track these metrics for every agent:

from dataclasses import dataclass
import time

@dataclass
class AgentMetrics:
    """Running request counters and latency totals for a single agent.

    Call :meth:`record_request` once per finished request; the derived
    ``avg_latency_ms`` and ``error_rate`` properties are computed from the
    accumulated counters.
    """

    agent_id: str
    requests_received: int = 0
    requests_completed: int = 0
    requests_failed: int = 0
    # Sum of the durations of ALL recorded requests, successful or not.
    total_latency_ms: float = 0.0

    def record_request(self, duration_ms: float, success: bool):
        """Record one finished request.

        Args:
            duration_ms: How long the request took, in milliseconds.
            success: True if the request completed, False if it failed.
        """
        self.requests_received += 1
        if success:
            self.requests_completed += 1
        else:
            self.requests_failed += 1
        self.total_latency_ms += duration_ms

    @property
    def avg_latency_ms(self) -> float:
        """Mean latency over every recorded request (0.0 when none)."""
        # total_latency_ms includes failed requests, so the denominator
        # must too; dividing by requests_completed alone inflates the
        # average and reports 0.0 when every request failed.
        if self.requests_received == 0:
            return 0.0
        return self.total_latency_ms / self.requests_received

    @property
    def error_rate(self) -> float:
        """Fraction of recorded requests that failed (0.0 when none)."""
        if self.requests_received == 0:
            return 0.0
        return self.requests_failed / self.requests_received

Distributed Tracing

When a request spans multiple agents, trace IDs let you correlate events:

from uuid import uuid4

def create_trace_context() -> dict:
    """Build a trace context dict for the current operation.

    The trace ID reuses the active correlation ID when one is set and
    falls back to a fresh UUID otherwise. The span ID is the first eight
    characters of a new UUID, and the timestamp is the current epoch time
    in seconds.
    """
    trace_id = correlation_id.get()
    if not trace_id:
        trace_id = str(uuid4())
    span_id = str(uuid4())[:8]
    return {"trace_id": trace_id, "span_id": span_id, "timestamp": time.time()}

@asynccontextmanager
async def trace_operation(operation_name: str, agent_id: str):
    """Async context manager that logs the start, end and duration of a block.

    Usage::

        async with trace_operation("plan", agent_id="planner-1"):
            ...

    Args:
        operation_name: Label used in the "Starting"/"Completed" messages.
        agent_id: Attached to both log records as the ``agent_id`` extra.

    The completion record is logged from ``finally``, so it is emitted
    (with ``duration_ms``) even when the wrapped block raises; the
    exception then propagates to the caller unchanged.
    """
    # Without @asynccontextmanager this is a bare async generator and cannot
    # be used in ``async with``.
    # perf_counter() is monotonic, so the measured duration is not affected
    # by wall-clock adjustments the way time.time() is.
    start = time.perf_counter()
    logger.info("Starting %s", operation_name, extra={"agent_id": agent_id})
    try:
        yield
    finally:
        duration_ms = (time.perf_counter() - start) * 1000
        logger.info("Completed %s", operation_name, extra={
            "agent_id": agent_id,
            "duration_ms": round(duration_ms, 2),
        })