06. Working Memory
Working memory is the agent's immediate context—the current conversation, recent tool results, and the task at hand. It lives in RAM, changes every step, and resets when the agent stops.
Core operations:
class WorkingMemory:
    """In-process store for the agent's current conversation.

    Keeps an ordered list of ``Message`` objects capped at ``max_messages``
    and a lookup of tool results keyed by tool-call id.
    """

    def __init__(self, max_messages: int = 100):
        """Create an empty memory that retains at most ``max_messages`` messages."""
        self.messages: list[Message] = []
        self.max_messages = max_messages
        # Raw results by tool-call id; this dict is not trimmed by _enforce_limit.
        self._tool_results: dict[str, Any] = {}

    def add_message(
        self,
        role: str,
        content: Optional[str],
        tool_call: Optional[dict] = None,
        tool_call_id: Optional[str] = None,
    ) -> None:
        """Append one message, then trim the history to the configured limit.

        Args:
            role: Speaker role, e.g. ``"assistant"`` or ``"tool"``.
            content: Message text; ``None`` for an assistant message that
                only carries a tool call.
            tool_call: Dict with ``"id"``, ``"name"`` and ``"arguments"``
                keys when the assistant is invoking a tool.
            tool_call_id: Id of the call this message answers (set on
                tool-result messages).
        """
        self.messages.append(
            Message(
                role=role,
                content=content,
                tool_call=tool_call,
                tool_call_id=tool_call_id,
            )
        )
        self._enforce_limit()

    def add_tool_results(
        self,
        calls: list[ToolCall],
        results: list[ToolResult]
    ) -> None:
        """Record each tool call together with its result.

        For every ``(call, result)`` pair the result is stored under
        ``call.id`` and two messages are appended back to back: the
        assistant message carrying the call, then the tool message carrying
        the output (or ``"Error: ..."`` when the result is unsuccessful).
        Pairs are formed with ``zip``, so surplus entries in the longer list
        are ignored.
        """
        for call, result in zip(calls, results):
            self._tool_results[call.id] = result
            # Assistant message that announces the tool call.
            self.add_message(
                role="assistant",
                content=None,
                tool_call={
                    "id": call.id,
                    "name": call.name,
                    "arguments": call.arguments
                }
            )
            # Tool-role message with the result, placed right after its call.
            self.add_message(
                role="tool",
                content=result.output if result.success else f"Error: {result.error}",
                tool_call_id=call.id
            )

    def get_context(self, max_tokens: Optional[int] = None) -> list[dict]:
        """Return all stored messages formatted for the LLM, oldest first.

        Args:
            max_tokens: Accepted for interface compatibility but not applied
                here; every stored message is returned regardless of size.
        """
        return [msg.to_llm_format() for msg in self.messages]

    def _enforce_limit(self) -> None:
        """Drop the oldest messages until at most ``max_messages`` remain.

        After trimming, any tool-result messages left at the head of the
        history are dropped as well: their assistant call has just been
        removed, and a tool result without a preceding call violates the
        ordering that strict LLM APIs require.
        """
        overflow = len(self.messages) - self.max_messages
        if overflow <= 0:
            return
        # One slice deletion instead of repeated O(n) pop(0) calls.
        del self.messages[:overflow]
        orphans = 0
        while orphans < len(self.messages) and self.messages[orphans].role == "tool":
            orphans += 1
        del self.messages[:orphans]
Message format for LLM input:
@dataclass
class Message:
    """One conversation entry, convertible to the chat-completions dict shape."""

    role: str
    content: Optional[str]
    # Dict with "id", "name" and "arguments" keys for an assistant tool call.
    tool_call: Optional[dict] = None
    # Id of the tool call this message is the result of.
    tool_call_id: Optional[str] = None

    def to_llm_format(self) -> dict:
        """Return this message as a dict ready to send to the LLM.

        ``content`` is omitted when it is ``None``. A ``tool_call`` is
        emitted as a single-element ``tool_calls`` list of type
        ``"function"``. Its ``arguments`` are JSON-encoded unless they are
        already a string, in which case they are passed through unchanged so
        pre-serialized arguments are not double-encoded.
        """
        msg = {"role": self.role}
        if self.content is not None:
            msg["content"] = self.content
        if self.tool_call:
            call_id = self.tool_call["id"]
            name = self.tool_call["name"]
            arguments = self.tool_call["arguments"]
            if not isinstance(arguments, str):
                arguments = json.dumps(arguments)
            msg["tool_calls"] = [{
                "id": call_id,
                "type": "function",
                "function": {
                    "name": name,
                    "arguments": arguments,
                },
            }]
        if self.tool_call_id is not None:
            msg["tool_call_id"] = self.tool_call_id
        return msg
Failure mode: message ordering. Tool results must follow their corresponding assistant message immediately. Some LLM APIs are strict about ordering; a tool result appearing before its call causes silent failures or hallucinations.
Failure mode: unbounded growth. With many tool calls, messages accumulate. Note that get_context above accepts max_tokens but does not yet apply it. Implement max_tokens truncation thoughtfully—don't simply keep only the last N messages, as you may drop the system prompt or critical earlier tool results. A smarter approach keeps the system prompt, the last N turns, and references to important earlier results.
Exercise: implement a token-aware truncation strategy that preserves the system prompt, the last 5 messages, and any tool results from the last 20 messages (even if they are older than the last 5 messages).