17. Testing Function Calling
Chapter 17 of 18 · 25 min
Automated tests ensure function calling behaves correctly across different scenarios. Without tests, changes can break functionality in ways that only surface in production. Testing function calling requires verifying the model's behavior, the execution of each tool, and the integration between the two.
Unit Tests for Tools
Test each tool in isolation:
import pytest
from your_tools import CalculatorTool, SearchTool
class TestCalculatorTool:
    """Unit tests that call ``CalculatorTool._run`` directly, without a model."""

    def setup_method(self):
        """Create a fresh ``CalculatorTool`` before every test.

        The hook must be named ``setup_method``: a method called plain
        ``setup`` is a nose-style hook that pytest 8 no longer invokes, which
        would leave ``self.tool`` undefined in every test below.
        """
        self.tool = CalculatorTool()

    def test_basic_arithmetic(self):
        """A simple addition yields the expected ``result`` value."""
        result = self.tool._run(expression="2 + 2")
        assert result["result"] == 4

    def test_complex_expression(self):
        """Parentheses are evaluated before multiplication."""
        result = self.tool._run(expression="(10 + 5) * 3")
        assert result["result"] == 45

    def test_division_by_zero(self):
        """Dividing by zero is expected to raise ``ZeroDivisionError``."""
        with pytest.raises(ZeroDivisionError):
            self.tool._run(expression="1 / 0")

    def test_invalid_expression(self):
        """An incomplete expression is expected to raise ``SyntaxError``."""
        with pytest.raises(SyntaxError):
            self.tool._run(expression="2 +")

    def test_nested_functions(self):
        """Function calls may be nested inside one another."""
        result = self.tool._run(expression="sqrt(abs(-16))")
        assert result["result"] == 4
class TestSearchTool:
    """Unit tests that call ``SearchTool._run`` against the tool's test index."""

    def setup_method(self):
        """Create a fresh ``SearchTool`` and load its test data before every test.

        The hook must be named ``setup_method``: a method called plain
        ``setup`` is a nose-style hook that pytest 8 no longer invokes, which
        would leave ``self.tool`` undefined in every test below.
        """
        self.tool = SearchTool()
        self.tool._index_test_data()

    def test_exact_match(self):
        """A query for an indexed term returns at least one document."""
        result = self.tool._run(query="python")
        assert len(result["documents"]) > 0

    def test_no_results(self):
        """A query that matches nothing returns an empty document list."""
        result = self.tool._run(query="xyznonexistent")
        assert len(result["documents"]) == 0

    def test_partial_match(self):
        """A prefix query returns at least one document mentioning "python"."""
        result = self.tool._run(query="prog")
        assert any("python" in doc["text"] for doc in result["documents"])
Mocking Model Responses
Test integration without a running model:
from unittest.mock import Mock, patch
class MockChatCompletion:
    """Mock OpenAI-compatible chat completion for testing.

    Canned responses are replayed in the order given; once the list is
    exhausted, replay wraps around to the first response again.

    Attributes:
        responses: The canned responses, returned as-is by ``create``.
        call_count: Number of successful ``create`` calls so far.
        calls: Keyword arguments of each ``create`` call, in call order, so a
            test can assert on what was sent to the "model".
    """

    def __init__(self, responses: list[dict]):
        self.responses = responses
        self.call_count = 0
        self.calls: list[dict] = []

    def create(self, **kwargs):
        """Return the next canned response and record the request kwargs.

        Raises:
            ValueError: If no responses are configured. Without this check the
                modulo below would fail with an opaque ``ZeroDivisionError``.
        """
        if not self.responses:
            raise ValueError("MockChatCompletion has no responses configured")
        response = self.responses[self.call_count % len(self.responses)]
        self.call_count += 1
        self.calls.append(kwargs)
        return response
def test_tool_call_detection():
    """Check that a tool call in the model response is surfaced by the dispatcher."""
    # Assemble the canned response from the inside out so each layer is named.
    weather_call = {
        "id": "call_123",
        "type": "function",
        "function": {
            "name": "get_weather",
            "arguments": '{"location": "Tokyo"}',
        },
    }
    assistant_message = {
        "role": "assistant",
        "content": None,
        "tool_calls": [weather_call],
    }
    canned_response = {"choices": [{"message": assistant_message}]}

    # Wire the dispatcher to the mock client and register the tool under test.
    client = MockChatCompletion([canned_response])
    dispatcher = ToolDispatcher(ollama_client=client)
    dispatcher.register_tool("get_weather", lambda location: {"temp": 22})

    outcome = dispatcher.process("What's the weather in Tokyo?")

    first_call = outcome["tool_calls"][0]
    assert first_call["name"] == "get_weather"
    assert first_call["params"]["location"] == "Tokyo"
def test_multiple_tool_calls():
    """Check that two tool calls in a single response are both reported."""
    # Two calls in one assistant message, each with empty JSON arguments.
    requested_calls = [
        {"id": "1", "function": {"name": "search", "arguments": "{}"}},
        {"id": "2", "function": {"name": "calculate", "arguments": "{}"}},
    ]
    canned_response = {
        "choices": [{"message": {"tool_calls": requested_calls}}],
    }

    client = MockChatCompletion([canned_response])
    dispatcher = ToolDispatcher(ollama_client=client)

    outcome = dispatcher.process("Search and calculate")

    assert len(outcome["tool_calls"]) == 2
Integration Tests
Test the full pipeline with a running Ollama instance:
import pytest
import requests
@pytest.fixture
def ollama_available():
    """Skip the requesting test unless a local Ollama server answers.

    Sends a GET to ``http://localhost:11434/api/tags`` with a 5 second timeout.
    The test is skipped with "Ollama not reachable" when the request itself
    fails, and with "Ollama not available" when the server answers with a
    status other than 200.
    """
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
    except requests.RequestException:
        # Only network-level failures count as "not reachable". A bare
        # ``except:`` would also catch pytest's own skip exception and hide
        # unrelated errors such as KeyboardInterrupt.
        pytest.skip("Ollama not reachable")
    # Checked outside the ``try`` so this skip keeps its own message.
    if response.status_code != 200:
        pytest.skip("Ollama not available")
    yield
@pytest.mark.integration
def test_full_tool_call_pipeline(ollama_available):
    """Test complete flow from prompt to tool result to final response.

    Streams one request through ``StreamingToolDispatcher`` with a ``get_time``
    tool registered on an ``AsyncToolExecutor``, then checks that a
    ``tool_call`` event was emitted and that it carries a ``time`` result.
    """
    # Local import keeps this snippet self-contained.
    import asyncio

    dispatcher = StreamingToolDispatcher()
    executor = AsyncToolExecutor()

    # Register test tool
    executor.register("get_time", lambda: {"time": "12:00"})

    # First call triggers tool
    messages = [
        {"role": "user", "content": "What time is it?"}
    ]
    # A complete schema: a bare ``...`` inside a dict literal is a SyntaxError.
    # NOTE(review): flat name/description/parameters layout mirrors the other
    # tool definitions in this chapter -- confirm against the dispatcher.
    tools = [{
        "name": "get_time",
        "description": "Get current time",
        "parameters": {"type": "object", "properties": {}},
    }]

    async def run_pipeline():
        """Consume the event stream; return (last tool_call event, last text)."""
        tool_call = None
        final_response = None
        async for event in dispatcher.stream_with_tools(
            "llama3.2",
            messages,
            tools,
            tool_executor=executor,
        ):
            if event["type"] == "tool_call":
                tool_call = event
            elif event["type"] == "content" and event["text"]:
                final_response = event["text"]
        return tool_call, final_response

    # ``async for`` is only legal inside a coroutine, so the stream is driven
    # through asyncio.run rather than iterated directly in this sync test.
    tool_call, final_response = asyncio.run(run_pipeline())

    # NOTE(review): final_response is collected but not asserted, as in the
    # original; add a check once the expected final text is defined.
    assert tool_call is not None
    assert "time" in tool_call.get("params", {}).get("result", {})
@pytest.mark.integration
def test_model_generates_valid_json(ollama_available):
    """Test that model generates valid JSON for tool parameters.

    Sends the same prompt 10 times and counts the ``tool_call`` events whose
    ``params`` string cannot be parsed as JSON; fewer than 2 must be invalid.
    """
    # Local imports keep this snippet self-contained; the surrounding import
    # lines do not bring in ``json`` or ``asyncio``.
    import asyncio
    import json

    dispatcher = StreamingToolDispatcher()
    messages = [{"role": "user", "content": "Get weather for Paris"}]
    tools = [{"name": "get_weather", "parameters": {"properties": {"location": {"type": "string"}}}}]

    async def count_invalid_json(attempts):
        """Return how many tool-call events carried unparseable JSON params."""
        invalid = 0
        for _ in range(attempts):
            async for event in dispatcher.stream_with_tools("llama3.2", messages, tools):
                if event["type"] != "tool_call":
                    continue
                params = event["params"]
                # NOTE(review): the pipeline test reads ``params`` as a dict.
                # Non-string params are treated as already parsed instead of
                # letting json.loads raise TypeError -- confirm event schema.
                if not isinstance(params, str):
                    continue
                try:
                    json.loads(params)
                except json.JSONDecodeError:
                    invalid += 1
        return invalid

    # ``async for`` is only legal inside a coroutine, so drive it via asyncio.run.
    invalid_json_count = asyncio.run(count_invalid_json(10))

    # Model should rarely generate invalid JSON
    assert invalid_json_count < 2, f"Too many invalid JSON responses: {invalid_json_count}/10"
Load Testing
Verify behavior under load:
import locust
from concurrent.futures import ThreadPoolExecutor
class FunctionCallingLoadTest(locust.HttpUser):
    """Locust user that mixes tool-calling chat requests with health checks.

    Task weights are 10 and 1, so chat requests are picked roughly ten times
    as often as health checks.
    """

    # ``task`` is referenced through the ``locust`` module because only
    # ``import locust`` is in scope; a bare ``@task`` would be a NameError.
    @locust.task(10)
    def tool_call_request(self):
        """POST a chat request that offers the calculator tool."""
        # A complete schema: a bare ``...`` inside a dict literal is a SyntaxError.
        # NOTE(review): schema shape is an assumption -- match it to what the
        # /chat endpoint actually expects.
        self.client.post("/chat", json={
            "messages": [{"role": "user", "content": "Calculate 15 * 23"}],
            "tools": [{
                "name": "calculator",
                "description": "Evaluate an arithmetic expression",
                "parameters": {
                    "type": "object",
                    "properties": {"expression": {"type": "string"}},
                    "required": ["expression"],
                },
            }],
        })

    @locust.task(1)
    def health_check(self):
        """GET the health endpoint."""
        self.client.get("/health")
# Run with: locust -f load_test.py --host=http://localhost:8000
EXERCISE
Write tests for a tool that makes filesystem calls. Include tests for valid paths, path traversal attempts, missing files, and files exceeding size limits. Verify that the tests catch each of these security issues.