17. Testing Function Calling

Chapter 17 of 18 · 25 min

Automated tests ensure function calling behaves correctly across different scenarios. Without tests, changes can break functionality in ways that only appear in production. Testing function calling requires verifying model behavior, tool execution, and the integration between them.

Unit Tests for Tools

Test each tool in isolation:

import pytest
from your_tools import CalculatorTool, SearchTool

class TestCalculatorTool:
    """Unit tests that exercise ``CalculatorTool`` directly via ``_run``."""

    def setup_method(self):
        """Create a fresh tool before every test method.

        pytest invokes ``setup_method`` for each test in the class. The
        nose-style ``setup`` name is not called by pytest 8 and later, which
        would leave ``self.tool`` undefined in every test.
        """
        self.tool = CalculatorTool()

    def test_basic_arithmetic(self):
        """A simple addition yields the expected result."""
        result = self.tool._run(expression="2 + 2")
        assert result["result"] == 4

    def test_complex_expression(self):
        """Parentheses are honoured when evaluating the expression."""
        result = self.tool._run(expression="(10 + 5) * 3")
        assert result["result"] == 45

    def test_division_by_zero(self):
        """Dividing by zero is expected to raise ``ZeroDivisionError``."""
        with pytest.raises(ZeroDivisionError):
            self.tool._run(expression="1 / 0")

    def test_invalid_expression(self):
        """An incomplete expression is expected to raise ``SyntaxError``."""
        with pytest.raises(SyntaxError):
            self.tool._run(expression="2 +")

    def test_nested_functions(self):
        """Function calls can be nested inside one another."""
        result = self.tool._run(expression="sqrt(abs(-16))")
        assert result["result"] == 4

class TestSearchTool:
    """Unit tests that exercise ``SearchTool`` against its test index."""

    def setup_method(self):
        """Create a fresh tool and load its test data before every test.

        pytest invokes ``setup_method`` for each test in the class. The
        nose-style ``setup`` name is not called by pytest 8 and later, which
        would leave ``self.tool`` undefined in every test.
        """
        self.tool = SearchTool()
        self.tool._index_test_data()

    def test_exact_match(self):
        """A query for an indexed term returns at least one document."""
        result = self.tool._run(query="python")
        assert len(result["documents"]) > 0

    def test_no_results(self):
        """A query that matches nothing returns an empty document list."""
        result = self.tool._run(query="xyznonexistent")
        assert len(result["documents"]) == 0

    def test_partial_match(self):
        """A partial query still surfaces a document mentioning "python"."""
        result = self.tool._run(query="prog")
        assert any("python" in doc["text"] for doc in result["documents"])

Mocking Model Responses

Test integration without a running model:

from unittest.mock import Mock, patch

class MockChatCompletion:
    """Stand-in for an OpenAI-compatible chat completion client.

    Hands back the canned responses in order, one per ``create`` call, and
    wraps around to the first response once the list has been used up.
    """

    def __init__(self, responses: list[dict]):
        self.call_count = 0
        self.responses = responses

    def create(self, **kwargs):
        """Return the next canned response; keyword arguments are ignored."""
        position = self.call_count % len(self.responses)
        canned = self.responses[position]
        self.call_count += 1
        return canned

def test_tool_call_detection():
    """Verify that a tool call in the model's reply is surfaced by the dispatcher."""

    # Canned assistant message that requests a single ``get_weather`` call.
    weather_call = {
        "id": "call_123",
        "type": "function",
        "function": {
            "name": "get_weather",
            "arguments": '{"location": "Tokyo"}'
        }
    }
    assistant_message = {
        "role": "assistant",
        "content": None,
        "tool_calls": [weather_call]
    }
    canned_reply = {"choices": [{"message": assistant_message}]}

    fake_client = MockChatCompletion([canned_reply])

    # Wire the fake client into the dispatcher together with a stub tool.
    dispatcher = ToolDispatcher(ollama_client=fake_client)
    dispatcher.register_tool("get_weather", lambda location: {"temp": 22})

    outcome = dispatcher.process("What's the weather in Tokyo?")

    first_call = outcome["tool_calls"][0]
    assert first_call["name"] == "get_weather"
    assert first_call["params"]["location"] == "Tokyo"

def test_multiple_tool_calls():
    """Verify that two tool calls in a single reply are both reported."""

    # One assistant message carrying two tool calls with empty arguments.
    requested_calls = [
        {"id": "1", "function": {"name": "search", "arguments": "{}"}},
        {"id": "2", "function": {"name": "calculate", "arguments": "{}"}}
    ]
    canned_reply = {"choices": [{"message": {"tool_calls": requested_calls}}]}

    fake_client = MockChatCompletion([canned_reply])
    dispatcher = ToolDispatcher(ollama_client=fake_client)

    outcome = dispatcher.process("Search and calculate")

    assert len(outcome["tool_calls"]) == 2

Integration Tests

Test the full pipeline with a running Ollama instance:

import pytest
import requests

@pytest.fixture
def ollama_available():
    """Skip the requesting test unless a local Ollama server responds.

    Sends a GET request to ``http://localhost:11434/api/tags`` with a
    five-second timeout. The test is skipped with "Ollama not reachable" when
    the request itself fails, and with "Ollama not available" when the server
    answers with a status other than 200.
    """
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
    except requests.RequestException:
        # Only network-level failures belong here. A bare ``except`` would
        # also catch the exception ``pytest.skip`` raises (and
        # KeyboardInterrupt), masking the real skip reason.
        pytest.skip("Ollama not reachable")

    # Checked outside the ``try`` so this skip is reported with its own message.
    if response.status_code != 200:
        pytest.skip("Ollama not available")

    yield

@pytest.mark.integration
def test_full_tool_call_pipeline(ollama_available):
    """Test complete flow from prompt to tool result to final response.

    Streams a single user prompt through ``StreamingToolDispatcher`` with a
    stub ``get_time`` tool registered on the executor, then checks that a
    ``tool_call`` event was emitted and that it carries a ``time`` result.
    """
    # Imported locally so the test does not rely on module-level imports.
    import asyncio

    dispatcher = StreamingToolDispatcher()
    executor = AsyncToolExecutor()

    # Register test tool
    executor.register("get_time", lambda: {"time": "12:00"})

    # First call triggers tool
    messages = [
        {"role": "user", "content": "What time is it?"}
    ]
    tools = [{
        "name": "get_time",
        "description": "Get current time",
        # The stub tool takes no arguments, so the schema has no properties.
        "parameters": {"type": "object", "properties": {}},
    }]

    async def collect_events():
        """Drain the event stream, keeping the last tool call and content text."""
        tool_call = None
        final_response = None
        async for event in dispatcher.stream_with_tools(
            "llama3.2",
            messages,
            tools,
            tool_executor=executor
        ):
            if event["type"] == "tool_call":
                tool_call = event
            elif event["type"] == "content" and event["text"]:
                final_response = event["text"]
        return tool_call, final_response

    # ``async for`` is only valid inside a coroutine, so the stream is consumed
    # in ``collect_events`` and driven to completion from this synchronous test.
    # ``final_response`` is captured to aid debugging; it is not asserted on.
    tool_call, final_response = asyncio.run(collect_events())

    assert tool_call is not None
    assert "time" in tool_call.get("params", {}).get("result", {})

@pytest.mark.integration
def test_model_generates_valid_json(ollama_available):
    """Test that model generates valid JSON for tool parameters.

    Sends the same prompt ten times and counts the ``tool_call`` events whose
    ``params`` payload fails to parse as JSON; fewer than two failures are
    tolerated.
    """
    # Imported locally so the test does not rely on module-level imports.
    import asyncio
    import json

    dispatcher = StreamingToolDispatcher()

    messages = [{"role": "user", "content": "Get weather for Paris"}]
    tools = [{"name": "get_weather", "parameters": {"properties": {"location": {"type": "string"}}}}]

    async def count_invalid_json(attempts):
        """Return how many tool-call payloads were not parseable JSON."""
        invalid = 0
        for _ in range(attempts):
            async for event in dispatcher.stream_with_tools("llama3.2", messages, tools):
                if event["type"] != "tool_call":
                    continue
                try:
                    # NOTE(review): assumes ``params`` arrives as a raw JSON
                    # string; confirm against StreamingToolDispatcher.
                    json.loads(event["params"])
                except json.JSONDecodeError:
                    invalid += 1
        return invalid

    # ``async for`` is only valid inside a coroutine, so the streaming loop
    # lives in ``count_invalid_json`` and is driven from this synchronous test.
    invalid_json_count = asyncio.run(count_invalid_json(10))

    # Model should rarely generate invalid JSON
    assert invalid_json_count < 2, f"Too many invalid JSON responses: {invalid_json_count}/10"

Load Testing

Verify behavior under load:

import locust
from concurrent.futures import ThreadPoolExecutor

class FunctionCallingLoadTest(locust.HttpUser):
    """Locust user that mixes tool-calling chat requests with health checks.

    The task weights make a chat request ten times as likely to be picked as
    a health check.
    """

    # ``task`` is referenced through the ``locust`` module because only
    # ``import locust`` is in scope; a bare ``@task`` would raise NameError.
    @locust.task(10)
    def tool_call_request(self):
        """POST a chat request that offers the calculator tool."""
        self.client.post("/chat", json={
            "messages": [{"role": "user", "content": "Calculate 15 * 23"}],
            "tools": [{
                "name": "calculator",
                "description": "Evaluate an arithmetic expression",
                "parameters": {
                    "type": "object",
                    "properties": {"expression": {"type": "string"}},
                    "required": ["expression"],
                },
            }]
        })

    @locust.task(1)
    def health_check(self):
        """GET the health endpoint."""
        self.client.get("/health")

# Run with: locust -f load_test.py --host=http://localhost:8000
EXERCISE

Write tests for a tool that makes filesystem calls. Include tests for valid paths, path traversal attempts, missing files, and files exceeding size limits. Verify that your tests catch each of these security issues.