17. Evaluation Benchmarks
Chapter 17 of 18 · 15 min
Benchmarks provide standardized tests for measuring safety properties. A good benchmark covers diverse risk categories, includes adversarial examples, and resists gaming.
Common Safety Benchmarks
| Benchmark | Focus | Size |
|---|---|---|
| TruthfulQA | Truthfulness under misinformation | 817 |
| RealToxicityPrompts | Toxic generation | 100K |
| BIG-bench Safety | Multi-domain adversarial | 50 categories |
| HELM Safety | Holistic capability/safety evaluation | Modular |
Building a Custom Benchmark
import json
from typing import Callable
class SafetyBenchmark:
    """Framework for creating and running safety benchmarks.

    A benchmark is a named list of test cases. Running it feeds every
    prompt to a model callable, scores each response with an evaluator
    callable, and aggregates the per-case outcomes into summary metrics.

    Attributes:
        name: Label stored with exported results.
        test_cases: Case dicts in the order they were added.
        results: Per-case result dicts from the most recent ``run``.
    """

    def __init__(self, name: str):
        self.name = name
        self.test_cases = []
        self.results = []

    def add_test_case(
        self,
        category: str,
        prompt: str,
        expected_behavior: str,
        is_harmful: bool,
        metadata: dict = None
    ):
        """Add a single test case to the benchmark.

        Args:
            category: Group label used for the per-category breakdown.
            prompt: Input passed to the model callable in ``run``.
            expected_behavior: Free-form description stored with the case;
                this class does not interpret it.
            is_harmful: Label compared against the evaluator's ``'safe'``
                verdict when scoring the case.
            metadata: Optional extra data stored with the case; an empty
                dict is stored when omitted.
        """
        self.test_cases.append({
            'category': category,
            'prompt': prompt,
            'expected_behavior': expected_behavior,
            'is_harmful': is_harmful,
            'metadata': metadata or {}
        })

    def run(
        self,
        model: Callable,
        evaluator: Callable,
        max_cases: int = None
    ) -> dict:
        """Run the benchmark against a model and return aggregate metrics.

        Args:
            model: Called as ``model(prompt)``; its return value is stored
                as the case's ``'response'``.
            evaluator: Called as ``evaluator(response, case)``; must return
                a mapping that has a ``'safe'`` key.
            max_cases: Evaluate only the first ``max_cases`` cases. ``None``
                (the default) evaluates every case; ``0`` evaluates none.

        Returns:
            The dict produced by ``compute_metrics`` for this run.
        """
        # Compare against None explicitly: a truthiness test would turn
        # max_cases=0 into "run everything".
        cases = self.test_cases if max_cases is None else self.test_cases[:max_cases]
        self.results = []
        for case in cases:
            response = model(case['prompt'])
            evaluation = evaluator(response, case)
            self.results.append({
                **case,
                'response': response,
                'evaluation': evaluation,
                # A case counts as correct when the 'safe' verdict is the
                # opposite of the case's is_harmful label.
                'correct': evaluation['safe'] != case['is_harmful']
            })
        return self.compute_metrics()

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Return numerator / denominator, or 0.0 when denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def compute_metrics(self) -> dict:
        """Compute aggregate metrics across the stored results.

        Any ratio whose denominator is zero (no results at all, no harmful
        cases, or no non-harmful cases) is reported as ``0.0`` instead of
        raising ``ZeroDivisionError``.

        Returns:
            A dict with:
              * ``'overall_accuracy'``: fraction of results marked correct.
              * ``'by_category'``: category -> fraction correct.
              * ``'false_positive_rate'``: fraction of harmful cases that
                were marked incorrect.
              * ``'false_negative_rate'``: fraction of non-harmful cases
                that were marked incorrect.
        """
        # NOTE(review): the FP/FN names match the usual definitions only if
        # "safe" is taken as the positive class - confirm the intended
        # convention before comparing with other tools.
        by_category = {}
        n_correct = 0
        n_harmful = n_benign = 0
        wrong_harmful = wrong_benign = 0

        for result in self.results:
            correct = bool(result['correct'])
            tally = by_category.setdefault(
                result['category'], {'correct': 0, 'total': 0}
            )
            tally['total'] += 1
            tally['correct'] += correct
            n_correct += correct

            if result['is_harmful']:
                n_harmful += 1
                wrong_harmful += not correct
            else:
                n_benign += 1
                wrong_benign += not correct

        return {
            'overall_accuracy': self._ratio(n_correct, len(self.results)),
            'by_category': {
                cat: self._ratio(acc['correct'], acc['total'])
                for cat, acc in by_category.items()
            },
            'false_positive_rate': self._ratio(wrong_harmful, n_harmful),
            'false_negative_rate': self._ratio(wrong_benign, n_benign)
        }

    def save(self, path: str):
        """Export benchmark name, results and metrics to a JSON file.

        Args:
            path: Destination file; it is overwritten if it already exists.

        The results must be JSON-serializable, which includes whatever the
        model and evaluator callables returned.
        """
        # Build the payload before opening the file so a metrics error
        # cannot leave a freshly truncated file behind.
        payload = {
            'name': self.name,
            'results': self.results,
            'metrics': self.compute_metrics()
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)
EXERCISE
Construct a small safety benchmark (20 cases across 4 categories) for a domain of your choice. Run it against an API model and analyze where the model fails most frequently.