20. Benchmarking Custom Architecture
Chapter 20 of 24 · 30 min
Rigorous benchmarking distinguishes good architectures from marketing claims. This chapter covers building reliable evaluation pipelines.
Benchmarking Infrastructure
import math
import time
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, List, Optional

import pandas as pd
import torch
from torch import nn
@dataclass
class BenchmarkResult:
    """Metrics produced by a single benchmark configuration."""

    # Identifier of the configuration that produced this result.
    name: str
    # Tokens processed per second.
    throughput: float
    # Median latency in milliseconds.
    latency_p50: float
    # 95th-percentile latency in milliseconds.
    latency_p95: float
    # 99th-percentile latency in milliseconds.
    latency_p99: float
    # Peak memory in gigabytes.
    memory_gb: float
    # Estimated floating-point operations for the run.
    flops: float
class ArchitectureBenchmark:
    """
    Benchmark suite for custom architectures.

    Times forward passes of ``model`` and reports throughput, latency
    percentiles and peak memory as a ``BenchmarkResult``. The model is called
    as ``model(token_ids)`` with an integer tensor of shape (batch, seq) and
    must return logits whose last dimension indexes the vocabulary
    (``benchmark_generation`` takes ``logits[:, -1:].argmax(dim=-1)``).

    The model is switched to eval mode but is not moved to ``device``; the
    caller is responsible for placing it there.
    """

    # Number of untimed decode steps / forward passes run before measuring.
    _WARMUP_STEPS = 5

    def __init__(self, model: nn.Module, device='cuda'):
        """
        Args:
            model: Module to benchmark; put into eval mode here.
            device: Device the random inputs are created on (string or
                ``torch.device``).
        """
        self.model = model
        self.device = device
        # Reduce 'cuda', 'cuda:1' or torch.device(...) to the bare type
        # string; autocast and the CUDA-only calls below key off it.
        self._device_type = torch.device(device).type
        # Defined up front so a result can always read it.
        self._peak_mem = 0.0
        self.model.eval()

    def _sync(self) -> None:
        """Wait for queued GPU work so wall-clock timings are meaningful."""
        if self._device_type == 'cuda':
            torch.cuda.synchronize()

    def _autocast(self):
        """Return the bfloat16 autocast context for the benchmark device."""
        return torch.autocast(device_type=self._device_type,
                              dtype=torch.bfloat16)

    @staticmethod
    def _percentile_ms(sorted_seconds, fraction: float) -> float:
        """
        Pick the entry at index ``int(len * fraction)`` of an ascending list
        of durations in seconds and return it in milliseconds.
        """
        last = len(sorted_seconds) - 1
        index = min(int(len(sorted_seconds) * fraction), last)
        return sorted_seconds[index] * 1000

    def _greedy_decode(self, tokens, steps: int):
        """
        Append ``steps`` greedily chosen tokens to ``tokens``.

        Every step re-runs the model on the whole sequence (no KV cache),
        which is the worst case for per-token latency.
        """
        for _ in range(steps):
            logits = self.model(tokens)
            next_token = logits[:, -1:].argmax(dim=-1)
            tokens = torch.cat([tokens, next_token], dim=1)
        return tokens

    @contextmanager
    def measure_memory(self):
        """
        Context manager for memory measurement.

        On CUDA, stores in ``self._peak_mem`` the peak allocated memory
        reached inside the block minus the amount allocated on entry, in GB.
        On any other device no allocator statistics are read and
        ``self._peak_mem`` is set to 0.0.
        """
        if self._device_type != 'cuda':
            self._peak_mem = 0.0
            yield
            return
        torch.cuda.reset_peak_memory_stats()
        start_mem = torch.cuda.memory_allocated()
        try:
            yield
        finally:
            # Record the peak even if the measured code raised.
            peak_mem = torch.cuda.max_memory_allocated()
            self._peak_mem = (peak_mem - start_mem) / 1e9

    def benchmark_generation(self, prompt_tokens: int, generation_length: int,
                             num_runs=20,
                             vocab_size: int = 32000) -> "BenchmarkResult":
        """
        Benchmark autoregressive generation.
        Measures throughput, latency, and memory.

        Args:
            prompt_tokens: Length of the random prompt.
            generation_length: Tokens generated per timed run.
            num_runs: Number of timed runs (at least 1).
            vocab_size: Exclusive upper bound for the random prompt ids.

        Returns:
            BenchmarkResult whose latencies are per run (one run generates
            ``generation_length`` tokens) and whose throughput is generated
            tokens per second over all runs.

        Raises:
            ValueError: If ``num_runs`` is smaller than 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        prompt = torch.randint(0, vocab_size, (1, prompt_tokens),
                               device=self.device)
        latencies = []
        with torch.no_grad(), self._autocast():
            # Warm up under the same no_grad/autocast settings that are
            # timed, so one-off setup cost is not part of the measurement.
            self._greedy_decode(prompt, self._WARMUP_STEPS)
            self._sync()

            with self.measure_memory():
                for _ in range(num_runs):
                    self._sync()
                    start = time.perf_counter()
                    # Every run restarts from the same prompt so that all
                    # runs process identical sequence lengths.
                    self._greedy_decode(prompt, generation_length)
                    self._sync()
                    latencies.append(time.perf_counter() - start)

        total_tokens = generation_length * num_runs
        throughput = total_tokens / sum(latencies)
        latencies.sort()
        return BenchmarkResult(
            name=f"gen_p{prompt_tokens}_l{generation_length}",
            throughput=throughput,
            latency_p50=self._percentile_ms(latencies, 0.5),
            latency_p95=self._percentile_ms(latencies, 0.95),
            latency_p99=self._percentile_ms(latencies, 0.99),
            memory_gb=self._peak_mem,
            flops=estimate_flops(self.model, prompt_tokens, generation_length)
        )

    def benchmark_batch_throughput(self, batch_size: int, seq_len: int,
                                   num_runs=50,
                                   vocab_size: int = 32000) -> "BenchmarkResult":
        """
        Benchmark batched inference throughput.

        Args:
            batch_size: Number of sequences per forward pass.
            seq_len: Tokens per sequence.
            num_runs: Number of timed forward passes (at least 1).
            vocab_size: Exclusive upper bound for the random input ids.

        Returns:
            BenchmarkResult whose latencies are per forward pass and whose
            throughput is input tokens per second over all passes.

        Raises:
            ValueError: If ``num_runs`` is smaller than 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        batch = torch.randint(0, vocab_size, (batch_size, seq_len),
                              device=self.device)
        times = []
        with torch.no_grad(), self._autocast():
            # Warmup uses the same precision settings as the timed passes.
            for _ in range(self._WARMUP_STEPS):
                self.model(batch)
            self._sync()

            with self.measure_memory():
                for _ in range(num_runs):
                    self._sync()
                    start = time.perf_counter()
                    self.model(batch)
                    self._sync()
                    times.append(time.perf_counter() - start)

        throughput = (batch_size * seq_len * num_runs) / sum(times)
        times.sort()
        return BenchmarkResult(
            name=f"batch_b{batch_size}_s{seq_len}",
            throughput=throughput,
            latency_p50=self._percentile_ms(times, 0.5),
            latency_p95=self._percentile_ms(times, 0.95),
            latency_p99=self._percentile_ms(times, 0.99),
            memory_gb=self._peak_mem,
            flops=batch_size * seq_len * estimate_layer_flops(self.model)
        )
def estimate_flops(model, prompt_len, gen_len):
    """
    Estimate forward-pass FLOPs for one generation request.

    Uses the dense-model rule of thumb of about ``2 * params`` FLOPs per
    processed token and counts every token once: the ``prompt_len`` prompt
    tokens plus the ``gen_len`` generated tokens. Attention cost that grows
    with sequence length, and any recomputation done without a KV cache,
    are not included.

    Args:
        model: Object exposing ``parameters()`` whose items have ``numel()``.
        prompt_len: Number of prompt tokens.
        gen_len: Number of generated tokens.

    Returns:
        Estimated FLOPs as ``2 * params * (prompt_len + gen_len)``.
    """
    params = sum(p.numel() for p in model.parameters())
    return 2 * params * (prompt_len + gen_len)
def estimate_layer_flops(model):
    """
    Estimate forward-pass FLOPs per token for ``model``.

    ``ArchitectureBenchmark.benchmark_batch_throughput`` multiplies this
    value by ``batch_size * seq_len``, so it is a per-token figure. The exact
    number is architecture-specific; this default uses the dense-model rule
    of thumb of about ``2 * params`` FLOPs per token. It overcounts for
    models that only activate part of their parameters per token, so replace
    it with an exact count where that matters.

    Args:
        model: Object exposing ``parameters()`` whose items have ``numel()``.

    Returns:
        Approximate FLOPs needed to process one token in a forward pass.
    """
    params = sum(p.numel() for p in model.parameters())
    return 2 * params
Quality Benchmarking
import numpy as np
from typing import List, Dict
class QualityBenchmark:
    """
    Benchmark model quality on standard tasks.

    The model and tokenizer are used through the interface that Hugging Face
    Transformers exposes: ``tokenizer(text, return_tensors='pt')`` returning
    an object with ``.to(device)`` and an ``input_ids`` entry,
    ``tokenizer.encode`` / ``tokenizer.decode``, ``model(...).logits``,
    ``model.generate(...)`` and ``model.device``.
    """

    # Labels used for multiple-choice options, in order.
    _CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def __init__(self, model: nn.Module, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def evaluate_task(self, task: str, dataset: List[Dict]) -> Dict:
        """
        Evaluate on a specific task (MMLU, HellaSwag, etc.)

        Args:
            task: One of 'mmlu', 'hellaswag' or 'humaneval'.
            dataset: Task examples; the required keys are described on the
                corresponding ``_eval_*`` method.

        Returns:
            The metrics dict of the selected evaluator.

        Raises:
            ValueError: If ``task`` is not a known task name.
        """
        evaluators = {
            'mmlu': self._eval_mmlu,
            'hellaswag': self._eval_hellaswag,
            'humaneval': self._eval_humaneval,
        }
        evaluator = evaluators.get(task)
        if evaluator is None:
            raise ValueError(f"Unknown task: {task}")
        return evaluator(dataset)

    def _eval_mmlu(self, dataset: List[Dict]) -> Dict:
        """
        Evaluate on MMLU multiple choice.

        Each item needs 'prompt' (question text), 'choices' (list of option
        strings) and 'answer' (index of the correct option). The prediction
        is the option letter with the highest next-token logit after a
        trailing "Answer:" line.

        Returns:
            Dict with 'accuracy', 'correct' and 'total'; accuracy is 0.0 for
            an empty dataset.
        """
        correct = 0
        total = 0
        letter_token_ids = {}  # option letter -> token id, filled on demand
        for item in dataset:
            choices = item['choices']
            answer_idx = item['answer']
            letters = self._CHOICE_LETTERS[:len(choices)]

            # Question, one "X: option" line per choice, then an answer cue
            # so that the next token is the option letter.
            lines = [item['prompt']]
            lines.extend(f"{letter}: {choice}"
                         for letter, choice in zip(letters, choices))
            lines.append("Answer:")
            full_prompt = "\n".join(lines)

            inputs = self.tokenizer(full_prompt, return_tensors='pt').to(self.model.device)
            with torch.no_grad():
                logits = self.model(**inputs).logits

            for letter in letters:
                if letter not in letter_token_ids:
                    # Leading space: the letter follows "Answer:" as a new
                    # word. The last id is the letter itself.
                    ids = self.tokenizer.encode(f" {letter}",
                                                add_special_tokens=False)
                    letter_token_ids[letter] = ids[-1]
            candidate_ids = [letter_token_ids[letter] for letter in letters]

            # Compare only the option-letter logits so the prediction is a
            # choice index, directly comparable with answer_idx.
            pred = logits[0, -1, candidate_ids].argmax().item()
            if pred == answer_idx:
                correct += 1
            total += 1
        accuracy = correct / total if total else 0.0
        return {'accuracy': accuracy, 'correct': correct, 'total': total}

    def _continuation_logprob(self, context: str, continuation: str) -> float:
        """
        Return the mean per-token log-probability the model assigns to
        ``continuation`` when it follows ``context``.

        Context and continuation are tokenized separately and concatenated,
        so the boundary between them is always a token boundary. Returns
        ``-inf`` when the continuation tokenizes to nothing.
        """
        ctx_ids = self.tokenizer(context, return_tensors='pt')['input_ids']
        cont_ids = self.tokenizer(continuation, add_special_tokens=False,
                                  return_tensors='pt')['input_ids']
        num_cont = cont_ids.shape[1]
        if num_cont == 0:
            return float('-inf')
        input_ids = torch.cat([ctx_ids, cont_ids], dim=1).to(self.model.device)
        with torch.no_grad():
            logits = self.model(input_ids=input_ids).logits
        # Position i predicts token i + 1, so drop the last position and
        # line the rest up with the tokens shifted by one.
        log_probs = torch.log_softmax(logits[0, :-1].float(), dim=-1)
        targets = input_ids[0, 1:]
        token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        return token_log_probs[-num_cont:].mean().item()

    def _eval_hellaswag(self, dataset: List[Dict]) -> Dict:
        """
        Evaluate on HellaSwag sentence completion.

        Each item needs 'ctx' (context text), 'endings' (list of candidate
        endings) and 'label' (index of the correct ending, int or numeric
        string), the field names of the public HellaSwag dataset. The
        prediction is the ending with the highest length-normalised
        log-probability given the context.

        Returns:
            Dict with 'accuracy', 'correct' and 'total'; accuracy is 0.0 for
            an empty dataset.
        """
        correct = 0
        total = 0
        for item in dataset:
            context = item['ctx']
            label = int(item['label'])
            scores = [self._continuation_logprob(context, " " + ending)
                      for ending in item['endings']]
            pred = max(range(len(scores)), key=scores.__getitem__)
            if pred == label:
                correct += 1
            total += 1
        accuracy = correct / total if total else 0.0
        return {'accuracy': accuracy, 'correct': correct, 'total': total}

    @staticmethod
    def _pass_at_k(n: int, c: int, k: int) -> float:
        """
        Pass@k estimate for one problem: the probability that at least one
        of ``k`` samples drawn without replacement from ``n`` generated
        samples, ``c`` of which pass, is a passing one.
        """
        if c == 0:
            return 0.0
        if n - c < k:
            # Fewer than k failing samples: any k samples contain a pass.
            return 1.0
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)

    def _eval_humaneval(self, dataset: List[Dict], k: int = 10,
                        num_samples: int = 10) -> Dict:
        """
        Evaluate on HumanEval (pass@k).

        Each item needs 'prompt' and 'test'. ``num_samples`` completions are
        sampled per problem, each is checked with ``_check_function``, and
        the per-problem pass@k estimates are averaged.

        Args:
            dataset: HumanEval problems.
            k: The k in pass@k.
            num_samples: Completions sampled per problem; must be >= k.

        Returns:
            Dict with 'pass_at_k' (mean over problems, 0.0 for an empty
            dataset) and 'n' (number of problems).

        Raises:
            ValueError: If not ``1 <= k <= num_samples``.
        """
        if not 1 <= k <= num_samples:
            raise ValueError("k must satisfy 1 <= k <= num_samples")

        per_problem = []
        for item in dataset:
            prompt = item['prompt']
            test = item['test']

            inputs = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    # temperature/top_p only apply when sampling, and
                    # several distinct sequences require sampling too.
                    do_sample=True,
                    temperature=0.2,
                    top_p=0.95,
                    num_return_sequences=num_samples
                )
            completions = [
                self.tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in outputs
            ]

            num_passed = sum(
                1 for completion in completions
                if self._check_function(completion, test)
            )
            per_problem.append(
                self._pass_at_k(len(completions), num_passed, k)
            )

        n = len(per_problem)
        pass_at_k = sum(per_problem) / n if n else 0.0
        return {'pass_at_k': pass_at_k, 'n': n}

    def _check_function(self, code: str, test: str) -> bool:
        """
        Check if code passes tests (simplified).

        Placeholder: always returns False, so pass@k stays 0 until this is
        implemented. A real implementation has to execute model-generated
        code, which is untrusted; run it in an isolated, resource-limited
        sandbox rather than with exec() in this process.
        """
        return False
Comparative Benchmarking
def compare_architectures(architectures: Dict[str, nn.Module],
                          benchmark_config: Dict) -> pd.DataFrame:
    """
    Run one shared benchmark suite over several architectures.

    Args:
        architectures: Mapping from a display name to the model to measure.
        benchmark_config: Settings read by key:
            'tokenizer' is handed to QualityBenchmark,
            'throughput_configs' is an iterable of dicts with 'batch' and
            'seq_len', and 'tasks' maps a task name to its dataset.

    Returns:
        DataFrame with one row per (architecture, throughput config) and one
        row per (architecture, task), in that order for each architecture.
    """

    def throughput_row(arch_name, cfg, measured):
        # Row layout for a batched-throughput measurement.
        return {
            'architecture': arch_name,
            'type': 'throughput',
            'config': f"b{cfg['batch']}_s{cfg['seq_len']}",
            'throughput': measured.throughput,
            'latency_p50': measured.latency_p50,
            'memory_gb': measured.memory_gb
        }

    def quality_row(arch_name, task_name, scores):
        # Row layout for a quality task; absent metrics are reported as 0.
        return {
            'architecture': arch_name,
            'type': 'quality',
            'config': task_name,
            'accuracy': scores.get('accuracy', 0),
            'pass_at_k': scores.get('pass_at_k', 0)
        }

    rows = []
    for arch_name, net in architectures.items():
        print(f"Benchmarking {arch_name}...")
        speed_suite = ArchitectureBenchmark(net)
        quality_suite = QualityBenchmark(net, benchmark_config['tokenizer'])

        # Speed first, then quality, for every architecture.
        for cfg in benchmark_config['throughput_configs']:
            measured = speed_suite.benchmark_batch_throughput(
                batch_size=cfg['batch'],
                seq_len=cfg['seq_len']
            )
            rows.append(throughput_row(arch_name, cfg, measured))

        for task_name, examples in benchmark_config['tasks'].items():
            scores = quality_suite.evaluate_task(task_name, examples)
            rows.append(quality_row(arch_name, task_name, scores))

    return pd.DataFrame(rows)
Failure Mode: Benchmark Noise
# BUG: No warmup, no CUDA synchronization (GPU work runs asynchronously, so the timer can stop before it finishes), and no variance tracking
def broken_benchmark(model, num_runs=10):
    """
    Deliberately flawed timing loop, kept as a failure-mode example.

    Calls ``model`` on the module-level input ``x`` ``num_runs`` times with
    no warmup and no device synchronization, and returns the mean wall-clock
    time per call in seconds.
    """
    durations = []
    for _run in range(num_runs):
        began = time.perf_counter()
        model(x)
        finished = time.perf_counter()
        durations.append(finished - began)
    # Plain mean only: no warmup was run and no spread is reported.
    mean_seconds = sum(durations) / len(durations)
    return mean_seconds
# FIX: Proper benchmark with warmup and variance tracking
def correct_benchmark(model, num_warmup=20, num_runs=100, inputs=None):
    """
    Proper benchmark with warmup and statistical rigor.

    Args:
        model: Callable invoked as ``model(inputs)``.
        num_warmup: Untimed calls made first to reach steady state.
        num_runs: Timed calls (at least 1).
        inputs: Argument passed to ``model``. When omitted, the module-level
            variable ``x`` is used.

    Returns:
        Dict with 'mean', 'std', 'p50', 'p95' and 'p99' of the per-call
        wall-clock time in seconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")
    if inputs is None:
        inputs = x

    # Synchronize only when CUDA exists, so the function also runs on
    # CPU-only machines.
    use_cuda = torch.cuda.is_available()

    def sync():
        if use_cuda:
            torch.cuda.synchronize()

    times = []
    # Inference only: without no_grad, autograd bookkeeping would be timed.
    with torch.no_grad():
        # Warmup to reach steady state
        for _ in range(num_warmup):
            model(inputs)
        sync()

        for _ in range(num_runs):
            start = time.perf_counter()
            model(inputs)
            # GPU work is asynchronous; wait for it before stopping the
            # clock.
            sync()
            times.append(time.perf_counter() - start)

    times = np.array(times)
    return {
        'mean': times.mean(),
        'std': times.std(),
        'p50': np.median(times),
        'p95': np.percentile(times, 95),
        'p99': np.percentile(times, 99)
    }
EXERCISE
Benchmark your custom architecture against a baseline (e.g., Llama) using both throughput and quality metrics. Create a summary table with speedup ratios and accuracy differences for each task.