Benchmarking Custom Architecture — Custom LLM Architecture Design (Chapter 20)

Rigorous benchmarking distinguishes good architectures from marketing claims. This chapter covers building reliable evaluation pipelines.

Benchmarking Infrastructure

import math
import time
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Dict, List, Optional

import pandas as pd
import torch
import torch.nn as nn

@dataclass
class BenchmarkResult:
    """Metrics produced by one benchmark configuration.

    Attributes:
        name: Identifier of the benchmark configuration.
        throughput: Processed tokens per second.
        latency_p50: Median latency in milliseconds.
        latency_p95: 95th-percentile latency in milliseconds.
        latency_p99: 99th-percentile latency in milliseconds.
        memory_gb: Memory used, in gigabytes.
        flops: Estimated floating-point operations for the workload.
    """

    name: str
    throughput: float
    latency_p50: float
    latency_p95: float
    latency_p99: float
    memory_gb: float
    flops: float

class ArchitectureBenchmark:
    """
    Benchmark suite for custom architectures.

    Times forward passes of ``model`` on random token ids and reports
    throughput, latency percentiles and peak CUDA memory. The model is put
    into eval mode on construction.

    Args:
        model: Module invoked as ``model(tokens)`` where ``tokens`` is an
            integer tensor of shape ``(batch, seq_len)``.
        device: Device on which the random inputs are created. CUDA
            synchronization, bfloat16 autocast and memory tracking are only
            applied when this is a CUDA device.
        vocab_size: Exclusive upper bound of the random token ids.
    """
    def __init__(self, model: nn.Module, device='cuda', vocab_size: int = 32000):
        self.model = model
        self.device = device
        self.vocab_size = vocab_size
        # Defined up front so results can always report a memory figure.
        self._peak_mem = 0.0
        self.model.eval()

    def _uses_cuda(self) -> bool:
        """Return True when the benchmark device is a CUDA device."""
        return torch.device(self.device).type == 'cuda'

    def _synchronize(self):
        """Wait for queued CUDA work so wall-clock timings cover it."""
        if self._uses_cuda():
            torch.cuda.synchronize()

    @contextmanager
    def _inference_context(self):
        """Disable autograd and, on CUDA, enable bfloat16 autocast."""
        with torch.no_grad():
            if self._uses_cuda():
                with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
                    yield
            else:
                yield

    @staticmethod
    def _latency_percentiles_ms(samples):
        """Return (p50, p95, p99) of ``samples`` (seconds) in milliseconds."""
        ordered = sorted(samples)
        count = len(ordered)
        return (
            ordered[count // 2] * 1000,
            ordered[int(count * 0.95)] * 1000,
            ordered[int(count * 0.99)] * 1000,
        )

    @contextmanager
    def measure_memory(self):
        """
        Context manager for memory measurement.

        On exit, stores the peak CUDA allocation above the starting
        allocation, in GB, in ``self._peak_mem``. On non-CUDA devices the
        value is set to 0.0.
        """
        if not self._uses_cuda():
            self._peak_mem = 0.0
            yield
            return

        torch.cuda.reset_peak_memory_stats()
        start_mem = torch.cuda.memory_allocated()
        try:
            yield
        finally:
            # Record the peak even if the measured code raised.
            peak_mem = torch.cuda.max_memory_allocated()
            self._peak_mem = (peak_mem - start_mem) / 1e9

    def benchmark_generation(self, prompt_tokens: int, generation_length: int,
                             num_runs=20) -> "BenchmarkResult":
        """
        Benchmark autoregressive generation.
        Measures throughput, latency, and memory.

        Each run starts from the same random prompt and greedily appends
        ``generation_length`` tokens one at a time, re-running the model on
        the full sequence at every step.

        Args:
            prompt_tokens: Number of tokens in the random prompt.
            generation_length: Number of tokens generated per run.
            num_runs: Number of timed runs.

        Returns:
            BenchmarkResult whose latencies are per-run (whole generation)
            times and whose throughput counts generated tokens only.

        Raises:
            ValueError: If ``num_runs`` is smaller than 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        prompt = torch.randint(0, self.vocab_size, (1, prompt_tokens), device=self.device)

        latencies = []
        total_tokens = 0

        with self._inference_context():
            # Warmup under the same no-grad/autocast settings as the timed runs.
            warm = prompt
            for _ in range(5):
                _ = self.model(warm)
                extra = torch.randint(0, self.vocab_size, (1, 1), device=self.device)
                warm = torch.cat([warm, extra], dim=1)

            with self.measure_memory():
                for _ in range(num_runs):
                    # Restart from the prompt so every run times the same workload.
                    seq = prompt

                    self._synchronize()
                    start = time.perf_counter()

                    # Generate tokens one at a time (worst case latency)
                    for _ in range(generation_length):
                        logits = self.model(seq)
                        next_token = logits[:, -1:].argmax(dim=-1)
                        seq = torch.cat([seq, next_token], dim=1)

                    self._synchronize()
                    latencies.append(time.perf_counter() - start)
                    total_tokens += generation_length

        # Compute statistics
        total_time = sum(latencies)
        throughput = total_tokens / total_time
        p50, p95, p99 = self._latency_percentiles_ms(latencies)

        return BenchmarkResult(
            name=f"gen_p{prompt_tokens}_l{generation_length}",
            throughput=throughput,
            latency_p50=p50,
            latency_p95=p95,
            latency_p99=p99,
            memory_gb=self._peak_mem,
            flops=estimate_flops(self.model, prompt_tokens, generation_length)
        )

    def benchmark_batch_throughput(self, batch_size: int, seq_len: int,
                                   num_runs=50) -> "BenchmarkResult":
        """
        Benchmark batched inference throughput.

        Args:
            batch_size: Number of sequences per forward pass.
            seq_len: Number of tokens per sequence.
            num_runs: Number of timed forward passes.

        Returns:
            BenchmarkResult whose latencies are per-forward-pass times and
            whose throughput is input tokens processed per second.

        Raises:
            ValueError: If ``num_runs`` is smaller than 1.
        """
        if num_runs < 1:
            raise ValueError("num_runs must be at least 1")

        x = torch.randint(0, self.vocab_size, (batch_size, seq_len), device=self.device)

        times = []
        with self._inference_context():
            # Warmup under the same no-grad/autocast settings as the timed runs.
            for _ in range(5):
                _ = self.model(x)

            with self.measure_memory():
                for _ in range(num_runs):
                    self._synchronize()
                    start = time.perf_counter()

                    _ = self.model(x)

                    self._synchronize()
                    times.append(time.perf_counter() - start)

        total_time = sum(times)
        throughput = (batch_size * seq_len * num_runs) / total_time
        p50, p95, p99 = self._latency_percentiles_ms(times)

        return BenchmarkResult(
            name=f"batch_b{batch_size}_s{seq_len}",
            throughput=throughput,
            latency_p50=p50,
            latency_p95=p95,
            latency_p99=p99,
            memory_gb=self._peak_mem,
            flops=batch_size * seq_len * estimate_layer_flops(self.model)
        )


def estimate_flops(model, prompt_len, gen_len):
    """Estimate FLOPs for generating ``gen_len`` tokens.

    Uses the rule of thumb of roughly two FLOPs per parameter per token.
    ``prompt_len`` is accepted but does not enter the estimate.
    """
    param_count = 0
    for tensor in model.parameters():
        param_count += tensor.numel()

    flops_per_token = param_count * 2
    return flops_per_token * gen_len


def estimate_layer_flops(model):
    """Estimate per-layer FLOPs of a single forward pass.

    The real figure is architecture-specific; this stub always reports 0
    and does not inspect ``model``.
    """
    placeholder_estimate = 0
    return placeholder_estimate

Quality Benchmarking

import numpy as np
from typing import List, Dict

class QualityBenchmark:
    """
    Benchmark model quality on standard tasks.

    The model and tokenizer are used through a Hugging Face-style interface:
    ``tokenizer(text, return_tensors='pt')`` yields a mapping with
    ``input_ids`` that supports ``.to(device)``, ``model(**inputs)`` returns
    an object with ``.logits``, and ``model.generate(...)`` returns a batch
    of token id sequences that ``tokenizer.decode`` can turn back into text.
    """
    def __init__(self, model: nn.Module, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def _model_device(self):
        """Return the model's device, falling back to its first parameter."""
        device = getattr(self.model, 'device', None)
        if device is None:
            # Plain nn.Module has no ``device`` attribute.
            device = next(self.model.parameters()).device
        return device

    def evaluate_task(self, task: str, dataset: List[Dict]) -> Dict:
        """
        Evaluate on a specific task (MMLU, HellaSwag, etc.)

        Args:
            task: One of ``'mmlu'``, ``'hellaswag'`` or ``'humaneval'``.
            dataset: Task items; the expected keys are described on the
                per-task methods.

        Returns:
            The metrics dict produced by the task-specific evaluator.

        Raises:
            ValueError: If ``task`` is not a known task name.
        """
        if task == 'mmlu':
            return self._eval_mmlu(dataset)
        elif task == 'hellaswag':
            return self._eval_hellaswag(dataset)
        elif task == 'humaneval':
            return self._eval_humaneval(dataset)
        else:
            raise ValueError(f"Unknown task: {task}")

    def _eval_mmlu(self, dataset: List[Dict]) -> Dict:
        """
        Evaluate on MMLU multiple choice.

        Each item provides ``'prompt'`` (question text), ``'choices'`` (list
        of answer texts) and ``'answer'`` (index of the correct choice). The
        prediction is the option letter with the highest next-token logit
        after an ``Answer:`` cue.

        Returns:
            Dict with ``'accuracy'``, ``'correct'`` and ``'total'``; accuracy
            is 0.0 for an empty dataset.
        """
        correct = 0
        total = 0
        letter_ids = {}  # option letter -> token id, filled lazily

        for item in dataset:
            prompt = item['prompt']
            answer_idx = item['answer']
            choices = item['choices']

            # Construct prompt with lettered choices and an answer cue so the
            # next token is expected to be an option letter.
            letters = [chr(ord('A') + i) for i in range(len(choices))]
            options = "".join(
                f"\n{letter}: {choice}" for letter, choice in zip(letters, choices)
            )
            full_prompt = f"{prompt}{options}\nAnswer:"

            inputs = self.tokenizer(full_prompt, return_tensors='pt').to(self._model_device())

            with torch.no_grad():
                logits = self.model(**inputs).logits

            # Tokenize each letter in context so the id matches how the
            # tokenizer encodes a letter that follows "Answer:".
            for letter in letters:
                if letter not in letter_ids:
                    encoded = self.tokenizer(f"Answer: {letter}", add_special_tokens=False)
                    letter_ids[letter] = encoded['input_ids'][-1]
            candidate_ids = [letter_ids[letter] for letter in letters]

            # Compare only the option-letter logits; argmax is a choice index.
            pred = logits[0, -1, candidate_ids].argmax().item()

            if pred == answer_idx:
                correct += 1
            total += 1

        accuracy = correct / total if total else 0.0
        return {'accuracy': accuracy, 'correct': correct, 'total': total}

    def _continuation_logprob(self, context: str, continuation: str) -> float:
        """
        Return the mean per-token log-probability of ``continuation``.

        The continuation is scored as the tokens that follow ``context`` in
        the tokenization of ``"{context} {continuation}"``.
        NOTE(review): assumes the tokenizer adds no trailing special tokens
        to ``context``; otherwise the split point is off by those tokens.
        """
        context_len = len(self.tokenizer(context)['input_ids'])
        inputs = self.tokenizer(f"{context} {continuation}", return_tensors='pt').to(
            self._model_device()
        )
        input_ids = inputs['input_ids']

        with torch.no_grad():
            logits = self.model(**inputs).logits

        # Position i of the shifted log-probs predicts token i + 1.
        log_probs = torch.log_softmax(logits[0, :-1].float(), dim=-1)
        targets = input_ids[0, 1:]
        token_log_probs = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)

        # Keep at least one token even if the context tokenization is as
        # long as the full sequence.
        start = min(max(context_len - 1, 0), token_log_probs.shape[0] - 1)
        return token_log_probs[start:].mean().item()

    def _eval_hellaswag(self, dataset: List[Dict]) -> Dict:
        """
        Evaluate on HellaSwag-style sentence completion.

        Uses the same item layout as ``_eval_mmlu``: ``'prompt'`` (context),
        ``'choices'`` (candidate endings) and ``'answer'`` (index of the
        correct ending). The prediction is the ending with the highest
        length-normalized log-likelihood.

        Returns:
            Dict with ``'accuracy'``, ``'correct'`` and ``'total'``; accuracy
            is 0.0 for an empty dataset.
        """
        correct = 0
        total = 0

        for item in dataset:
            context = item['prompt']
            scores = [
                self._continuation_logprob(context, ending)
                for ending in item['choices']
            ]
            pred = max(range(len(scores)), key=scores.__getitem__)

            if pred == item['answer']:
                correct += 1
            total += 1

        accuracy = correct / total if total else 0.0
        return {'accuracy': accuracy, 'correct': correct, 'total': total}

    @staticmethod
    def _pass_at_k(n: int, c: int, k: int) -> float:
        """
        Unbiased pass@k estimate for one problem.

        Args:
            n: Number of sampled completions.
            c: Number of completions that passed the tests.
            k: Sample budget; clamped to ``n`` when fewer samples exist.

        Returns:
            ``1 - C(n - c, k) / C(n, k)``.
        """
        k = min(k, n)
        return 1.0 - math.comb(n - c, k) / math.comb(n, k)

    def _eval_humaneval(self, dataset: List[Dict]) -> Dict:
        """
        Evaluate on HumanEval (pass@k).

        Each item provides ``'prompt'`` and ``'test'``. Ten completions are
        sampled per problem, pass@10 is estimated per problem and the
        estimates are averaged over the dataset.

        Returns:
            Dict with ``'pass_at_k'`` (0.0 for an empty dataset) and ``'n'``,
            the number of problems evaluated.
        """
        k = 10
        num_samples = 10
        estimates = []

        for item in dataset:
            prompt = item['prompt']
            test = item['test']

            # Generate completions
            inputs = self.tokenizer(prompt, return_tensors='pt').to(self._model_device())

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=200,
                    # Sampling must be on for temperature/top_p to apply and
                    # for several distinct sequences to be returned.
                    do_sample=True,
                    temperature=0.2,
                    top_p=0.95,
                    num_return_sequences=num_samples  # For pass@k
                )

            completions = [
                self.tokenizer.decode(sequence, skip_special_tokens=True)
                for sequence in outputs
            ]

            num_passed = sum(
                1 for completion in completions
                if self._check_function(completion, test)
            )
            estimates.append(self._pass_at_k(len(completions), num_passed, k))

        n = len(estimates)
        pass_at_k = sum(estimates) / n if n else 0.0

        return {'pass_at_k': pass_at_k, 'n': n}

    def _check_function(self, code: str, test: str) -> bool:
        """
        Check if code passes tests (simplified).

        Placeholder that always returns False, so pass@k is 0 until a real
        (sandboxed) execution environment is plugged in here.
        """
        # In practice, use execution environment
        return False

Comparative Benchmarking

def compare_architectures(architectures: Dict[str, nn.Module],
                          benchmark_config: Dict) -> pd.DataFrame:
    """
    Compare multiple architectures on same benchmark suite.

    Args:
        architectures: Mapping from a display name to the model to benchmark.
        benchmark_config: Settings shared by all architectures:
            ``'throughput_configs'`` - optional list of dicts with
            ``'batch'`` and ``'seq_len'``;
            ``'tasks'`` - optional mapping of task name to dataset;
            ``'tokenizer'`` - tokenizer passed to ``QualityBenchmark``;
            ``'device'`` - optional benchmark device, default ``'cuda'``.

    Returns:
        DataFrame with one row per (architecture, throughput config) and one
        row per (architecture, task). Quality rows report 0 for a metric the
        task did not produce.
    """
    device = benchmark_config.get('device', 'cuda')
    throughput_configs = benchmark_config.get('throughput_configs', [])
    tasks = benchmark_config.get('tasks', {})
    tokenizer = benchmark_config.get('tokenizer')

    results = []

    for name, model in architectures.items():
        print(f"Benchmarking {name}...")

        benchmark = ArchitectureBenchmark(model, device=device)
        quality = QualityBenchmark(model, tokenizer)

        # Throughput benchmarks
        for config in throughput_configs:
            result = benchmark.benchmark_batch_throughput(
                batch_size=config['batch'],
                seq_len=config['seq_len']
            )
            results.append({
                'architecture': name,
                'type': 'throughput',
                'config': f"b{config['batch']}_s{config['seq_len']}",
                'throughput': result.throughput,
                'latency_p50': result.latency_p50,
                'latency_p95': result.latency_p95,
                'latency_p99': result.latency_p99,
                'memory_gb': result.memory_gb
            })

        # Quality benchmarks
        for task, dataset in tasks.items():
            q_result = quality.evaluate_task(task, dataset)
            results.append({
                'architecture': name,
                'type': 'quality',
                'config': task,
                'accuracy': q_result.get('accuracy', 0),
                'pass_at_k': q_result.get('pass_at_k', 0)
            })

    return pd.DataFrame(results)

Failure Mode: Benchmark Noise

# BUG: No warmup, no GPU synchronization (asynchronous kernels may still be running when the timer stops), and no variance tracking
def broken_benchmark(model, num_runs=10):
    """Deliberately flawed benchmark: averages raw timings with no warmup.

    Calls ``model`` on the module-level input ``x`` ``num_runs`` times and
    returns the mean wall-clock duration in seconds.
    """
    def _time_single_call():
        began = time.perf_counter()
        _ = model(x)
        finished = time.perf_counter()
        return finished - began

    durations = [_time_single_call() for _ in range(num_runs)]

    # No warmup, noisy results
    return sum(durations) / len(durations)

# FIX: Proper benchmark with warmup, GPU synchronization, and variance tracking
def correct_benchmark(model, num_warmup=20, num_runs=100):
    """Proper benchmark with warmup and statistical rigor.

    Calls ``model`` on the module-level input ``x``. Autograd is disabled so
    only the forward pass is timed, and CUDA work is synchronized before each
    timer stop when CUDA is available.

    Args:
        model: Callable benchmarked as ``model(x)``.
        num_warmup: Untimed calls made first to reach steady state.
        num_runs: Number of timed calls.

    Returns:
        Dict with ``'mean'``, ``'std'``, ``'p50'``, ``'p95'`` and ``'p99'``
        of the per-call durations in seconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    use_cuda = torch.cuda.is_available()

    with torch.no_grad():
        # Warmup to reach steady state
        for _ in range(num_warmup):
            _ = model(x)

        # Drain warmup kernels so they are not charged to the first timed run.
        if use_cuda:
            torch.cuda.synchronize()

        times = []
        for _ in range(num_runs):
            start = time.perf_counter()
            _ = model(x)
            if use_cuda:
                torch.cuda.synchronize()
            end = time.perf_counter()
            times.append(end - start)

    times = np.array(times)
    return {
        'mean': times.mean(),
        'std': times.std(),
        'p50': np.median(times),
        'p95': np.percentile(times, 95),
        'p99': np.percentile(times, 99)
    }