15. Compression Benchmarking

Chapter 15 of 18 · 25 min

KEY INSIGHT

Rigorous benchmarking requires standardized metrics, diverse workloads, and statistical validation to ensure compression results are reproducible and comparable.

Benchmarking compression requires more than simple accuracy measurements. Thorough evaluation covers latency, memory, throughput, and accuracy across multiple conditions.

### Benchmarking Framework

```python
import time

import numpy as np
import torch


class CompressionBenchmark:
    """Benchmark one model for accuracy, latency, throughput, memory, and size.

    Args:
        model: A ``torch.nn.Module``. It is never moved by this class, so it
            must already live on the device passed to the benchmark methods.
        test_data: An iterable of batches (e.g. a list or a ``DataLoader``).
            Each batch is a dict with an ``'input'`` tensor and a
            ``'target'`` tensor.
    """

    def __init__(self, model, test_data):
        self.model = model
        self.test_data = test_data

    def full_benchmark(self, device='cuda'):
        """Run the thorough benchmark suite.

        Args:
            device: Device the inputs are moved to for the latency,
                throughput, and memory passes.

        Returns:
            Dict with the keys ``'accuracy'``, ``'latency'``,
            ``'throughput'``, ``'memory'`` (each a dict of metrics) and
            ``'model_size'`` (a float, in MB).
        """
        return {
            'accuracy': self.benchmark_accuracy(),
            'latency': self.benchmark_latency(device),
            'throughput': self.benchmark_throughput(device),
            'memory': self.benchmark_memory(device),
            'model_size': self.measure_model_size(),
        }

    def measure_model_size(self):
        """Return the size of the model's parameters and buffers in MB.

        NOTE: only tensors exposed through ``parameters()`` and ``buffers()``
        are counted; weights packed elsewhere (as some quantized modules do)
        are not included.
        """
        tensors = list(self.model.parameters()) + list(self.model.buffers())
        total_bytes = sum(t.numel() * t.element_size() for t in tensors)
        return total_bytes / (1024 ** 2)

    def _model_device(self):
        """Return the device of the first parameter (CPU if there is none)."""
        first_param = next(self.model.parameters(), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def _first_input(self, device):
        """Return the ``'input'`` tensor of the first batch, moved to `device`.

        Iterates instead of indexing ``test_data[0]`` so that lists and
        ``DataLoader`` objects (which are not subscriptable) both work.

        Raises:
            ValueError: If `test_data` yields no batches.
        """
        try:
            batch = next(iter(self.test_data))
        except StopIteration:
            raise ValueError("test_data is empty") from None
        return batch['input'].to(device)

    @staticmethod
    def _sync(device):
        """Wait for queued CUDA work on `device`; no-op on other devices."""
        device = torch.device(device)
        if device.type == 'cuda':
            torch.cuda.synchronize(device)

    def benchmark_accuracy(self):
        """Measure task accuracy with a 95% confidence interval.

        Returns:
            Dict with ``'accuracy'``, ``'confidence_interval'`` (a
            ``(low, high)`` tuple clamped to ``[0, 1]``) and ``'std_error'``.

        Raises:
            ValueError: If `test_data` contains no samples.
        """
        from scipy import stats

        self.model.eval()
        device = self._model_device()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in self.test_data:
                inputs = batch['input'].to(device)
                targets = batch['target'].to(device)
                preds = self.model(inputs).argmax(dim=1)
                correct += (preds == targets).sum().item()
                total += targets.shape[0]

        if total == 0:
            raise ValueError("test_data produced no samples")

        accuracy = correct / total
        se = float(np.sqrt(accuracy * (1 - accuracy) / total))

        # The t-interval is undefined (NaN) with zero spread or a single
        # sample; report a degenerate interval in those cases.
        if se > 0 and total > 1:
            low, high = stats.t.interval(0.95, total - 1, loc=accuracy, scale=se)
            ci = (max(0.0, float(low)), min(1.0, float(high)))
        else:
            ci = (accuracy, accuracy)

        return {
            'accuracy': accuracy,
            'confidence_interval': ci,
            'std_error': se,
        }

    def benchmark_latency(self, device, warmup=10, iterations=100):
        """Measure single-batch inference latency after a warmup phase.

        Args:
            device: Device the first batch's input is moved to.
            warmup: Number of untimed forward passes run first.
            iterations: Number of timed forward passes.

        Returns:
            Dict with ``'mean_ms'``, ``'p50_ms'``, ``'p95_ms'``, ``'p99_ms'``
            and ``'std_ms'``, all in milliseconds.

        Raises:
            ValueError: If `iterations` is not positive or `test_data` is
                empty.
        """
        if iterations <= 0:
            raise ValueError("iterations must be positive")

        self.model.eval()
        if torch.device(device).type == 'cuda':
            torch.cuda.empty_cache()

        test_input = self._first_input(device)
        times = []

        # no_grad keeps autograd bookkeeping out of the measurement.
        with torch.no_grad():
            for _ in range(warmup):
                self.model(test_input)
            self._sync(device)

            for _ in range(iterations):
                start = time.perf_counter()
                self.model(test_input)
                # CUDA kernels launch asynchronously; wait before stopping
                # the clock so the timing covers the actual compute.
                self._sync(device)
                times.append((time.perf_counter() - start) * 1000)  # ms

        return {
            'mean_ms': np.mean(times),
            'p50_ms': np.percentile(times, 50),
            'p95_ms': np.percentile(times, 95),
            'p99_ms': np.percentile(times, 99),
            'std_ms': np.std(times),
        }
```

### Throughput Measurement

```python
    # CompressionBenchmark methods, continued

    def benchmark_throughput(self, device, duration_seconds=5):
        """Measure sustained throughput over a time period.

        Args:
            device: Device the first batch's input is moved to.
            duration_seconds: How long to keep issuing forward passes.

        Returns:
            Dict with ``'samples_per_second'``, ``'batches_per_second'`` and
            ``'batch_size'``.
        """
        self.model.eval()
        test_input = self._first_input(device)
        batch_size = test_input.shape[0]

        with torch.no_grad():
            # Warmup
            for _ in range(10):
                self.model(test_input)
            self._sync(device)

            # perf_counter is monotonic, unlike time.time().
            start_time = time.perf_counter()
            count = 0
            while time.perf_counter() - start_time < duration_seconds:
                self.model(test_input)
                count += 1

            # Drain queued work so `elapsed` covers every counted batch.
            self._sync(device)
            elapsed = time.perf_counter() - start_time

        return {
            'samples_per_second': count * batch_size / elapsed,
            'batches_per_second': count / elapsed,
            'batch_size': batch_size,
        }

    def benchmark_memory(self, device):
        """Measure peak memory usage of one forward pass.

        Returns:
            ``{'peak_memory_mb': float}`` on CUDA devices, or
            ``{'peak_memory_mb': None}`` otherwise (CPU measurement is not
            available).
        """
        self.model.eval()
        device = torch.device(device)
        on_cuda = device.type == 'cuda'

        if on_cuda:
            torch.cuda.reset_peak_memory_stats(device)

        # One batch is enough; no_grad avoids counting activations that
        # would only be kept alive for a backward pass.
        test_input = self._first_input(device)
        with torch.no_grad():
            self.model(test_input)

        if on_cuda:
            self._sync(device)
            peak_memory = torch.cuda.max_memory_allocated(device) / (1024 ** 2)  # MB
            return {'peak_memory_mb': peak_memory}

        return {'peak_memory_mb': None}  # CPU measurement not available
```

### Comparative Benchmarking

```python
def _ratio(numerator, denominator):
    """Return numerator / denominator, or None when it cannot be computed."""
    if numerator is None or not denominator:
        return None
    return numerator / denominator


def compare_compression_methods(original_model, compressed_models, test_data,
                                device='cuda'):
    """Compare multiple compression configurations side-by-side.

    Args:
        original_model: The uncompressed baseline model.
        compressed_models: Dict mapping a display name to a compressed model.
        test_data: Iterable of batches shared by all benchmarks.
        device: Device passed to every ``full_benchmark`` call.

    Returns:
        Dict with ``'baseline'`` (the baseline's full results) and
        ``'comparisons'`` (one entry per compressed model holding its name,
        its full results, and metrics relative to the baseline). A relative
        ratio is ``None`` when it cannot be computed, e.g. memory on CPU.
    """
    baseline = CompressionBenchmark(original_model, test_data).full_benchmark(device)

    results = {
        'baseline': baseline,
        'comparisons': [],
    }

    for name, compressed_model in compressed_models.items():
        benchmark = CompressionBenchmark(compressed_model, test_data)
        compressed_results = benchmark.full_benchmark(device)

        comparison = {
            'name': name,
            'results': compressed_results,
            'relative': {
                'accuracy_delta': (compressed_results['accuracy']['accuracy']
                                   - baseline['accuracy']['accuracy']),
                'latency_speedup': _ratio(
                    baseline['latency']['mean_ms'],
                    compressed_results['latency']['mean_ms']),
                'size_reduction': _ratio(
                    baseline['model_size'],
                    compressed_results['model_size']),
                'memory_reduction': _ratio(
                    baseline['memory']['peak_memory_mb'],
                    compressed_results['memory']['peak_memory_mb']),
            },
        }
        results['comparisons'].append(comparison)

    return results
```

### Benchmark Reporting

```python
def _format_ratio(value):
    """Format a ratio as e.g. '1.50x', or 'n/a' when it is None."""
    return "n/a" if value is None else f"{value:.2f}x"


def generate_benchmark_report(results):
    """Print a human-readable benchmark report.

    Args:
        results: The dict returned by ``compare_compression_methods``.
    """
    print("=" * 60)
    print("COMPRESSION BENCHMARK REPORT")
    print("=" * 60)

    baseline = results['baseline']
    print("\nBaseline Model:")
    print(f"  Accuracy: {baseline['accuracy']['accuracy']:.4f}")
    print(f"  Latency (mean): {baseline['latency']['mean_ms']:.2f}ms")
    print(f"  Model size: {baseline['model_size']:.2f}MB")

    print("\n" + "-" * 60)
    print(f"{'Method':<20} {'Acc Δ':<10} {'Speedup':<10} {'Size ↓':<10}")
    print("-" * 60)

    for comp in results['comparisons']:
        rel = comp['relative']
        # Pad each cell to the header's column width so the table lines up.
        acc_delta = f"{rel['accuracy_delta']:+.4f}"
        speedup = _format_ratio(rel['latency_speedup'])
        size_reduction = _format_ratio(rel['size_reduction'])
        print(f"{comp['name']:<20} {acc_delta:<10} {speedup:<10} {size_reduction:<10}")

    print("=" * 60)
```

EXERCISE

Build a benchmarking script that measures inference latency, memory usage, and accuracy for a base model and three compressed variants. Run 100 warmup iterations before collecting measurements. Report mean and P99 latency.