KEY INSIGHT
Rigorous benchmarking requires standardized metrics, diverse workloads, and statistical validation to ensure compression results are reproducible and comparable.
Benchmarking compression requires more than simple accuracy measurements. Thorough evaluation covers latency, memory, throughput, and accuracy across multiple conditions.
### Benchmarking Framework
```python
class CompressionBenchmark:
    """Benchmark a model for accuracy, latency, throughput, memory and size.

    ``test_data`` is an iterable of batches. Each batch is indexed with
    ``'input'`` (and ``'target'`` for the accuracy benchmark) and the
    values must support ``.to(device)``.

    The model is benchmarked where it already lives; this class never moves
    it, so the caller must place it on the device passed to the benchmarks.
    """

    def __init__(self, model, test_data):
        self.model = model
        self.test_data = test_data

    def full_benchmark(self, device='cuda'):
        """Run every benchmark and return the results keyed by metric name.

        Args:
            device: Device the latency, throughput and memory benchmarks
                move their input to.

        Returns:
            Dict with keys ``'accuracy'``, ``'latency'``, ``'throughput'``,
            ``'memory'`` and ``'model_size'``.

        NOTE(review): ``benchmark_throughput``, ``benchmark_memory`` and
        ``measure_model_size`` are not defined in this part of the class;
        they are expected to be provided by the rest of the class.
        """
        return {
            'accuracy': self.benchmark_accuracy(),
            'latency': self.benchmark_latency(device),
            'throughput': self.benchmark_throughput(device),
            'memory': self.benchmark_memory(device),
            'model_size': self.measure_model_size(),
        }

    def _model_device(self):
        """Return the device the model lives on.

        Uses the model's own ``device`` attribute when it has one, otherwise
        the device of its first parameter. A parameter-less model without a
        ``device`` attribute is treated as living on the CPU.
        """
        device = getattr(self.model, 'device', None)
        if device is not None:
            return device
        first_param = next(iter(self.model.parameters()), None)
        if first_param is None:
            return torch.device('cpu')
        return first_param.device

    def _first_batch(self):
        """Return the first batch of ``test_data`` by iteration.

        Iterating (rather than indexing with ``[0]``) matches how
        ``benchmark_accuracy`` consumes ``test_data`` and also works for
        iterables that are not subscriptable.

        Raises:
            ValueError: If ``test_data`` yields no batches.
        """
        batch = next(iter(self.test_data), None)
        if batch is None:
            raise ValueError("test_data is empty; cannot benchmark without a batch")
        return batch

    def benchmark_accuracy(self):
        """Measure top-1 accuracy with a 95% confidence interval.

        The standard error is the binomial ``sqrt(p * (1 - p) / n)`` and the
        interval is a Student-t interval around the accuracy. When the
        standard error is zero (accuracy exactly 0 or 1) the interval
        collapses to the point estimate instead of NaN.

        Returns:
            Dict with ``'accuracy'``, ``'confidence_interval'`` (a
            ``(low, high)`` tuple) and ``'std_error'``.

        Raises:
            ValueError: If ``test_data`` yields no samples.
        """
        self.model.eval()
        device = self._model_device()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in self.test_data:
                inputs = batch['input'].to(device)
                targets = batch['target'].to(device)
                preds = self.model(inputs).argmax(dim=1)
                correct += (preds == targets).sum().item()
                total += targets.shape[0]

        if total == 0:
            raise ValueError("test_data yielded no samples; accuracy is undefined")

        accuracy = correct / total
        se = float(np.sqrt(accuracy * (1 - accuracy) / total))
        if se == 0.0:
            # A zero scale makes the t-interval NaN; report the point estimate.
            ci = (accuracy, accuracy)
        else:
            from scipy import stats
            low, high = stats.t.interval(0.95, total - 1, loc=accuracy, scale=se)
            ci = (float(low), float(high))
        return {
            'accuracy': accuracy,
            'confidence_interval': ci,
            'std_error': se,
        }

    def benchmark_latency(self, device, warmup=10, iterations=100):
        """Measure per-call inference latency on the first batch.

        Args:
            device: Device the input is moved to. Any value accepted by
                ``torch.device`` works, e.g. ``'cuda'`` or ``'cuda:0'``.
            warmup: Number of untimed forward passes run first.
            iterations: Number of timed forward passes.

        Returns:
            Dict of latency statistics in milliseconds: ``'mean_ms'``,
            ``'p50_ms'``, ``'p95_ms'``, ``'p99_ms'`` and ``'std_ms'``.

        Raises:
            ValueError: If ``test_data`` is empty.
        """
        self.model.eval()
        # Compare the device *type* so 'cuda:0' and torch.device objects
        # also trigger synchronization; without it only launch time is timed.
        on_cuda = torch.device(device).type == 'cuda'
        if on_cuda:
            torch.cuda.empty_cache()
        test_input = self._first_batch()['input'].to(device)

        times = []
        # Inference only: autograd bookkeeping would distort the timings.
        with torch.no_grad():
            for _ in range(warmup):
                self.model(test_input)
            if on_cuda:
                torch.cuda.synchronize()

            for _ in range(iterations):
                start = time.perf_counter()
                self.model(test_input)
                if on_cuda:
                    torch.cuda.synchronize()
                times.append((time.perf_counter() - start) * 1000)  # ms

        return {
            'mean_ms': np.mean(times),
            'p50_ms': np.percentile(times, 50),
            'p95_ms': np.percentile(times, 95),
            'p99_ms': np.percentile(times, 99),
            'std_ms': np.std(times),
        }
```
### Throughput Measurement
```python
def benchmark_throughput(self, device, duration_seconds=5):
    """Measure sustained inference throughput on the first batch.

    The first batch of ``test_data`` is run repeatedly for about
    ``duration_seconds`` after 10 untimed warmup passes.

    Args:
        device: Device the input is moved to. Any value accepted by
            ``torch.device`` works, e.g. ``'cuda'`` or ``'cuda:0'``.
        duration_seconds: How long to keep issuing forward passes.

    Returns:
        Dict with ``'samples_per_second'``, ``'batches_per_second'`` and
        ``'batch_size'`` (the first dimension of the input).

    Raises:
        ValueError: If ``test_data`` is empty.
    """
    self.model.eval()
    # Compare the device *type* so 'cuda:0' and torch.device objects
    # are synchronized too.
    on_cuda = torch.device(device).type == 'cuda'

    # Take the first batch by iteration so non-subscriptable iterables work.
    first_batch = next(iter(self.test_data), None)
    if first_batch is None:
        raise ValueError("test_data is empty; cannot benchmark without a batch")
    test_input = first_batch['input'].to(device)
    batch_size = test_input.shape[0]

    # Inference only: autograd bookkeeping would lower the measured rate.
    with torch.no_grad():
        for _ in range(10):
            self.model(test_input)
        if on_cuda:
            torch.cuda.synchronize()

        # perf_counter is monotonic, unlike the wall clock.
        start_time = time.perf_counter()
        count = 0
        while time.perf_counter() - start_time < duration_seconds:
            self.model(test_input)
            count += 1
        # Wait for queued GPU work so ``elapsed`` covers every counted batch.
        if on_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

    return {
        'samples_per_second': count * batch_size / elapsed,
        'batches_per_second': count / elapsed,
        'batch_size': batch_size,
    }
def benchmark_memory(self, device):
    """Measure peak CUDA memory for one inference pass over the first batch.

    Args:
        device: Device the input is moved to. Any value accepted by
            ``torch.device`` works, e.g. ``'cuda'`` or ``'cuda:0'``.

    Returns:
        ``{'peak_memory_mb': <float>}`` on CUDA devices, or
        ``{'peak_memory_mb': None}`` otherwise (no CPU measurement).
    """
    # Compare the device *type* so 'cuda:0' and torch.device objects
    # are measured instead of falling through to the CPU result.
    on_cuda = torch.device(device).type == 'cuda'
    if on_cuda:
        torch.cuda.reset_peak_memory_stats()
    self.model.eval()

    # One batch is enough for a peak-memory reading.
    batch = next(iter(self.test_data), None)
    if batch is not None:
        # Inference only: with autograd on, tensors kept for backward
        # would inflate the peak.
        with torch.no_grad():
            self.model(batch['input'].to(device))

    if on_cuda:
        peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)  # MB
        return {'peak_memory_mb': peak_memory}
    return {'peak_memory_mb': None}  # CPU measurement not available
```
### Comparative Benchmarking
```python
def compare_compression_methods(original_model, compressed_models, test_data,
                                device='cuda'):
    """Benchmark a baseline and several compressed models side by side.

    Args:
        original_model: Uncompressed model used as the baseline.
        compressed_models: Mapping of configuration name to compressed model.
        test_data: Test data handed to every ``CompressionBenchmark``.
        device: Device passed to ``full_benchmark`` for every model.
            Defaults to ``'cuda'``, the default ``full_benchmark`` uses.

    Returns:
        Dict with ``'baseline'`` (the baseline's benchmark results) and
        ``'comparisons'``, a list with one entry per compressed model holding
        its ``'name'``, raw ``'results'`` and ``'relative'`` metrics.
        ``'memory_reduction'`` is ``None`` when either peak-memory reading
        is unavailable (``benchmark_memory`` reports ``None`` off CUDA).
    """
    baseline = CompressionBenchmark(original_model, test_data).full_benchmark(device)

    comparisons = []
    for name, compressed_model in compressed_models.items():
        compressed = CompressionBenchmark(compressed_model, test_data).full_benchmark(device)
        comparisons.append({
            'name': name,
            'results': compressed,
            'relative': {
                'accuracy_delta': (compressed['accuracy']['accuracy']
                                   - baseline['accuracy']['accuracy']),
                'latency_speedup': (baseline['latency']['mean_ms']
                                    / compressed['latency']['mean_ms']),
                'size_reduction': (baseline['model_size']
                                   / compressed['model_size']),
                'memory_reduction': _optional_ratio(
                    baseline['memory']['peak_memory_mb'],
                    compressed['memory']['peak_memory_mb'],
                ),
            },
        })

    return {'baseline': baseline, 'comparisons': comparisons}


def _optional_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or ``None`` if it is undefined."""
    if numerator is None or denominator is None or denominator == 0:
        return None
    return numerator / denominator
```
### Benchmark Reporting
```python
def generate_benchmark_report(results):
    """Print a human-readable benchmark report to stdout.

    Args:
        results: Dict with a ``'baseline'`` entry (providing
            ``['accuracy']['accuracy']``, ``['latency']['mean_ms']`` and
            ``['model_size']``) and a ``'comparisons'`` list whose items
            carry a ``'name'`` and a ``'relative'`` dict with
            ``'accuracy_delta'``, ``'latency_speedup'`` and
            ``'size_reduction'``.
    """
    rule = "=" * 60
    divider = "-" * 60

    print(rule)
    print("COMPRESSION BENCHMARK REPORT")
    print(rule)

    baseline = results['baseline']
    print("\nBaseline Model:")
    print(f" Accuracy: {baseline['accuracy']['accuracy']:.4f}")
    print(f" Latency (mean): {baseline['latency']['mean_ms']:.2f}ms")
    print(f" Model size: {baseline['model_size']:.2f}MB")

    print("\n" + divider)
    print(f"{'Method':<20} {'Acc Δ':<10} {'Speedup':<10} {'Size ↓':<10}")
    print(divider)

    for comp in results['comparisons']:
        rel = comp['relative']
        # Format each cell first, then pad it to the header's column width
        # so the values line up under their headings.
        delta_cell = f"{rel['accuracy_delta']:+.4f}"
        speedup_cell = f"{rel['latency_speedup']:.2f}x"
        size_cell = f"{rel['size_reduction']:.2f}x"
        print(f"{comp['name']:<20} {delta_cell:<10} {speedup_cell:<10} {size_cell:<10}")

    print(rule)
```