KEY INSIGHT
Compression choices should account for target hardware characteristics; different devices favor different compression strategies for optimal inference performance.
Not all compression techniques improve performance equally across hardware platforms. A 4-bit quantized model may be faster on GPUs with native low-bit integer kernels, but slower on CPUs without vectorized int4 operations, where the packed weights must be unpacked to a wider type before every computation.
### Hardware Profiling
```python
import time

import numpy as np
import torch
import torch.nn as nn
class HardwareProfiler:
    """Record wall-clock time and peak CUDA memory for individual operations.

    Measurements are kept in ``self.results`` keyed by operation name;
    profiling the same name again overwrites the earlier entry.
    """

    def __init__(self, device):
        """Store the device to profile on.

        Args:
            device: Device identifier such as ``'cpu'``, ``'cuda'``,
                ``'cuda:0'`` or a ``torch.device``.
        """
        self.device = device
        self.results = {}

    def _cuda_device(self):
        """Return ``self.device`` as a ``torch.device`` if it is CUDA, else ``None``."""
        # str() covers both plain strings and torch.device objects, so
        # 'cuda', 'cuda:1' and torch.device('cuda:1') are all recognised.
        name = str(self.device)
        if name.startswith('cuda'):
            return torch.device(name)
        return None

    def profile_operation(self, op_name, fn, *args, **kwargs):
        """
        Profile execution time and memory usage of an operation.

        Args:
            op_name: Key under which the measurement is stored.
            fn: Callable to run; invoked as ``fn(*args, **kwargs)``.

        Returns:
            Whatever ``fn`` returns. If ``fn`` raises, the exception
            propagates and no measurement is recorded.
        """
        cuda_device = self._cuda_device()

        if cuda_device is not None:
            torch.cuda.reset_peak_memory_stats(cuda_device)
            # Drain previously queued kernels so they are not billed to fn.
            torch.cuda.synchronize(cuda_device)

        start_time = time.perf_counter()
        result = fn(*args, **kwargs)
        if cuda_device is not None:
            # CUDA launches are asynchronous; wait for completion before
            # reading the clock.
            torch.cuda.synchronize(cuda_device)
        end_time = time.perf_counter()

        # Peak memory is only tracked for CUDA; other devices report 0.
        memory_mb = 0
        if cuda_device is not None:
            memory_mb = torch.cuda.max_memory_allocated(cuda_device) / 1e6

        self.results[op_name] = {
            'time_ms': (end_time - start_time) * 1000,
            'memory_mb': memory_mb
        }
        return result

    def report(self):
        """Print one line per profiled operation with its time and memory."""
        for op, metrics in self.results.items():
            print(f"{op}: {metrics['time_ms']:.2f}ms, {metrics['memory_mb']:.2f}MB")
```
### Hardware-Specific Optimization
Different targets require different strategies:
```python
class HardwareAwareCompressor:
    """Map a target hardware class to a recommended compression recipe."""

    def recommend_strategy(self, target_device):
        """
        Recommend compression strategy based on hardware.

        Args:
            target_device: One of ``'nvidia_gpu'``, ``'cpu'``,
                ``'mobile_npu'`` or ``'embedded_mcu'``. CUDA-style
                identifiers (``'cuda'``, ``'cuda:0'``, a CUDA
                ``torch.device``) are treated as ``'nvidia_gpu'``.

        Returns:
            A dict with keys ``'quantization_bits'``, ``'pruning_type'``,
            ``'layout'`` and ``'precision'``. Unknown devices fall back to
            the ``'cpu'`` recipe.
        """
        strategies = {
            'nvidia_gpu': {
                'quantization_bits': 8,  # INT8 tensor cores available
                'pruning_type': 'structured',  # Better memory access patterns
                'layout': 'NCHW',  # Optimized for convolution
                'precision': 'fp16'  # Tensor core compatible
            },
            'cpu': {
                'quantization_bits': 16,  # AVX2 may not have efficient int8
                'pruning_type': 'unstructured',  # More flexibility
                'layout': 'NHWC',  # Better cache utilization
                'precision': 'bf16'  # Better numerical stability on CPU
            },
            'mobile_npu': {
                'quantization_bits': 8,  # Fixed-function accelerators
                'pruning_type': 'channel',  # Matches fixed hardware shapes
                'layout': 'NCHW',  # Typical for mobile processors
                'precision': 'int8'  # Hardware natively supports
            },
            'embedded_mcu': {
                'quantization_bits': 4,  # Minimal memory
                'pruning_type': 'structured',  # Predictable access patterns
                'layout': 'NCHW',
                'precision': 'int4'  # Smallest representable
            }
        }

        # The profiling and benchmarking snippets identify GPUs as 'cuda';
        # without this alias such callers would silently get the CPU recipe.
        if str(target_device).startswith('cuda'):
            target_device = 'nvidia_gpu'

        return strategies.get(target_device, strategies['cpu'])
```
### Benchmark-Based Selection
```python
def benchmark_compression_strategies(model, test_input, target_device, test_loader=None):
    """
    Benchmark multiple compression strategies on target hardware.

    Relies on two helpers that are not shown in this snippet and are assumed
    to be defined elsewhere: ``apply_compression(model, strategy)`` and
    ``evaluate(compressed, test_loader)``.

    Args:
        model: Model handed to ``apply_compression`` for each strategy.
        test_input: Input passed to each compressed model when timing.
        target_device: Device identifier; anything starting with ``'cuda'``
            (e.g. ``'cuda'``, ``'cuda:0'``) enables CUDA synchronization.
        test_loader: Optional evaluation data forwarded to ``evaluate``.
            When omitted, ``'accuracy'`` is reported as ``None``.

    Returns:
        A list of dicts with keys ``'strategy'``, ``'mean_latency_ms'``,
        ``'std_ms'`` and ``'accuracy'``, sorted by ascending mean latency.
    """
    strategies = [
        {'bits': 8, 'pruning': 0.5, 'method': 'int8_quantize'},
        {'bits': 8, 'pruning': 0.7, 'method': 'int8_quantize'},
        {'bits': 4, 'pruning': 0.5, 'method': 'int4_quantize'},
        {'bits': 16, 'pruning': 0.5, 'method': 'bf16_quantize'},
    ]
    on_cuda = str(target_device).startswith('cuda')

    results = []
    for strategy in strategies:
        compressed = apply_compression(model, strategy)

        # Latency is measured for inference, so skip autograd bookkeeping.
        with torch.no_grad():
            # Warm-up runs
            for _ in range(3):
                compressed(test_input)
            if on_cuda:
                # Make sure warm-up kernels have finished; otherwise their
                # tail would be charged to the first timed run.
                torch.cuda.synchronize()

            # Timed runs
            times = []
            for _ in range(10):
                start = time.perf_counter()
                compressed(test_input)
                if on_cuda:
                    # CUDA launches are asynchronous; wait before stopping
                    # the clock.
                    torch.cuda.synchronize()
                times.append(time.perf_counter() - start)

        accuracy = None
        if test_loader is not None:
            accuracy = evaluate(compressed, test_loader)

        results.append({
            'strategy': strategy,
            'mean_latency_ms': np.mean(times) * 1000,
            'std_ms': np.std(times) * 1000,
            'accuracy': accuracy
        })

    return sorted(results, key=lambda x: x['mean_latency_ms'])
```
### Memory Bandwidth Considerations
Compression effectiveness depends on memory bandwidth constraints:
```python
def analyze_memory_bottleneck(model, input_shape):
    """
    Analyze whether model is compute-bound or memory-bound.
    Determines which compression helps most.

    The estimate is static: only shapes are inspected, no forward pass is
    run, and no particular device is required. Only ``nn.Conv2d`` layers
    contribute to compute and activation traffic.

    Approximations: every convolution is assumed to see the spatial size
    given in ``input_shape`` and to produce ``size // stride`` outputs per
    axis (padding, kernel size, dilation and earlier downsampling are
    ignored); activations are counted as 4 bytes each; the batch dimension
    is not taken into account.

    Args:
        model: The ``nn.Module`` to inspect. It is switched to eval mode.
        input_shape: Input shape; indices 2 and 3 are read as height and
            width (i.e. an ``(N, C, H, W)`` layout is assumed).

    Returns:
        Estimated multiply-accumulates per byte of parameter and
        activation memory. Values below 1.0 are reported as memory-bound.

    Raises:
        ValueError: If the model has neither parameters nor Conv2d
            activations, so no ratio can be formed.
    """
    bytes_per_activation = 4  # activations are assumed to be float32

    # Kept from the original behaviour; callers may rely on eval mode.
    model.eval()

    activations_memory = 0
    compute_ops = 0
    for module in model.modules():
        if not isinstance(module, nn.Conv2d):
            continue
        out_h = input_shape[2] // module.stride[0]
        out_w = input_shape[3] // module.stride[1]
        # Memory for output activation
        activations_memory += out_h * out_w * module.out_channels * bytes_per_activation
        # One multiply-accumulate per weight at every output position.
        compute_ops += module.weight.numel() * out_h * out_w

    # Count parameters in bytes so both memory terms share a unit.
    param_memory = sum(p.numel() * p.element_size() for p in model.parameters())

    total_memory = param_memory + activations_memory
    if total_memory == 0:
        raise ValueError("model has no parameters or Conv2d layers to analyze")

    # Compute-to-memory ratio
    ratio = compute_ops / total_memory
    if ratio < 1.0:
        print("Memory-bound: Focus on reducing model size (pruning, quantization)")
    else:
        print("Compute-bound: Focus on reducing compute (architecture changes)")
    return ratio
```
### Device-Specific Failure Modes
| Device | Common Failure | Mitigation |
|--------|---------------|------------|
| GPU | Unstructured pruning causes irregular memory access | Use structured pruning patterns (N:M) |
| CPU | int4 quantization without hardware support | Stay at int8 or use CPU-specific kernels |
| Mobile NPU | Pruning changes tensor shapes | Use channel pruning to preserve shapes |
| MCU | Quantization noise accumulation | Use symmetric quantization, reduce bit width gradually |