11. Edge Benchmarking
Chapter 11 of 18 · 20 min
Accurate benchmarking requires controlling for thermal state, background processes, and measurement methodology. Edge devices exhibit high variance in inference times due to thermal throttling, OS scheduling, and memory pressure from concurrent processes.
Benchmarking infrastructure setup:
# Isolate CPU cores for consistent measurement (Linux)
# Add to kernel command line: isolcpus=2,3
# Then pin inference process
# (taskset -c 2-3 restricts benchmark.py to CPUs 2 and 3, the cores named in isolcpus above)
taskset -c 2-3 python benchmark.py
# Disable throttling for cold benchmarking
# Warning: only for brief tests, damages hardware if sustained
# NOTE(review): this writes to cpu0's cpufreq directory, but the benchmark above is
# pinned to CPUs 2-3 -- confirm which cores should be targeted.
# NOTE(review): scaling_min_freq looks like the lower bound of the allowed frequency
# range; it is not clear that writing 0 to it disables throttling -- confirm the
# intended knob (possibly scaling_governor) before relying on this.
echo 0 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
Python benchmarking with statistics:
import numpy as np
import time
from typing import Callable, Dict, List
class EdgeBenchmark:
    """Time repeated calls to an inference function and summarise the latency.

    A run consists of ``warmup_rounds`` untimed calls followed by
    ``measurement_rounds`` timed calls. Each timed call contributes one
    latency sample, in milliseconds, to ``self.results``.
    """

    def __init__(self, warmup_rounds=10, measurement_rounds=100):
        """Store the number of untimed warmup calls and timed measurement calls."""
        self.warmup = warmup_rounds
        self.rounds = measurement_rounds
        # Latency samples (ms) from the most recent measure() call.
        self.results: List[float] = []

    def measure(self, inference_fn: Callable, *args, **kwargs) -> Dict:
        """Run warmup, then time ``inference_fn(*args, **kwargs)`` repeatedly.

        Args:
            inference_fn: Callable to benchmark.
            *args, **kwargs: Forwarded unchanged to every call.

        Returns:
            The summary dict produced by ``_compute_stats``.

        Raises:
            ValueError: If no timed rounds were executed (``rounds`` < 1).
        """
        # Warmup phase: results are discarded.
        for _ in range(self.warmup):
            inference_fn(*args, **kwargs)

        # Measurement phase: reset first so a failed run never mixes with
        # samples from a previous one.
        self.results = []
        for _ in range(self.rounds):
            iteration_start = time.perf_counter_ns()
            inference_fn(*args, **kwargs)
            iteration_end = time.perf_counter_ns()
            # perf_counter_ns() is in nanoseconds; store milliseconds.
            self.results.append((iteration_end - iteration_start) / 1e6)
        return self._compute_stats()

    def _compute_stats(self) -> Dict:
        """Summarise ``self.results`` (milliseconds) as plain Python floats.

        Returns:
            Dict with ``mean_ms``, ``std_ms``, ``min_ms``, ``max_ms`` and the
            ``p50``/``p90``/``p95``/``p99`` percentiles.

        Raises:
            ValueError: If ``self.results`` is empty.
        """
        if not self.results:
            # Without this guard the min/max lookups below fail with an
            # unhelpful IndexError.
            raise ValueError(
                "no latency samples recorded; measurement_rounds must be >= 1"
            )

        # Sorted in place, as before: callers may rely on results being ordered.
        self.results.sort()
        samples = np.asarray(self.results, dtype=np.float64)
        # One percentile call instead of four separate passes over the data.
        p50, p90, p95, p99 = np.percentile(samples, [50, 90, 95, 99])
        return {
            "mean_ms": float(samples.mean()),
            "std_ms": float(samples.std()),
            "min_ms": float(samples[0]),
            "max_ms": float(samples[-1]),
            "p50_ms": float(p50),
            "p90_ms": float(p90),
            "p95_ms": float(p95),
            "p99_ms": float(p99),
        }
# Usage with ONNX Runtime
# NOTE(review): `ort` is presumably `onnxruntime` imported as `ort`; that import
# is not shown in this snippet and must be in scope -- confirm.
benchmark = EdgeBenchmark(warmup_rounds=20, measurement_rounds=1000)
session = ort.InferenceSession("model.onnx")
# Build the input tensor once, outside the timed callable, so the measured
# latency covers session.run() only and not random-number generation or the
# float32 conversion.
input_tensor = np.random.randn(1, 3, 224, 224).astype(np.float32)
stats = benchmark.measure(
    lambda: session.run(None, {"input": input_tensor})
)
print(f"Mean: {stats['mean_ms']:.2f}ms, P95: {stats['p95_ms']:.2f}ms, Std: {stats['std_ms']:.2f}ms")
Memory footprint measurement on Raspberry Pi:
import resource
import psutil
import os
def measure_memory():
    """Report memory usage of the current process in megabytes.

    Returns:
        Dict with ``rss_mb`` (resident set size), ``vms_mb`` (virtual memory
        size) and ``peak_mb`` (peak resident set size from ``getrusage``).
    """
    import sys  # local import: only needed for the ru_maxrss unit check below

    process = psutil.Process(os.getpid())
    # Take a single snapshot so rss and vms describe the same instant.
    mem = process.memory_info()
    # Resident Set Size (actual physical memory used)
    rss_mb = mem.rss / 1024 / 1024
    # Virtual Memory Size
    vms_mb = mem.vms / 1024 / 1024
    # Peak memory from resource module. ru_maxrss is reported in kilobytes on
    # Linux but in bytes on macOS, so the divisor depends on the platform.
    peak_raw = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    peak_divisor = 1024 * 1024 if sys.platform == "darwin" else 1024
    peak_mb = peak_raw / peak_divisor
    return {"rss_mb": rss_mb, "vms_mb": vms_mb, "peak_mb": peak_mb}
Thermal benchmarking requires sustained load observation:
import os
def read_cpu_temperature(zone_path='/sys/class/thermal/thermal_zone0/temp'):
    """Read the CPU temperature from a sysfs thermal zone file.

    Args:
        zone_path: File holding the reading as an integer in thousandths of a
            degree. Defaults to thermal zone 0.

    Returns:
        The temperature as a float (file value / 1000), or None when the file
        is missing, unreadable, or does not contain an integer.
    """
    try:
        with open(zone_path, 'r') as f:
            raw = f.read()
    except OSError:
        # Covers a missing file as well as permission and I/O errors.
        return None
    try:
        return int(raw) / 1000.0
    except ValueError:
        # Empty or malformed sensor output is treated as "no reading".
        return None
# Track temperature during inference loop
temps = []
for _ in range(100):
    session.run(...)
    reading = read_cpu_temperature()
    # read_cpu_temperature() returns None when the sensor cannot be read;
    # keeping None in the list would make min()/max() raise TypeError.
    if reading is not None:
        temps.append(reading)
if temps:
    print(f"Temperature range: {min(temps):.1f}°C - {max(temps):.1f}°C")
else:
    print("Temperature range: unavailable (no sensor readings)")
Multi-batch benchmarking reveals batching efficiency:
def benchmark_batching(interpreter, batch_sizes=(1, 2, 4, 8, 16),
                       input_shape=(1, 3, 224, 224), num_runs=50):
    """Measure latency and throughput of an interpreter at several batch sizes.

    Args:
        interpreter: Object exposing ``get_input_details``,
            ``resize_tensor_input``, ``allocate_tensors``, ``set_tensor`` and
            ``invoke`` (presumably a TFLite ``Interpreter`` -- confirm).
        batch_sizes: Batch sizes to benchmark.
        input_shape: Shape of one input batch; only the dimensions after the
            first are used, the first is replaced by each batch size.
        num_runs: Number of timed set_tensor + invoke iterations per batch size.

    Returns:
        Dict mapping each batch size to ``{"latency_ms", "throughput_samples_per_sec"}``,
        where latency is the mean time per iteration.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be >= 1")

    # The tensor index must come from the interpreter itself; take the first input.
    input_index = interpreter.get_input_details()[0]["index"]
    sample_dims = tuple(input_shape[1:])

    results = {}
    for batch in batch_sizes:
        batch_shape = (batch, *sample_dims)
        # The input tensor has to be resized and re-allocated before data of a
        # different batch dimension can be set.
        interpreter.resize_tensor_input(input_index, list(batch_shape))
        interpreter.allocate_tensors()
        input_data = np.random.randn(*batch_shape).astype(np.float32)

        # Measure throughput
        start = time.perf_counter()
        for _ in range(num_runs):
            interpreter.set_tensor(input_index, input_data)
            interpreter.invoke()
        elapsed = (time.perf_counter() - start) / num_runs

        results[batch] = {
            "latency_ms": elapsed * 1000,
            # Guard against a zero reading from a coarse clock.
            "throughput_samples_per_sec": (
                batch / elapsed if elapsed > 0 else float("inf")
            ),
        }
    return results
EXERCISE
Build an edge benchmark suite measuring latency percentiles, memory footprint, and CPU temperature over sustained inference, then publish results as a reproducible notebook.