Edge Benchmarking — Edge AI: Mobile and IoT (Chapter 11)

Accurate benchmarking requires controlling for thermal state, background processes, and measurement methodology. Edge devices exhibit high variance in inference times due to thermal throttling, OS scheduling, and memory pressure from concurrent processes.

Benchmarking infrastructure setup:

# Isolate CPU cores for consistent measurement (Linux)
# Add to kernel command line: isolcpus=2,3
# Then pin the inference process to the isolated cores
taskset -c 2-3 python benchmark.py

# Disable throttling for cold benchmarking
# Warning: only for brief tests; sustained use can damage hardware
# NOTE(review): this writes to cpu0's scaling_min_freq (the minimum-frequency
# floor), while the benchmark above is pinned to cores 2-3. Confirm this is the
# intended knob and core; it does not obviously disable thermal throttling.
echo 0 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq

Python benchmarking with statistics:

import numpy as np
import time
from typing import Callable, Dict, List

class EdgeBenchmark:
    """Time repeated calls of an inference function and summarise the latencies.

    Each call to :meth:`measure` runs untimed warmup calls, then timed calls,
    and returns summary statistics in milliseconds.
    """

    def __init__(self, warmup_rounds=10, measurement_rounds=100):
        # Number of untimed calls made before measurement starts.
        self.warmup = warmup_rounds
        # Number of timed calls per measure() invocation.
        self.rounds = measurement_rounds
        # Per-call latencies (ms) of the latest measure() run, in call order.
        self.results: List[float] = []

    def measure(self, inference_fn: Callable, *args, **kwargs) -> Dict:
        """Run ``inference_fn(*args, **kwargs)`` repeatedly and time each call.

        Args:
            inference_fn: Callable to benchmark.
            *args: Positional arguments forwarded to ``inference_fn``.
            **kwargs: Keyword arguments forwarded to ``inference_fn``.

        Returns:
            The statistics dictionary produced by :meth:`_compute_stats`.

        Raises:
            ValueError: If no timed rounds were executed.
        """
        # Warmup phase: results are discarded.
        for _ in range(self.warmup):
            inference_fn(*args, **kwargs)

        # Measurement phase: nanosecond counter, stored as milliseconds.
        self.results = []
        for _ in range(self.rounds):
            iteration_start = time.perf_counter_ns()
            inference_fn(*args, **kwargs)
            iteration_end = time.perf_counter_ns()
            self.results.append((iteration_end - iteration_start) / 1e6)

        return self._compute_stats()

    def _compute_stats(self) -> Dict:
        """Summarise ``self.results`` without reordering it.

        Returns:
            Dict with ``mean_ms``, ``std_ms``, ``min_ms``, ``max_ms`` and the
            ``p50``/``p90``/``p95``/``p99`` percentiles (all in milliseconds).

        Raises:
            ValueError: If ``self.results`` is empty.
        """
        if not self.results:
            raise ValueError(
                "no measurements recorded; measurement_rounds must be >= 1"
            )

        # Work on a separate array so self.results keeps its chronological
        # order (useful for spotting drift over the course of a run).
        samples = np.asarray(self.results, dtype=float)
        p50, p90, p95, p99 = np.percentile(samples, [50, 90, 95, 99])
        return {
            "mean_ms": np.mean(samples),
            "std_ms": np.std(samples),
            "min_ms": min(self.results),
            "max_ms": max(self.results),
            "p50_ms": p50,
            "p90_ms": p90,
            "p95_ms": p95,
            "p99_ms": p99,
        }

# Usage with ONNX Runtime
# NOTE: `ort` is presumably `import onnxruntime as ort`; the import is not
# shown in this snippet.
benchmark = EdgeBenchmark(warmup_rounds=20, measurement_rounds=1000)
session = ort.InferenceSession("model.onnx")

# Build the input once, outside the timed callable, so that random-number
# generation and the float32 cast are not counted as inference latency.
input_tensor = np.random.randn(1, 3, 224, 224).astype(np.float32)

stats = benchmark.measure(lambda: session.run(None, {"input": input_tensor}))

print(f"Mean: {stats['mean_ms']:.2f}ms, P95: {stats['p95_ms']:.2f}ms, Std: {stats['std_ms']:.2f}ms")

Memory footprint measurement on Raspberry Pi:

import resource
import psutil
import os

def measure_memory():
    """Report memory metrics for the current process (Linux systems).

    Returns:
        Dict with ``rss_mb`` (resident set size), ``vms_mb`` (virtual memory
        size) and ``peak_mb`` (peak resident set size), all in megabytes.
    """
    process = psutil.Process(os.getpid())

    # Take one snapshot so RSS and VMS describe the same instant.
    snapshot = process.memory_info()

    # Resident Set Size (actual physical memory used)
    rss_mb = snapshot.rss / 1024 / 1024

    # Virtual Memory Size
    vms_mb = snapshot.vms / 1024 / 1024

    # Peak memory from the resource module. The single /1024 assumes
    # ru_maxrss is in kilobytes (Linux behaviour); verify on other platforms.
    peak_mb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024

    return {"rss_mb": rss_mb, "vms_mb": vms_mb, "peak_mb": peak_mb}

Thermal benchmarking requires observing the device under sustained load:

import os

def read_cpu_temperature(path='/sys/class/thermal/thermal_zone0/temp'):
    """Read the ARM CPU temperature in degrees Celsius.

    Args:
        path: Sysfs file holding the temperature as an integer in
            millidegrees. Defaults to thermal zone 0.

    Returns:
        The temperature as a float, or ``None`` when the file cannot be read
        or does not contain an integer.
    """
    try:
        with open(path, 'r') as f:
            return int(f.read()) / 1000.0
    except (OSError, ValueError):
        # OSError covers a missing file as well as permission problems;
        # ValueError covers empty or non-numeric content.
        return None

# Track temperature during inference loop
temps = []
for _ in range(100):
    session.run(...)
    reading = read_cpu_temperature()
    # read_cpu_temperature() returns None when the sensor cannot be read;
    # skip those samples so min()/max() below only see numbers.
    if reading is not None:
        temps.append(reading)

if temps:
    print(f"Temperature range: {min(temps):.1f}°C - {max(temps):.1f}°C")
else:
    print("Temperature range: unavailable (no sensor readings)")

Multi-batch benchmarking reveals batching efficiency:

def benchmark_batching(interpreter, batch_sizes=(1, 2, 4, 8, 16),
                       input_shape=(1, 3, 224, 224), rounds=50):
    """Measure per-invocation latency and throughput for several batch sizes.

    Args:
        interpreter: Object exposing ``get_input_details``,
            ``resize_tensor_input``, ``allocate_tensors``, ``set_tensor`` and
            ``invoke`` (presumably a TensorFlow Lite interpreter; confirm
            against the caller).
        batch_sizes: Batch sizes to benchmark.
        input_shape: Shape of a single-batch input; its first dimension is
            replaced by each batch size.
        rounds: Number of timed invocations per batch size.

    Returns:
        Dict mapping each batch size to ``latency_ms`` (mean time per
        invocation) and ``throughput_samples_per_sec``.
    """
    results = {}
    # Resolve the input tensor index from the interpreter rather than relying
    # on a module-level name.
    input_index = interpreter.get_input_details()[0]["index"]

    for batch in batch_sizes:
        input_data = np.random.randn(batch, *input_shape[1:]).astype(np.float32)

        # The input tensor must match the batch being fed; resize and
        # re-allocate before setting data.
        interpreter.resize_tensor_input(input_index, list(input_data.shape))
        interpreter.allocate_tensors()

        # Measure throughput
        start = time.perf_counter()
        for _ in range(rounds):
            interpreter.set_tensor(input_index, input_data)
            interpreter.invoke()
        elapsed = (time.perf_counter() - start) / rounds

        results[batch] = {
            "latency_ms": elapsed * 1000,
            # Guard against a zero reading from a coarse timer.
            "throughput_samples_per_sec": (
                batch / elapsed if elapsed > 0 else float("inf")
            ),
        }

    return results