Testing on Device — Edge AI: Mobile and IoT (Chapter 17)

Device testing reveals performance characteristics and failure modes invisible to desktop development. Production ML applications require testing frameworks that measure latency, memory, and accuracy on target hardware under realistic conditions.

On-device unit testing:

import unittest
import numpy as np
import onnxruntime as ort

class TestEdgeInference(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.session = ort.InferenceSession("model.onnx")
        cls.test_data = np.load("test_preprocessing/test_inputs.npz")
    
    def test_output_shape(self):
        input_name = self.session.get_inputs()[0].name
        input_shape = self.session.get_inputs()[0].shape
        
        dummy_output = self.session.run(
            None, 
            {input_name: np.random.randn(*input_shape).astype(np.float32)}
        )
        
        self.assertEqual(dummy_output[0].shape[0], input_shape[0])
    
    def test_accuracy_against_reference(self):
        reference_outputs = self.session.run(None, self.reference_inputs)
        
        current_outputs = self.session.run(None, self.current_inputs)
        
        np.testing.assert_allclose(
            reference_outputs,
            current_outputs,
            rtol=1e-4,
            atol=1e-5,
            err_msg="Model outputs deviate from reference"
        )
    
    def test_latency_requirements(self):
        import time
        
        warmup_rounds = 10
        for _ in range(warmup_rounds):
            self.session.run(self.warmup_input)
        
        timings = []
        for _ in range(100):
            start = time.perf_counter()
            self.session.run(None, self.reference_inputs)
            timings.append((time.perf_counter() - start) * 1000)
        
        p95_latency = np.percentile(timings, 95)
        self.assertLess(p95_latency, 50, f"P95 latency {p95_latency:.2f}ms exceeds 50ms")

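# Run on the actual device. This needs a Python interpreter with pytest, numpy,
# and onnxruntime installed on the device; stock Android images do not ship one.
# adb shell "cd /data/local/tmp && python -m pytest test_edge_inference.py -v"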

Android UI testing for ML features:

// Espresso test for ML inference in UI
/**
 * Espresso tests for the on-device image classification flow hosted by MainActivity.
 */
@RunWith(AndroidJUnit4::class)
class ImageClassificationTest {

    @get:Rule
    val activityRule = ActivityScenarioRule(MainActivity::class.java)

    /** Tapping the classify button must produce a non-empty result within 3 seconds. */
    @Test
    fun testClassificationDisplaysResult() {
        // Click inference button with test image
        onView(withId(R.id.classifyButton)).perform(click())

        // Poll the complete condition (visible AND non-empty). Waiting only for
        // visibility and then checking the text once would race the inference.
        onView(withId(R.id.resultText)).waitFor(timeoutMs = 3000) {
            this.check(matches(isDisplayed()))
            this.check(matches(not(withText(""))))
        }
    }

    /** With the network reported as disconnected, classification must not use the network path. */
    @Test
    fun testOfflineClassification() {
        // Simulate being offline by injecting a fake monitor; the device's real
        // connectivity is not touched.
        val fakeNetworkMonitor = FakeNetworkMonitor(connected = false)

        activityRule.scenario.onActivity { activity ->
            activity.networkMonitor = fakeNetworkMonitor

            // NOTE(review): testBitmap is not declared in this snippet; it is
            // assumed to be a fixture provided elsewhere in the test sources.
            activity.classifyImage(testBitmap)
        }

        // Assert on the test thread, outside onActivity: a failure thrown from the
        // main-thread action may crash the host activity instead of failing the test.
        // NOTE(review): if classifyImage is asynchronous this check can run before
        // inference finishes; confirm, and wait for completion first if so.
        assertThat(fakeNetworkMonitor.classificationCount).isEqualTo(0)
    }
}

/**
 * Polls [assertion] against this interaction until it passes or [timeoutMs] elapses.
 *
 * The assertion is always attempted at least once, even when [timeoutMs] is 0.
 *
 * @param timeoutMs maximum time to keep retrying, in milliseconds.
 * @param pollIntervalMs pause between attempts, in milliseconds.
 * @param assertion check to retry; defaults to "the view is displayed".
 * @return this interaction, so further checks can be chained.
 * @throws TimeoutException if the assertion never passed; the last failure is attached as the cause.
 */
fun ViewInteraction.waitFor(
    timeoutMs: Long,
    pollIntervalMs: Long = 100,
    assertion: ViewInteraction.() -> Unit = { this.check(matches(isDisplayed())) },
): ViewInteraction {
    // nanoTime is monotonic; wall-clock time can jump while the test is running.
    val startNanos = System.nanoTime()
    val timeoutNanos = timeoutMs * 1_000_000
    var lastFailure: Throwable? = null

    do {
        try {
            this.assertion()
            return this
        } catch (e: AssertionError) {
            // A view that is found but fails its matcher surfaces as an AssertionError,
            // which `catch (e: Exception)` alone would let escape on the first attempt.
            lastFailure = e
        } catch (e: Exception) {
            // e.g. the view is not in the hierarchy yet.
            lastFailure = e
        }
        Thread.sleep(pollIntervalMs)
    } while (System.nanoTime() - startNanos < timeoutNanos)

    throw TimeoutException("View not displayed within ${timeoutMs}ms").also { timeout ->
        timeout.initCause(lastFailure)
    }
}

iOS XCTest for Core ML:

import XCTest
import CoreML

/// On-device tests for the Core ML image classifier: a smoke test on a
/// synthetic image and a P95 latency check.
class CoreMLInferenceTests: XCTestCase {
    var classifier: ImageClassifier!

    /// Number of untimed inferences run before measuring latency.
    private let warmupIterations = 5

    override func setUp() async throws {
        // Bundles normally contain the compiled model (.mlmodelc); fall back to the
        // raw .mlmodel so a project that ships it as a plain resource still loads.
        let bundles = [Bundle(for: CoreMLInferenceTests.self), Bundle.main]
        let located = bundles.lazy.compactMap { bundle in
            bundle.url(forResource: "classifier", withExtension: "mlmodelc")
                ?? bundle.url(forResource: "classifier", withExtension: "mlmodel")
        }.first
        // XCTUnwrap fails the test with a message instead of crashing the runner.
        let modelURL = try XCTUnwrap(located, "classifier model not found in test or app bundle")
        classifier = try ImageClassifier(modelURL: modelURL)
    }

    override func tearDown() async throws {
        // Release the model between tests.
        classifier = nil
    }

    /// Builds the 224x224 uniform gray image shared by both tests.
    private func makeTestImage() throws -> CGImage {
        let extent = CGRect(x: 0, y: 0, width: 224, height: 224)
        let image = CIImage(color: .gray).cropped(to: extent)
        // createCGImage returns nil on failure rather than throwing.
        return try XCTUnwrap(
            CIContext().createCGImage(image, from: image.extent),
            "Failed to render the synthetic test image"
        )
    }

    func testInferenceOnSimulatedInput() async throws {
        let cgImage = try makeTestImage()

        // Verify inference completes
        let result = try await classifier.classify(image: cgImage)

        XCTAssertFalse(result.isEmpty, "Classification should return non-empty results")
        // A uniform gray image carries no class signal, so only the validity of the
        // score is checked here, not a confidence threshold.
        // NOTE(review): assumes confidence is a probability in 0...1; confirm.
        let topConfidence = try XCTUnwrap(result.first?.confidence)
        XCTAssertTrue((0...1).contains(topConfidence), "Confidence should be a valid probability")
    }

    func testPerformance() async throws {
        let iterations = 100
        let testImage = try makeTestImage()

        // Warm up so one-off initialisation cost does not skew the percentile.
        for _ in 0..<warmupIterations {
            _ = try await classifier.classify(image: testImage)
        }

        var latencies: [TimeInterval] = []
        latencies.reserveCapacity(iterations)

        for _ in 0..<iterations {
            // systemUptime is monotonic; absolute (wall-clock) time can be adjusted mid-run.
            let start = ProcessInfo.processInfo.systemUptime
            _ = try await classifier.classify(image: testImage)
            latencies.append(ProcessInfo.processInfo.systemUptime - start)
        }

        let sorted = latencies.sorted()
        // Clamp so the index stays in bounds for any iteration count.
        let p95Index = min(sorted.count - 1, Int(Double(sorted.count) * 0.95))
        let p95 = sorted[p95Index]
        XCTAssertLessThan(p95, 0.1, "P95 inference latency should be under 100ms")
    }
}

Stress testing under memory pressure:

#!/bin/bash
# Stress test: run model inference on the device while its RAM is under pressure.
#
# Tunables (environment variables):
#   PRESSURE_MB     MiB of device RAM held by the hog process   (default 500)
#   CONCURRENCY     number of simultaneous inference processes  (default 100)
#   HOLD_SECONDS    upper bound on how long the hog holds RAM   (default 300)
#   SETTLE_SECONDS  time given to the hog to fill its buffer    (default 5)

set -u

PRESSURE_MB="${PRESSURE_MB:-500}"
CONCURRENCY="${CONCURRENCY:-100}"
HOLD_SECONDS="${HOLD_SECONDS:-300}"
SETTLE_SECONDS="${SETTLE_SECONDS:-5}"

# Start from an empty log buffer so the final grep only reports this run.
adb logcat -c

# Generate memory pressure ON THE DEVICE. Creating and deleting a file on the
# host (the previous fallocate/rm pair) consumed no device RAM at all.
# `tail -c N` on a pipe has to buffer the last N bytes until EOF, and the
# trailing `sleep` delays EOF, so the buffer stays resident. /dev/urandom is
# used so the pages are not trivially compressible.
# NOTE(review): relies on the device shell providing head -c / tail -c; confirm
# on the target image.
pressure_bytes=$((PRESSURE_MB * 1024 * 1024))
adb shell "(head -c ${pressure_bytes} /dev/urandom; sleep ${HOLD_SECONDS}) | tail -c ${pressure_bytes} > /dev/null" &
hog_pid=$!
# Release the pressure however the script exits.
# NOTE(review): assumes ending the local adb client also ends the remote pipeline.
trap 'kill "${hog_pid}" 2>/dev/null' EXIT
sleep "${SETTLE_SECONDS}"

# Run concurrent inference, remembering each PID so exit codes can be collected.
pids=()
for ((i = 1; i <= CONCURRENCY; i++)); do
    adb shell "cd /data/local/tmp && ./run_inference model.onnx input.bin" &
    pids+=("$!")
done

# Wait per PID: a bare `wait` discards exit codes and would also block on the hog.
failures=0
for pid in "${pids[@]}"; do
    wait "${pid}" || failures=$((failures + 1))
done

echo "Stress test completed"
echo "Failed inference runs: ${failures}/${CONCURRENCY}"

# Check for OOM errors in logcat
adb logcat -d | grep -E "(OOM|OutOfMemory|ANR|lowmemorykiller|lmkd)" | head -20

# Report failure to the caller (e.g. CI) when any inference run failed.
if [ "${failures}" -gt 0 ]; then
    exit 1
fi