KEY INSIGHT
Full-spectrum monitoring for ML serving involves tracking infrastructure metrics, model performance metrics, and business outcome metrics. Effective alerting balances sensitivity (catching real issues) against noise (avoiding alert fatigue that causes teams to ignore warnings).
### Monitoring Architecture
Local AI deployments require thoughtful monitoring architecture because you cannot rely on cloud-native observability platforms. Build a metrics collection stack appropriate to your infrastructure constraints.
```python
# Python: Metrics collection infrastructure for ML serving
from dataclasses import dataclass, field
from typing import Optional
import time
import threading
from collections import deque
from enum import Enum
class MetricType(Enum):
    """Kinds of metric the collector can record.

    Each member's value is the lowercase type name as a string.
    """

    COUNTER = "counter"      # Monotonically increasing
    GAUGE = "gauge"          # Point-in-time value
    HISTOGRAM = "histogram"  # Distribution statistics
    SUMMARY = "summary"      # Quantile-based statistics
@dataclass
class Metric:
    """A single recorded sample of a named metric."""

    name: str                # Metric name
    value: float             # Sample value
    labels: dict             # Label name -> label value
    timestamp: int           # Time the sample was recorded
    metric_type: MetricType  # Kind of metric this sample belongs to
class LocalMetricsCollector:
    """
    Lightweight in-memory metrics collector for ML serving.

    Samples are stored per metric name in a bounded deque: once a name
    holds ``max_samples`` entries, the oldest sample is dropped for each
    new one. All reads and writes of the store go through ``self.lock``.

    ``push_gateway_url`` is only stored on the instance; nothing in this
    class sends data to it.
    """

    def __init__(self, push_gateway_url: Optional[str] = None,
                 max_samples: int = 10000):
        """
        Args:
            push_gateway_url: Optional URL kept for code that forwards
                metrics elsewhere; unused by the collector itself.
            max_samples: Maximum number of samples retained per metric name.
        """
        self.metrics = {}  # name -> deque of Metric
        self.push_gateway_url = push_gateway_url
        self.max_samples = max_samples
        self.lock = threading.Lock()

    def record_counter(self, name: str, value: float = 1, **labels) -> None:
        """Record a counter sample (default increment of 1) under ``name``."""
        self._record(MetricType.COUNTER, name, value, labels)

    def record_gauge(self, name: str, value: float, **labels) -> None:
        """Record a gauge sample under ``name``."""
        self._record(MetricType.GAUGE, name, value, labels)

    def record_histogram(self, name: str, value: float, **labels) -> None:
        """Record one histogram observation under ``name``."""
        self._record(MetricType.HISTOGRAM, name, value, labels)

    def _record(self, metric_type: MetricType, name: str, value: float,
                labels: dict) -> None:
        """Build a ``Metric`` stamped with the current time and store it."""
        metric = Metric(
            name=name,
            value=value,
            labels=labels,
            timestamp=int(time.time()),  # whole seconds since the epoch
            metric_type=metric_type,
        )
        with self.lock:
            if name not in self.metrics:
                self.metrics[name] = deque(maxlen=self.max_samples)
            self.metrics[name].append(metric)

    def get_metrics(self, name: Optional[str] = None) -> list[Metric]:
        """
        Retrieve stored metrics as a new list.

        Args:
            name: Metric name to fetch. When ``None``, samples for every
                name are returned, grouped by name in insertion order.

        Returns:
            A list copy of the stored samples (empty for an unknown name).
        """
        with self.lock:
            # Compare against None so that an empty-string name is looked up
            # like any other name instead of returning everything.
            if name is not None:
                return list(self.metrics.get(name, []))
            return [m for metrics in self.metrics.values() for m in metrics]

    def compute_histogram_stats(self, name: str) -> dict:
        """
        Compute summary statistics over the raw samples stored for ``name``.

        Every retained sample for ``name`` is used, whatever its metric type.
        Percentiles take the element at index ``int(n * q)`` of the sorted
        values.

        Returns:
            A dict with ``count``, ``sum``, ``mean``, ``min``, ``max``,
            ``p50``, ``p95`` and ``p99``; an empty dict when no samples exist.
        """
        # Snapshot under the lock: iterating the deque while another thread
        # appends to it raises RuntimeError.
        with self.lock:
            values = [m.value for m in self.metrics.get(name, [])]
        if not values:
            return {}
        sorted_values = sorted(values)
        n = len(sorted_values)
        total = sum(values)
        return {
            "count": n,
            "sum": total,
            "mean": total / n,
            "min": sorted_values[0],
            "max": sorted_values[-1],
            "p50": sorted_values[int(n * 0.5)],
            "p95": sorted_values[int(n * 0.95)],
            "p99": sorted_values[int(n * 0.99)],
        }
# Prometheus metrics exporter endpoint
def _escape_label_value(value) -> str:
    """Escape backslash, newline and double quote for a quoted label value."""
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace("\n", "\\n")
        .replace('"', '\\"')
    )


def prometheus_metrics_endpoint(collector: LocalMetricsCollector) -> str:
    """
    Format the collector's stored samples as Prometheus text exposition.

    For each metric name one ``# TYPE`` line is written, followed by one
    sample line per stored sample. The type is ``gauge`` or ``histogram``
    when every sample of that name has that type, and ``counter`` otherwise.

    NOTE(review): every retained sample is emitted, so a name/label set that
    was recorded more than once appears on several lines, and histogram
    observations are emitted as raw values rather than ``_bucket``/``_sum``/
    ``_count`` series. Aggregate before exposing if the scraper rejects that.

    Args:
        collector: Collector whose ``metrics`` mapping is rendered.

    Returns:
        The exposition text, terminated by a line feed, or an empty string
        when nothing has been recorded.
    """
    # Copy under the collector's lock so concurrent recording cannot mutate
    # the dict or deques while they are being rendered.
    with collector.lock:
        snapshot = {
            name: list(samples) for name, samples in collector.metrics.items()
        }

    lines = []
    for name, samples in snapshot.items():
        if not samples:
            continue
        kinds = {sample.metric_type for sample in samples}
        if kinds == {MetricType.GAUGE}:
            type_name = "gauge"
        elif kinds == {MetricType.HISTOGRAM}:
            type_name = "histogram"
        else:
            type_name = "counter"
        lines.append(f"# TYPE {name} {type_name}")
        for sample in samples:
            labels_str = ",".join(
                f'{key}="{_escape_label_value(val)}"'
                for key, val in sample.labels.items()
            )
            # Omit the braces entirely when the sample carries no labels.
            selector = f"{{{labels_str}}}" if labels_str else ""
            lines.append(f"{name}{selector} {sample.value}")
    if not lines:
        return ""
    # The text format requires the final line to end with a line feed.
    return "\n".join(lines) + "\n"
# Usage: when integrating into model serving, the standard metrics can be declared with the prometheus_client library
# from prometheus_client import Counter, Histogram, Gauge, start_http_server
#
# # Standard ML serving metrics
# inference_latency = Histogram(
# 'ml_inference_latency_seconds',
# 'Inference latency in seconds',
# ['model_name', 'model_version']
# )
#
# prediction_confidence = Histogram(
# 'ml_prediction_confidence',
# 'Model prediction confidence scores',
# ['model_name', 'model_version']
# )
#
# requests_total = Counter(
# 'ml_requests_total',
# 'Total inference requests',
# ['model_name', 'model_version', 'status']
# )
```
### Alerts Configuration
```yaml
# YAML: Alert configuration for ML serving
# File: alerts/ml-serving-alerts.yaml
groups:
- name: ml_serving_infrastructure
  interval: 30s
  rules:
  # Infrastructure alerts
  - alert: HighCPUUsage
    expr: cpu_usage_percent > 90
    for: 5m
    labels:
      severity: warning
      team: ml-ops
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage at {{ $value }}% for 5+ minutes"
  - alert: OutOfMemory
    expr: memory_usage_percent > 95
    for: 1m
    labels:
      severity: critical
      team: ml-ops
    annotations:
      summary: "Memory exhaustion imminent on {{ $labels.instance }}"
  - alert: ModelServiceDown
    expr: up{job="ml-serving"} == 0
    for: 1m
    labels:
      severity: critical
      team: ml-ops
    annotations:
      summary: "Model serving service down on {{ $labels.instance }}"
- name: ml_model_performance
  interval: 60s
  rules:
  # Model performance alerts
  - alert: HighInferenceLatency
    # histogram_quantile needs per-bucket rates grouped by `le`;
    # model_name is kept so the summary annotation can reference it.
    expr: |
      histogram_quantile(
        0.95,
        sum by (le, model_name) (rate(ml_inference_latency_seconds_bucket[5m]))
      ) > 2.0
    for: 10m
    labels:
      severity: warning
      team: ml-ops
    annotations:
      summary: "High inference latency for {{ $labels.model_name }}"
      description: "P95 latency at {{ $value }}s, threshold 2.0s"
  - alert: ModelConfidenceDrift
    # Mean confidence (histogram _sum / _count) over the last hour compared
    # with the same window one week earlier. `offset` must directly follow
    # the selector, so it sits inside rate() rather than after the aggregation.
    expr: |
      (
        sum by (model_name) (rate(ml_prediction_confidence_sum{model_name="sentiment"}[1h]))
        /
        sum by (model_name) (rate(ml_prediction_confidence_count{model_name="sentiment"}[1h]))
      )
      <
      (
        sum by (model_name) (rate(ml_prediction_confidence_sum{model_name="sentiment"}[1h] offset 7d))
        /
        sum by (model_name) (rate(ml_prediction_confidence_count{model_name="sentiment"}[1h] offset 7d))
      )
    for: 24h
    labels:
      severity: warning
      team: ml-ops
    annotations:
      summary: "Prediction confidence drift detected for {{ $labels.model_name }}"
  - alert: ExtensiveDataDrift
    expr: model_drift_score > 0.15
    for: 1h
    labels:
      severity: warning
      team: ml-ops
    annotations:
      summary: "Significant data drift detected for {{ $labels.model_name }}"
      description: "Drift score {{ $value }} exceeds threshold 0.15"
  - alert: PredictionAccuracyDegradation
    # Group by service so the label survives aggregation and is available
    # to the annotations below.
    expr: |
      (
        sum by (service) (prediction_correct{service="sentiment"})
        /
        sum by (service) (prediction_total{service="sentiment"})
      ) < 0.80
    for: 1h
    labels:
      severity: critical
      team: ml-ops
    annotations:
      summary: "Model accuracy below threshold for {{ $labels.service }}"
      description: "Current accuracy {{ $value | printf \"%.2f\" }}, threshold 0.80"
- name: ml_business_metrics
  interval: 300s
  rules:
  # Business outcome alerts
  - alert: LowUserSatisfaction
    # Group by service so the label is still present for the summary.
    expr: avg by (service) (user_feedback_score{service="inference"}) < 3.0
    for: 1d
    labels:
      severity: warning
      team: ml-ops
    annotations:
      summary: "User satisfaction score below target for {{ $labels.service }}"
```
### Dashboard Design
Effective ML monitoring dashboards show what's happening (current state), what happened (trends), and what matters (impact on outcomes). Design dashboards for different audiences: operators need system health; stakeholders need business impact.
Build dashboards around questions: Is the service up? Is it responding quickly? Are predictions accurate? Are users receiving value?