21. Monitoring and Alerting

Chapter 21 of 24 · 25 min

KEY INSIGHT

Full-spectrum monitoring for ML serving involves tracking infrastructure metrics, model performance metrics, and business outcome metrics. Effective alerting balances sensitivity (catching real issues) against noise (avoiding alert fatigue that causes teams to ignore warnings).

### Monitoring Architecture

Local AI deployments require thoughtful monitoring architecture because you cannot rely on cloud-native observability platforms. Build a metrics collection stack appropriate to your infrastructure constraints.

```python
# Python: Metrics collection infrastructure for ML serving
from dataclasses import dataclass, field
from typing import Optional
import time
import threading
from collections import deque
from enum import Enum


class MetricType(Enum):
    COUNTER = "counter"      # Monotonically increasing
    GAUGE = "gauge"          # Point-in-time value
    HISTOGRAM = "histogram"  # Distribution statistics
    SUMMARY = "summary"      # Quantile-based statistics


@dataclass
class Metric:
    name: str
    value: float
    labels: dict
    timestamp: int
    metric_type: MetricType


class LocalMetricsCollector:
    """
    Lightweight, thread-safe, in-memory metrics collector for ML serving.

    Every recorded sample is kept in a per-metric ring buffer holding the
    most recent 10,000 samples. ``push_gateway_url`` is stored for callers
    that want to forward metrics to a Prometheus Pushgateway; this example
    does not implement the forwarding itself.
    """

    def __init__(self, push_gateway_url: Optional[str] = None):
        self.metrics = {}  # name -> deque of Metric
        self.push_gateway_url = push_gateway_url
        self.lock = threading.Lock()

    def record_counter(self, name: str, value: float = 1, **labels):
        """Record a counter increment (defaults to 1)."""
        self._record(MetricType.COUNTER, name, value, labels)

    def record_gauge(self, name: str, value: float, **labels):
        """Record the current value of a gauge."""
        self._record(MetricType.GAUGE, name, value, labels)

    def record_histogram(self, name: str, value: float, **labels):
        """Record a single observation for a histogram metric."""
        self._record(MetricType.HISTOGRAM, name, value, labels)

    def _record(self, metric_type: MetricType, name: str,
                value: float, labels: dict):
        metric = Metric(
            name=name,
            value=value,
            labels=labels,
            timestamp=int(time.time()),
            metric_type=metric_type,
        )
        with self.lock:
            if name not in self.metrics:
                # Bounded buffer: the oldest samples are dropped first.
                self.metrics[name] = deque(maxlen=10000)
            self.metrics[name].append(metric)

    def get_metrics(self, name: Optional[str] = None) -> list[Metric]:
        """Return a copy of the samples for ``name``, or all samples if None."""
        with self.lock:
            if name is not None:
                return list(self.metrics.get(name, []))
            return [m for metrics in self.metrics.values() for m in metrics]

    def compute_histogram_stats(self, name: str) -> dict:
        """
        Compute summary statistics over the retained observations of ``name``.

        Returns an empty dict when nothing has been recorded. Percentiles
        are taken by index into the sorted observations (no interpolation).
        """
        # Snapshot under the lock: iterating a deque while another thread
        # appends to it raises "deque mutated during iteration".
        with self.lock:
            values = [m.value for m in self.metrics.get(name, ())]

        if not values:
            return {}

        sorted_values = sorted(values)
        n = len(sorted_values)
        total = sum(values)

        return {
            "count": n,
            "sum": total,
            "mean": total / n,
            "min": sorted_values[0],
            "max": sorted_values[-1],
            # int(n * q) < n for every q < 1, so these indexes are in range.
            "p50": sorted_values[int(n * 0.5)],
            "p95": sorted_values[int(n * 0.95)],
            "p99": sorted_values[int(n * 0.99)],
        }


# Upper bounds (in the metric's own unit) used when exporting histograms.
DEFAULT_BUCKETS = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)


def _escape_label_value(value) -> str:
    """Escape a label value as required by the Prometheus text format."""
    return (
        str(value)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def _join_labels(*parts: str) -> str:
    """Join rendered label fragments, skipping empty ones."""
    return ",".join(part for part in parts if part)


def _render_series(name: str, label_pairs: str, value: float) -> str:
    """Render one sample line; the braces are omitted when there are no labels."""
    if label_pairs:
        return f"{name}{{{label_pairs}}} {value}"
    return f"{name} {value}"


# Prometheus metrics exporter endpoint
def prometheus_metrics_endpoint(
    collector: LocalMetricsCollector,
    buckets: tuple[float, ...] = DEFAULT_BUCKETS,
) -> str:
    """
    Format the collector's metrics in the Prometheus text exposition format.

    Each metric gets one ``# TYPE`` line followed by one series per distinct
    label set:

    * gauge     -> the most recently recorded value
    * histogram -> cumulative ``_bucket`` series plus ``_sum`` and ``_count``
    * anything else (including mixed types) -> exported as a counter whose
      value is the sum of the retained increments

    NOTE: totals are computed from the collector's bounded buffer, so a
    counter can appear to decrease once more than 10,000 samples have been
    recorded for it. Keep running totals in the collector if you need
    strictly monotonic counters.
    """
    # Snapshot under the lock so concurrent recording cannot mutate the
    # deques while we iterate.
    with collector.lock:
        snapshot = {
            name: list(samples)
            for name, samples in collector.metrics.items()
        }

    upper_bounds = sorted(buckets)
    lines = []

    for name, samples in snapshot.items():
        if not samples:
            continue

        metric_types = {m.metric_type for m in samples}
        if metric_types == {MetricType.GAUGE}:
            type_name = "gauge"
        elif metric_types == {MetricType.HISTOGRAM}:
            type_name = "histogram"
        else:
            type_name = "counter"
        lines.append(f"# TYPE {name} {type_name}")

        # Group values by rendered label set so that every series appears
        # exactly once in the output.
        grouped = {}
        for m in samples:
            label_pairs = ",".join(
                f'{k}="{_escape_label_value(v)}"'
                for k, v in m.labels.items()
            )
            grouped.setdefault(label_pairs, []).append(m.value)

        for label_pairs, values in grouped.items():
            if type_name == "gauge":
                lines.append(_render_series(name, label_pairs, values[-1]))
            elif type_name == "counter":
                lines.append(_render_series(name, label_pairs, sum(values)))
            else:
                observation_count = len(values)
                for upper in upper_bounds:
                    cumulative = sum(1 for v in values if v <= upper)
                    lines.append(_render_series(
                        f"{name}_bucket",
                        _join_labels(label_pairs, f'le="{upper}"'),
                        cumulative,
                    ))
                # The +Inf bucket is mandatory and equals the total count.
                lines.append(_render_series(
                    f"{name}_bucket",
                    _join_labels(label_pairs, 'le="+Inf"'),
                    observation_count,
                ))
                lines.append(_render_series(f"{name}_sum", label_pairs, sum(values)))
                lines.append(_render_series(f"{name}_count", label_pairs, observation_count))

    # The exposition format requires the final line to end with a newline.
    return "\n".join(lines) + "\n" if lines else ""


# Usage: Integrate into model serving
# from prometheus_client import Counter, Histogram, Gauge, start_http_server
#
# # Standard ML serving metrics
# inference_latency = Histogram(
#     'ml_inference_latency_seconds',
#     'Inference latency in seconds',
#     ['model_name', 'model_version']
# )
#
# prediction_confidence = Histogram(
#     'ml_prediction_confidence',
#     'Model prediction confidence scores',
#     ['model_name', 'model_version']
# )
#
# requests_total = Counter(
#     'ml_requests_total',
#     'Total inference requests',
#     ['model_name', 'model_version', 'status']
# )
```

### Alerts Configuration

```yaml
# YAML: Alert configuration for ML serving
# File: alerts/ml-serving-alerts.yaml
groups:
  - name: ml_serving_infrastructure
    interval: 30s
    rules:
      # Infrastructure alerts
      - alert: HighCPUUsage
        expr: cpu_usage_percent > 90
        for: 5m
        labels:
          severity: warning
          team: ml-ops
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage at {{ $value }}% for 5+ minutes"

      - alert: OutOfMemory
        expr: memory_usage_percent > 95
        for: 1m
        labels:
          severity: critical
          team: ml-ops
        annotations:
          summary: "Memory exhaustion imminent on {{ $labels.instance }}"

      - alert: ModelServiceDown
        expr: up{job="ml-serving"} == 0
        for: 1m
        labels:
          severity: critical
          team: ml-ops
        annotations:
          summary: "Model serving service down on {{ $labels.instance }}"

  - name: ml_model_performance
    interval: 60s
    rules:
      # Model performance alerts
      - alert: HighInferenceLatency
        # histogram_quantile needs per-bucket rates aggregated by `le`;
        # model_name is kept so the annotation below can reference it.
        expr: |
          histogram_quantile(
            0.95,
            sum by (le, model_name) (rate(ml_inference_latency_seconds_bucket[5m]))
          ) > 2.0
        for: 10m
        labels:
          severity: warning
          team: ml-ops
        annotations:
          summary: "High inference latency for {{ $labels.model_name }}"
          description: "P95 latency at {{ $value }}s, threshold 2.0s"

      - alert: ModelConfidenceDrift
        # ml_prediction_confidence is a histogram, so the mean confidence is
        # rate(_sum) / rate(_count). `offset` must be attached to a selector,
        # not to the result of an aggregation.
        expr: |
          (
              sum by (model_name) (rate(ml_prediction_confidence_sum{model_name="sentiment"}[1h]))
            /
              sum by (model_name) (rate(ml_prediction_confidence_count{model_name="sentiment"}[1h]))
          )
          <
          (
              sum by (model_name) (rate(ml_prediction_confidence_sum{model_name="sentiment"}[1h] offset 7d))
            /
              sum by (model_name) (rate(ml_prediction_confidence_count{model_name="sentiment"}[1h] offset 7d))
          )
        for: 24h
        labels:
          severity: warning
          team: ml-ops
        annotations:
          summary: "Prediction confidence drift detected for {{ $labels.model_name }}"

      - alert: ExtensiveDataDrift
        expr: model_drift_score > 0.15
        for: 1h
        labels:
          severity: warning
          team: ml-ops
        annotations:
          summary: "Significant data drift detected for {{ $labels.model_name }}"
          description: "Drift score {{ $value }} exceeds threshold 0.15"

      - alert: PredictionAccuracyDegradation
        # Aggregate by service so {{ $labels.service }} is still available.
        expr: |
          (
              sum by (service) (prediction_correct{service="sentiment"})
            /
              sum by (service) (prediction_total{service="sentiment"})
          ) < 0.80
        for: 1h
        labels:
          severity: critical
          team: ml-ops
        annotations:
          summary: "Model accuracy below threshold for {{ $labels.service }}"
          description: "Current accuracy {{ $value | printf \"%.2f\" }}, threshold 0.80"

  - name: ml_business_metrics
    interval: 300s
    rules:
      # Business outcome alerts
      - alert: LowUserSatisfaction
        expr: avg by (service) (user_feedback_score{service="inference"}) < 3.0
        for: 1d
        labels:
          severity: warning
          team: ml-ops
        annotations:
          summary: "User satisfaction score below target for {{ $labels.service }}"
```

### Dashboard Design

Effective ML monitoring dashboards show what's happening (current state), what happened (trends), and what matters (impact on outcomes). Design dashboards for different audiences: operators need system health; stakeholders need business impact. Build dashboards around questions: Is the service up? Is it responding quickly? Are predictions accurate? Are users receiving value?

EXERCISE

Instrument your model serving code with metrics collection. Define and implement alerts for: P99 inference latency exceeding 95% of your SLA threshold, prediction confidence dropping below the training baseline, and failing requests. Verify that the alerts trigger correctly by introducing artificial failures or delays in a test environment.