22. Incident Response
Chapter 22 of 24 · 20 min
Incidents in a RAG system require rapid diagnosis across multiple components—vector search, LLM generation, caching, and data pipelines. A structured response process prevents decision paralysis under pressure.
The incident response process:
import json
import math
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
class Severity(Enum):
    """Incident priority levels, declared from P1 (critical) down to P4 (low).

    Each member's value is the human-readable label for that level.
    """

    # Everything is down.
    P1_CRITICAL = "Critical - Complete outage"
    # A major feature is degraded.
    P2_HIGH = "High - Major feature degraded"
    # A minor feature is degraded.
    P3_MEDIUM = "Medium - Minor feature degraded"
    # Nothing users would notice right now.
    P4_LOW = "Low - No immediate user impact"
@dataclass
class Incident:
    """Record describing one incident.

    ``status`` is the only field with a default ("investigating"); every
    other field must be supplied when the record is created.
    """

    id: str  # incident identifier
    severity: Severity  # priority level of the incident
    title: str  # short summary
    affected_components: list[str]  # names of the components involved
    start_time: str  # start timestamp, stored as a string
    commander: str  # who is running the response
    status: str = "investigating"  # current state of the incident
class RAGIncidentManager:
    """Open incidents and run first-line diagnostics for a RAG stack.

    The manager relies on two injected collaborators:

    * ``pagerduty_client`` -- must expose
      ``create_incident(title=..., severity=..., incident_key=...)``.
    * ``metrics_client`` -- must expose ``query(metric=..., minutes=...)``
      (optionally ``labels=...``). It is assumed to return a sequence of
      numeric samples -- TODO confirm against the real client.

    Environment-specific steps (ID generation, commander assignment, runbook
    creation, vector-DB probes) are hook methods that raise
    ``NotImplementedError`` until a subclass provides them.
    """

    # Cache hit rates below this fraction are flagged as anomalous.
    CACHE_HIT_RATE_FLOOR = 0.5

    def __init__(self, pagerduty_client, metrics_client):
        """Store the paging and metrics clients used by the other methods."""
        self.pd = pagerduty_client
        self.metrics = metrics_client

    def open_incident(self, severity: Severity, title: str,
                      affected: list[str]) -> Incident:
        """Create an incident record, page on-call if needed, start a runbook.

        Args:
            severity: Priority of the incident. P1 and P2 trigger a page.
            title: Short summary, also used as the page title.
            affected: Names of the affected components.

        Returns:
            The newly created ``Incident``. Its ``start_time`` is a
            timezone-aware UTC timestamp in ISO 8601 form.
        """
        incident = Incident(
            id=self._generate_id(),
            severity=severity,
            title=title,
            # Copy so later caller-side mutation cannot alter the record.
            affected_components=list(affected),
            # Aware UTC timestamp; datetime.utcnow() is deprecated and naive.
            start_time=datetime.now(timezone.utc).isoformat(),
            commander=self._assign_commander(),
        )

        # Page on-call only for the two highest priorities.
        if severity in (Severity.P1_CRITICAL, Severity.P2_HIGH):
            self.pd.create_incident(
                title=title,
                severity=severity.value,
                incident_key=incident.id,
            )

        # Initialize the runbook from its template.
        self._create_runbook(incident)
        return incident

    def diagnose_vector_search_issue(self) -> dict:
        """Collect a structured diagnostic snapshot for vector search problems.

        Returns:
            A dict with these keys:

            * ``vector_db_healthy`` -- result of the ``/health`` probe.
            * ``index_size_vectors`` -- ``vectors_count`` of the ``chunks``
              collection (``None`` if not reported).
            * ``index_size_gb`` -- storage size in GiB (0.0 if not reported).
            * ``p95_search_ms`` / ``p99_search_ms`` -- percentiles of
              successful searches over the last 5 minutes, or ``None`` when
              there are no samples.
            * ``cache_hit_rate`` -- mean hit rate over the last 30 minutes,
              or ``None`` when there are no samples.
            * ``cache_hit_rate_anomaly`` -- ``True`` when the mean hit rate
              is below ``CACHE_HIT_RATE_FLOOR``.
        """
        diagnostics = {}

        # Metric 1: vector DB health.
        diagnostics["vector_db_healthy"] = self._check_endpoint("/health")

        # Metric 2: index size and storage footprint.
        index_info = self._get_collection_info("chunks")
        diagnostics["index_size_vectors"] = index_info.get("vectors_count")
        # ``or 0`` also covers a key that is present but set to None.
        storage_bytes = index_info.get("storage_size_bytes") or 0
        diagnostics["index_size_gb"] = storage_bytes / (1024 ** 3)

        # Metric 3: search latency distribution.
        recent_searches = self.metrics.query(
            metric="vector_search_latency_ms",
            labels={"status": "success"},
            minutes=5,
        )
        diagnostics["p95_search_ms"] = self._percentile(recent_searches, 0.95)
        diagnostics["p99_search_ms"] = self._percentile(recent_searches, 0.99)

        # Metric 4: cache hit rate anomaly.
        cache_metrics = self.metrics.query(
            metric="semantic_cache_hit_rate",
            minutes=30,
        )
        samples = list(cache_metrics) if cache_metrics is not None else []
        # Plain-Python mean: no NumPy dependency, and the result stays a
        # JSON-serializable float. An empty window yields None, not NaN.
        hit_rate = sum(samples) / len(samples) if samples else None
        diagnostics["cache_hit_rate"] = hit_rate
        diagnostics["cache_hit_rate_anomaly"] = (
            hit_rate is not None and hit_rate < self.CACHE_HIT_RATE_FLOOR
        )
        return diagnostics

    @staticmethod
    def _percentile(values, fraction):
        """Return the nearest-rank percentile of *values*.

        Args:
            values: Iterable of comparable samples; may be empty or ``None``.
            fraction: Percentile as a fraction in ``[0, 1]`` (0.95 = p95).

        Returns:
            The sample at the requested rank, or ``None`` if there are no
            samples.

        Raises:
            ValueError: If *fraction* is outside ``[0, 1]``.
        """
        if not 0 <= fraction <= 1:
            raise ValueError(f"fraction must be within [0, 1], got {fraction!r}")
        ordered = sorted(values) if values is not None else []
        if not ordered:
            return None
        # Nearest-rank: smallest sample with at least `fraction` of the data
        # at or below it. Clamp to rank 1 so fraction=0 returns the minimum.
        rank = max(1, math.ceil(fraction * len(ordered)))
        return ordered[rank - 1]

    # -- Hooks: environment-specific behavior supplied by subclasses --------

    def _generate_id(self) -> str:
        """Return a new incident identifier."""
        raise NotImplementedError("subclasses must implement _generate_id()")

    def _assign_commander(self) -> str:
        """Return the incident commander for a new incident."""
        raise NotImplementedError("subclasses must implement _assign_commander()")

    def _create_runbook(self, incident) -> None:
        """Create the runbook for *incident*."""
        raise NotImplementedError("subclasses must implement _create_runbook()")

    def _check_endpoint(self, path):
        """Probe the endpoint at *path* and return its health result."""
        raise NotImplementedError("subclasses must implement _check_endpoint()")

    def _get_collection_info(self, name) -> dict:
        """Return a dict of metadata for the vector collection *name*."""
        raise NotImplementedError(
            "subclasses must implement _get_collection_info()"
        )
Common RAG failure patterns and resolutions:
# Playbooks keyed by failure pattern. Each entry holds:
#   "check"      -- callable taking the incident manager (any object exposing
#                   ``_check_endpoint(path)`` and ``metrics.query(...)``) and
#                   returning the raw probe or metric result;
#   "mitigation" -- first-response action, as free text;
#   "escalation" -- condition under which to raise the priority, as free text.
# The manager is passed in explicitly because there is no ``self`` at module
# scope; a lambda that referenced ``self`` here would raise NameError when
# called.
INCIDENT_PLAYBOOKS = {
    "vector_db_unhealthy": {
        "check": lambda manager: manager._check_endpoint("/health"),
        "mitigation": "Switch to read-from-replica mode; scale up if storage fragmentation",
        "escalation": "P1 if search latency > 2000ms for > 5 minutes",
    },
    "embedding_service_timeout": {
        "check": lambda manager: manager._check_endpoint("/ready"),
        "mitigation": "Restart embedding service pods; check GPU memory",
        "escalation": "P1 if ingestion pipeline stalled for > 10 minutes",
    },
    "llm_generation_latency_spike": {
        "check": lambda manager: manager.metrics.query(
            metric="llm_generation_latency_ms", minutes=5
        ),
        "mitigation": "Check API rate limits; switch to fallback model if configured",
        "escalation": "P2 if p95 > 10000ms",
    },
    "cache_eviction_storm": {
        "check": lambda manager: manager.metrics.query(
            metric="cache_size_bytes", minutes=5
        ),
        "mitigation": "Increase Redis memory; adjust eviction policy to volatile-lru",
        "escalation": "P2 if hit rate < 10% causing LLM cost spike",
    },
}
EXERCISE
Create a runbook for a P1 incident where vector search p99 latency exceeds 5000ms. Include diagnostic commands, escalation criteria, and mitigation steps.