19. Monitoring Multi-Tenant SaaS

Chapter 19 of 24 · 25 min

KEY INSIGHT

Multi-tenant monitoring requires tenant-context isolation in metrics while providing platform-wide views, with alerting thresholds calibrated for Nigerian network conditions where latency spikes are common. Monitoring a multi-tenant SaaS requires balancing detailed per-tenant visibility with system-wide health monitoring, all while keeping costs manageable for Nigerian operations.

```python
from dataclasses import dataclass
from typing import Optional
import time

from prometheus_client import REGISTRY, CollectorRegistry, Counter, Histogram, Gauge


@dataclass
class TenantMetrics:
    """Container for the metric families that tenant activity is recorded on.

    Each field is a labelled metric family; individual tenants are told
    apart by label values, not by separate metric objects.
    """

    requests_total: Counter
    requests_errors: Counter
    request_duration: Histogram
    ai_tokens_used: Counter
    quota_remaining: Gauge


class MultiTenantMetricsCollector:
    """Collect and manage metrics across tenants.

    Every metric family is registered exactly once, when the collector is
    constructed. Registering a second metric with an existing name in the
    same registry raises ``ValueError`` (duplicated timeseries), so the
    families must never be re-created per tenant.
    """

    def __init__(self, registry: Optional[CollectorRegistry] = None):
        """Register the metric families.

        Args:
            registry: Registry to register the metrics on. Defaults to the
                global ``prometheus_client.REGISTRY``. Pass a fresh
                ``CollectorRegistry()`` in tests so that several collectors
                can coexist in one process.
        """
        registry = REGISTRY if registry is None else registry

        # Tenant-scoped metrics carry only tenant labels; per-request metrics
        # additionally carry the endpoint. The label names declared here must
        # match the keys passed to ``.labels()`` exactly.
        self._tenant_labels = ['tenant_id', 'plan']
        self._request_labels = self._tenant_labels + ['endpoint']

        self._metrics = TenantMetrics(
            requests_total=Counter(
                'saas_requests_total',
                'Total requests',
                self._request_labels,
                registry=registry
            ),
            requests_errors=Counter(
                'saas_requests_errors_total',
                'Total errors',
                self._request_labels,
                registry=registry
            ),
            request_duration=Histogram(
                'saas_request_duration_seconds',
                'Request duration',
                self._request_labels,
                buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0],
                registry=registry
            ),
            ai_tokens_used=Counter(
                'saas_ai_tokens_total',
                'AI tokens consumed',
                self._tenant_labels,
                registry=registry
            ),
            quota_remaining=Gauge(
                'saas_quota_remaining',
                'Remaining quota',
                self._tenant_labels,
                registry=registry
            )
        )

    def get_tenant_metrics(self, tenant_id: str, plan: str) -> TenantMetrics:
        """Return the metric families to record a tenant's activity on.

        The same container is returned for every tenant; callers select the
        tenant's series with ``.labels(tenant_id=..., plan=..., ...)``.
        ``tenant_id`` and ``plan`` are accepted for interface compatibility
        and are not used to create any metric objects.
        """
        return self._metrics

    def record_request(
        self,
        tenant_id: str,
        plan: str,
        endpoint: str,
        status: int,
        duration: float
    ) -> None:
        """Record metrics for a request.

        Args:
            tenant_id: Tenant the request belongs to.
            plan: The tenant's plan, recorded as a label.
            endpoint: Endpoint that served the request, recorded as a label.
            status: HTTP status code; values of 400 and above count as errors.
            duration: Request duration, observed on the
                ``saas_request_duration_seconds`` histogram.
        """
        metrics = self.get_tenant_metrics(tenant_id, plan)
        labels = {'tenant_id': tenant_id, 'plan': plan, 'endpoint': endpoint}

        metrics.requests_total.labels(**labels).inc()
        if status >= 400:
            metrics.requests_errors.labels(**labels).inc()
        # Duration is observed for every request, successful or not.
        metrics.request_duration.labels(**labels).observe(duration)
```

**Alert Configuration for Nigerian Conditions:**

```python
from typing import Callable


class AlertingManager:
    """Manage alerts with tenant-aware thresholds.

    Rules are handed to ``self._create_alert_rule`` and per-plan thresholds
    are read from ``self._get_tenant_thresholds``; neither helper is shown in
    this snippet.
    """

    def __init__(self, alert_manager_client):
        self.client = alert_manager_client
        self.default_thresholds = {
            'error_rate_percent': 5.0,
            'latency_p95_ms': 2000,
            'latency_p99_ms': 5000,
            'quota_usage_percent': 90,
            'payment_failure_rate_percent': 10
        }

    @staticmethod
    def _escape_label_value(value: str) -> str:
        """Escape a string for use inside a double-quoted PromQL label matcher."""
        return (
            value.replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
        )

    def configure_alerts(self, environment: str):
        """Configure the platform-wide alert rules.

        The error-rate and P95-latency limits are taken from
        ``self.default_thresholds`` so the rule expressions and the
        configured thresholds cannot drift apart.

        Args:
            environment: Target environment. The same rules are created for
                every environment; the value is not used in the rules.
        """
        error_rate_percent = self.default_thresholds['error_rate_percent']
        error_ratio = error_rate_percent / 100
        # The duration histogram is recorded in seconds; thresholds are in ms.
        p95_seconds = self.default_thresholds['latency_p95_ms'] / 1000

        common_rules = [
            {
                'name': 'high_error_rate',
                'expr': (
                    'rate(saas_requests_errors_total[5m]) / '
                    f'rate(saas_requests_total[5m]) > {error_ratio:g}'
                ),
                'severity': 'critical',
                'annotations': {
                    'summary': 'High error rate detected',
                    'description': f'Error rate exceeds {error_rate_percent:g}% for 5 minutes'
                }
            },
            {
                'name': 'high_latency',
                'expr': (
                    'histogram_quantile(0.95, '
                    f'rate(saas_request_duration_seconds_bucket[5m])) > {p95_seconds:g}'
                ),
                'severity': 'warning',
                'annotations': {
                    'summary': 'High API latency',
                    'description': f'P95 latency exceeds {p95_seconds:g} seconds'
                }
            },
            {
                'name': 'payment_webhook_failures',
                'expr': 'rate(payment_webhook_failures_total[5m]) > 0.1',
                'severity': 'critical',
                'annotations': {
                    'summary': 'Payment webhook failures',
                    'description': 'Payment processor webhooks failing'
                }
            }
        ]

        for rule in common_rules:
            self._create_alert_rule(rule)

    def configure_tenant_alerts(self, tenant_id: str, plan: str):
        """Configure tenant-specific alerts.

        Args:
            tenant_id: Tenant to create rules for; used in the rule names and
                as the ``tenant_id`` label matcher.
            plan: Plan used to look up thresholds. The returned mapping must
                contain ``max_requests_per_hour``.
        """
        thresholds = self._get_tenant_thresholds(plan)
        tenant_matcher = f'tenant_id="{self._escape_label_value(tenant_id)}"'

        tenant_rules = [
            {
                'name': f'tenant_quota_exceeded_{tenant_id}',
                'expr': f'saas_quota_remaining{{{tenant_matcher}}} < 0',
                'severity': 'warning',
                'labels': {'tenant_id': tenant_id},
                'annotations': {
                    'description': f'Tenant {tenant_id} has exceeded quota'
                }
            },
            {
                'name': f'tenant_high_usage_{tenant_id}',
                # rate() yields requests per second, which cannot be compared
                # with an hourly limit. increase() over 1h gives the request
                # count for the hour; sum() folds the tenant's per-endpoint
                # series into one value.
                'expr': (
                    f'sum(increase(saas_requests_total{{{tenant_matcher}}}[1h])) '
                    f'> {thresholds["max_requests_per_hour"]}'
                ),
                'severity': 'warning',
                'labels': {'tenant_id': tenant_id},
                'annotations': {
                    'description': f'Tenant {tenant_id} approaching rate limit'
                }
            }
        ]

        for rule in tenant_rules:
            self._create_alert_rule(rule)
```

**Real-Time Monitoring Dashboard:**

```python
class MonitoringDashboard:
    """Generate monitoring dashboard configuration."""

    def __init__(self, grafana_client):
        self.grafana = grafana_client

    def create_platform_dashboard(self) -> dict:
        """Create platform-wide monitoring dashboard.

        Builds the dashboard definition from the four row builders below and
        returns whatever ``self.grafana.create_dashboard`` returns.
        """
        dashboard = {
            'title': 'SaaS Platform Overview',
            'tags': ['platform', 'production'],
            'timezone': 'Africa/Lagos',
            'refresh': '30s',
            'panels': [
                self._create_overview_row(),
                self._create_tenant_health_row(),
                self._create_payment_status_row(),
                self._create_infrastructure_row()
            ]
        }

        return self.grafana.create_dashboard(dashboard)

    def _create_overview_row(self) -> dict:
        """Create overview metrics row."""
        return {
            'title': 'Platform Overview',
            'gridPos': {'h': 8, 'w': 24, 'x': 0, 'y': 0},
            'targets': [
                {
                    'expr': 'sum(rate(saas_requests_total[5m]))',
                    'legendFormat': 'Total RPS',
                    'refId': 'A'
                },
                {
                    'expr': 'sum(rate(saas_requests_errors_total[5m])) / sum(rate(saas_requests_total[5m])) * 100',
                    'legendFormat': 'Error Rate %',
                    'refId': 'B'
                },
                {
                    'expr': 'histogram_quantile(0.95, sum(rate(saas_request_duration_seconds_bucket[5m])) by (le))',
                    'legendFormat': 'P95 Latency',
                    'refId': 'C'
                }
            ]
        }

    def _create_tenant_health_row(self) -> dict:
        """Create tenant health monitoring row."""
        return {
            'title': 'Tenant Health',
            'gridPos': {'h': 10, 'w': 24, 'x': 0, 'y': 8},
            'targets': [
                {
                    'expr': 'topk(10, sum(rate(saas_requests_total[5m])) by (tenant_id))',
                    'legendFormat': '{{tenant_id}}',
                    'refId': 'A'
                },
                {
                    'expr': 'topk(10, sum(rate(saas_requests_errors_total[5m])) by (tenant_id))',
                    'legendFormat': 'Errors - {{tenant_id}}',
                    'refId': 'B'
                }
            ]
        }

    def _create_payment_status_row(self) -> dict:
        """Create payment status row.

        Charts the same ``payment_webhook_failures_total`` counter that the
        ``payment_webhook_failures`` alert rule evaluates.
        """
        return {
            'title': 'Payment Status',
            'gridPos': {'h': 8, 'w': 24, 'x': 0, 'y': 18},
            'targets': [
                {
                    'expr': 'sum(rate(payment_webhook_failures_total[5m]))',
                    'legendFormat': 'Webhook failures/s',
                    'refId': 'A'
                }
            ]
        }

    def _create_infrastructure_row(self) -> dict:
        """Create infrastructure row.

        NOTE(review): uses the standard ``process_*`` metrics; swap in the
        metrics of whichever exporters your deployment actually runs.
        """
        return {
            'title': 'Infrastructure',
            'gridPos': {'h': 8, 'w': 24, 'x': 0, 'y': 26},
            'targets': [
                {
                    'expr': 'sum(rate(process_cpu_seconds_total[5m]))',
                    'legendFormat': 'CPU cores used',
                    'refId': 'A'
                },
                {
                    'expr': 'sum(process_resident_memory_bytes)',
                    'legendFormat': 'Resident memory (bytes)',
                    'refId': 'B'
                }
            ]
        }
```

**Common Failure Modes:** Monitoring systems that collect excessive metrics quickly become expensive and difficult to query.
Implement metric cardinality limits and aggregate tenant metrics into bucketed categories (small/medium/large) for platform-level views.

```python
def aggregate_tenant_metrics() -> list:
    """Aggregate tenant metrics for efficient querying.

    Returns:
        Two SQL statements, in order: an hourly roll-up of ``tenant_metrics``
        by tenant size category (small/medium/large), and the definition of
        the ``tenant_metrics_hourly`` materialized view over
        ``usage_records``. The statements are returned, not executed.
    """
    aggregation_queries = [
        # Latency is weighted by request count: a plain avg() of per-row
        # averages would let low-traffic rows skew the result. NULLIF avoids
        # a division by zero for hours with no requests.
        """
        SELECT
            CASE
                WHEN tenant_plan IN ('free', 'starter') THEN 'small'
                WHEN tenant_plan = 'professional' THEN 'medium'
                ELSE 'large'
            END as tenant_category,
            date_trunc('hour', created_at) as hour,
            sum(requests) as total_requests,
            sum(errors) as total_errors,
            sum(avg_latency * requests) / NULLIF(sum(requests), 0) as avg_latency
        FROM tenant_metrics
        GROUP BY 1, 2
        """,
        # IF NOT EXISTS keeps the statement safe to run more than once.
        """
        CREATE MATERIALIZED VIEW IF NOT EXISTS tenant_metrics_hourly AS
        SELECT
            tenant_id,
            date_trunc('hour', created_at) as hour,
            sum(amount) as total_usage,
            count(*) as request_count
        FROM usage_records
        GROUP BY 1, 2
        """
    ]

    return aggregation_queries
```

EXERCISE

Create a monitoring setup that tracks per-tenant AI token usage and generates alerts when any tenant exceeds 80% of their monthly allocation. Implement a notification system that sends Slack alerts for critical issues and email summaries for warning-level alerts. Include a dashboard that shows real-time tenant health rankings with drill-down to individual tenant details.