KEY INSIGHT
Multi-tenant monitoring requires tenant-context isolation in metrics while providing platform-wide views, with alerting thresholds calibrated for Nigerian network conditions where latency spikes are common.
Monitoring a multi-tenant SaaS requires balancing detailed per-tenant visibility with system-wide health monitoring, all while keeping costs manageable for Nigerian operations.
```python
from dataclasses import dataclass
from typing import Optional
import time
from prometheus_client import Counter, Histogram, Gauge
@dataclass
class TenantMetrics:
    """Metrics container for a single tenant.

    Groups the Prometheus metric objects that are updated on behalf of a
    tenant so they can be passed around as one unit.
    """

    # Count of all requests handled.
    requests_total: Counter
    # Count of requests that ended in an error.
    requests_errors: Counter
    # Distribution of request durations.
    request_duration: Histogram
    # Count of AI tokens consumed.
    ai_tokens_used: Counter
    # Quota still available.
    quota_remaining: Gauge
class MultiTenantMetricsCollector:
    """Collect and manage metrics across tenants.

    Every metric family is registered exactly once and shared by all
    tenants; tenants are distinguished by label values. Registering a
    family per tenant would fail on the second tenant because the metric
    names would collide in the Prometheus registry.
    """

    def __init__(self):
        # tenant_id -> TenantMetrics handle handed out by get_tenant_metrics().
        self._metrics_cache = {}
        # Labels carried by every metric family.
        self._tenant_labels = ['tenant_id', 'plan', 'region']
        # Request-scoped families are additionally broken down by endpoint.
        # Pass a route template (not a raw URL) as the endpoint to keep
        # label cardinality bounded.
        self._request_labels = self._tenant_labels + ['endpoint']
        # Created lazily so that constructing a collector registers nothing.
        self._families = None

    def _get_families(self) -> TenantMetrics:
        """Return the shared metric families, registering them on first use."""
        if self._families is None:
            self._families = TenantMetrics(
                requests_total=Counter(
                    'saas_requests_total',
                    'Total requests',
                    self._request_labels,
                ),
                requests_errors=Counter(
                    'saas_requests_errors_total',
                    'Total errors',
                    self._request_labels,
                ),
                request_duration=Histogram(
                    'saas_request_duration_seconds',
                    'Request duration',
                    self._request_labels,
                    buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0],
                ),
                ai_tokens_used=Counter(
                    'saas_ai_tokens_total',
                    'AI tokens consumed',
                    self._tenant_labels,
                ),
                quota_remaining=Gauge(
                    'saas_quota_remaining',
                    'Remaining quota',
                    self._tenant_labels,
                ),
            )
        return self._families

    def get_tenant_metrics(self, tenant_id: str, plan: str) -> TenantMetrics:
        """Get or create metrics for a tenant.

        Args:
            tenant_id: Identifier of the tenant; used as the cache key.
            plan: The tenant's plan. Not used for the lookup itself.

        Returns:
            The TenantMetrics handle for the tenant. All tenants share the
            same underlying metric families; callers select a tenant's
            series with ``.labels(...)``.
        """
        if tenant_id not in self._metrics_cache:
            self._metrics_cache[tenant_id] = self._get_families()
        return self._metrics_cache[tenant_id]

    def record_request(
        self,
        tenant_id: str,
        plan: str,
        endpoint: str,
        status: int,
        duration: float,
        region: str = 'unknown',
    ):
        """Record metrics for a request.

        Args:
            tenant_id: Identifier of the tenant that made the request.
            plan: The tenant's plan.
            endpoint: Endpoint that served the request.
            status: HTTP status code; values of 400 and above count as errors.
            duration: Request duration, observed into the
                ``saas_request_duration_seconds`` histogram.
            region: Value for the ``region`` label; defaults to ``'unknown'``
                so existing callers keep working.
        """
        metrics = self.get_tenant_metrics(tenant_id, plan)
        # Keys must match self._request_labels exactly, otherwise
        # prometheus_client rejects the .labels() call.
        labels = {
            'tenant_id': tenant_id,
            'plan': plan,
            'region': region,
            'endpoint': endpoint,
        }
        metrics.requests_total.labels(**labels).inc()
        if status >= 400:
            metrics.requests_errors.labels(**labels).inc()
        metrics.request_duration.labels(**labels).observe(duration)
```
**Alert Configuration for Nigerian Conditions:**
```python
from typing import Callable
class AlertingManager:
    """Manage alerts with tenant-aware thresholds.

    NOTE(review): ``_create_alert_rule`` and ``_get_tenant_thresholds`` are
    called by the methods below but are not defined in this snippet; they
    must be provided elsewhere before these methods can run.
    """

    def __init__(self, alert_manager_client):
        """Store the alert-manager client and the platform-wide thresholds."""
        self.client = alert_manager_client
        self.default_thresholds = {
            'error_rate_percent': 5.0,
            'latency_p95_ms': 2000,
            'latency_p99_ms': 5000,
            'quota_usage_percent': 90,
            'payment_failure_rate_percent': 10
        }

    @staticmethod
    def _escape_label_value(value: str) -> str:
        """Escape a value for use inside a double-quoted PromQL label matcher."""
        return (
            str(value)
            .replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
        )

    def configure_alerts(self, environment: str):
        """Configure alert rules for environment.

        Args:
            environment: Target environment name. Currently unused; the same
                rules are created for every environment.
        """
        # Express the configured thresholds in the units the queries use:
        # a 0-1 ratio for error rate and seconds for latency.
        error_pct = self.default_thresholds['error_rate_percent']
        p95_seconds = self.default_thresholds['latency_p95_ms'] / 1000
        common_rules = [
            {
                'name': 'high_error_rate',
                'expr': (
                    'rate(saas_requests_errors_total[5m]) / '
                    f'rate(saas_requests_total[5m]) > {error_pct / 100:g}'
                ),
                'severity': 'critical',
                'annotations': {
                    'summary': 'High error rate detected',
                    'description': f'Error rate exceeds {error_pct:g}% for 5 minutes'
                }
            },
            {
                'name': 'high_latency',
                'expr': (
                    'histogram_quantile(0.95, '
                    f'rate(saas_request_duration_seconds_bucket[5m])) > {p95_seconds:g}'
                ),
                'severity': 'warning',
                'annotations': {
                    'summary': 'High API latency',
                    'description': f'P95 latency exceeds {p95_seconds:g} seconds'
                }
            },
            {
                # Absolute failures-per-second threshold; not derived from
                # 'payment_failure_rate_percent' because no matching total
                # counter is referenced here to form a ratio.
                'name': 'payment_webhook_failures',
                'expr': 'rate(payment_webhook_failures_total[5m]) > 0.1',
                'severity': 'critical',
                'annotations': {
                    'summary': 'Payment webhook failures',
                    'description': 'Payment processor webhooks failing'
                }
            }
        ]
        for rule in common_rules:
            self._create_alert_rule(rule)

    def configure_tenant_alerts(self, tenant_id: str, plan: str):
        """Configure tenant-specific alerts.

        Args:
            tenant_id: Tenant the rules are scoped to.
            plan: Plan used to look up the tenant's thresholds; the result
                must contain a ``'max_requests_per_hour'`` entry.
        """
        thresholds = self._get_tenant_thresholds(plan)
        # Escape the id so quotes or backslashes cannot break out of the matcher.
        matcher = f'tenant_id="{self._escape_label_value(tenant_id)}"'
        tenant_rules = [
            {
                'name': f'tenant_quota_exceeded_{tenant_id}',
                'expr': f'saas_quota_remaining{{{matcher}}} < 0',
                'severity': 'warning',
                'labels': {'tenant_id': tenant_id},
                'annotations': {
                    'description': f'Tenant {tenant_id} has exceeded quota'
                }
            },
            {
                'name': f'tenant_high_usage_{tenant_id}',
                # increase() yields the request count over the hour, which is
                # what a per-hour limit must be compared with; rate() would
                # yield requests per second. sum() merges the tenant's series.
                'expr': (
                    f'sum(increase(saas_requests_total{{{matcher}}}[1h])) > '
                    f'{thresholds["max_requests_per_hour"]}'
                ),
                'severity': 'warning',
                'labels': {'tenant_id': tenant_id},
                'annotations': {
                    'description': f'Tenant {tenant_id} approaching rate limit'
                }
            }
        ]
        for rule in tenant_rules:
            self._create_alert_rule(rule)
```
**Real-Time Monitoring Dashboard:**
```python
class MonitoringDashboard:
    """Generate monitoring dashboard configuration.

    NOTE(review): ``create_platform_dashboard`` calls
    ``_create_payment_status_row`` and ``_create_infrastructure_row``, which
    are not defined in this snippet and must be provided elsewhere.
    """

    def __init__(self, grafana_client):
        """Keep the client used to submit dashboards."""
        self.grafana = grafana_client

    def create_platform_dashboard(self) -> dict:
        """Create platform-wide monitoring dashboard.

        Returns:
            Whatever ``grafana.create_dashboard`` returns for the assembled
            dashboard definition.
        """
        dashboard = {
            'title': 'SaaS Platform Overview',
            'tags': ['platform', 'production'],
            'timezone': 'Africa/Lagos',
            'refresh': '30s',
            'panels': [
                self._create_overview_row(),
                self._create_tenant_health_row(),
                self._create_payment_status_row(),
                self._create_infrastructure_row()
            ]
        }
        return self.grafana.create_dashboard(dashboard)

    def _create_overview_row(self) -> dict:
        """Create overview metrics row.

        Covers total request rate, error percentage and P95 latency, each
        aggregated across all tenants.
        """
        return {
            'title': 'Platform Overview',
            'gridPos': {'h': 8, 'w': 24, 'x': 0, 'y': 0},
            'targets': [
                {
                    'expr': 'sum(rate(saas_requests_total[5m]))',
                    'legendFormat': 'Total RPS',
                    'refId': 'A'
                },
                {
                    'expr': 'sum(rate(saas_requests_errors_total[5m])) / sum(rate(saas_requests_total[5m])) * 100',
                    'legendFormat': 'Error Rate %',
                    'refId': 'B'
                },
                {
                    'expr': 'histogram_quantile(0.95, sum(rate(saas_request_duration_seconds_bucket[5m])) by (le))',
                    'legendFormat': 'P95 Latency',
                    'refId': 'C'
                }
            ]
        }

    def _create_tenant_health_row(self) -> dict:
        """Create tenant health monitoring row.

        Shows the ten tenants with the highest request rate and the ten with
        the highest error rate.
        """
        return {
            'title': 'Tenant Health',
            'gridPos': {'h': 10, 'w': 24, 'x': 0, 'y': 8},
            'targets': [
                {
                    'expr': 'topk(10, sum(rate(saas_requests_total[5m])) by (tenant_id))',
                    'legendFormat': '{{tenant_id}}',
                    'refId': 'A'
                },
                {
                    'expr': 'topk(10, sum(rate(saas_requests_errors_total[5m])) by (tenant_id))',
                    'legendFormat': 'Errors - {{tenant_id}}',
                    'refId': 'B'
                }
            ]
        }
```
**Common Failure Modes:**
Monitoring systems that collect excessive metrics quickly become expensive and difficult to query. Implement metric cardinality limits and aggregate tenant metrics into bucketed categories derived from the tenant's plan (small/medium/enterprise) for platform-level views.
```python
def aggregate_tenant_metrics():
    """Aggregate tenant metrics for efficient querying.

    Returns:
        A list of two SQL statements, in order:

        1. An hourly roll-up of ``tenant_metrics`` grouped by plan-derived
           category ('small', 'medium' or 'enterprise').
        2. A ``CREATE MATERIALIZED VIEW`` statement that builds
           ``tenant_metrics_hourly`` from ``usage_records``.

        The statements are only returned, not executed.
    """
    # Latency is weighted by each row's request count: a plain
    # avg(avg_latency) would give a row with one request the same weight as
    # a row with thousands. nullif() avoids division by zero for hours with
    # no requests (the result is NULL in that case).
    category_rollup = """
        SELECT
            CASE
                WHEN tenant_plan IN ('free', 'starter') THEN 'small'
                WHEN tenant_plan = 'professional' THEN 'medium'
                ELSE 'enterprise'
            END as tenant_category,
            date_trunc('hour', created_at) as hour,
            sum(requests) as total_requests,
            sum(errors) as total_errors,
            sum(avg_latency * requests) / nullif(sum(requests), 0) as avg_latency
        FROM tenant_metrics
        GROUP BY 1, 2
    """
    # NOTE(review): this statement fails if the view already exists, so it
    # is presumably meant to run once — confirm with the caller.
    hourly_usage_view = """
        CREATE MATERIALIZED VIEW tenant_metrics_hourly AS
        SELECT
            tenant_id,
            date_trunc('hour', created_at) as hour,
            sum(amount) as total_usage,
            count(*) as request_count
        FROM usage_records
        GROUP BY 1, 2
    """
    aggregation_queries = [category_rollup, hourly_usage_view]
    return aggregation_queries
```