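> **Key insight:** Multi-tenant serving architectures must enforce resource fairness explicitly. Without per-tenant quotas and limits, a single tenant's workload — whether through a traffic spike, a misconfiguration, or deliberate abuse — can monopolize the shared GPUs, memory, and CPU that every other tenant depends on.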
### Tenant Isolation with Kubernetes Namespaces
```yaml
# tenant-a-deployment.yaml
# Per-tenant isolation in three objects: a dedicated Namespace, a
# ResourceQuota capping the tenant's aggregate usage, and a LimitRange
# bounding what any single container may claim.
apiVersion: v1
kind: Namespace
metadata:
  name: tenant-a
  labels:
    tenant: tenant-a
---
apiVersion: v1
kind: ResourceQuota
metadata:
  name: tenant-a-quota
  namespace: tenant-a
spec:
  hard:
    # Extended resources such as GPUs cannot be overcommitted, so quota
    # accepts only the "requests." prefix for them (no "limits." entry).
    requests.nvidia.com/gpu: "4"
    requests.memory: "64Gi"
    limits.memory: "64Gi"
    requests.cpu: "16"
    pods: "10"
---
# LimitRange belongs to the core API group, hence apiVersion "v1".
# NOTE: because the quota above tracks CPU and memory, pods that omit
# those requests/limits may be rejected; this LimitRange only supplies
# defaults for GPUs.
apiVersion: v1
kind: LimitRange
metadata:
  name: tenant-a-limits
  namespace: tenant-a
spec:
  limits:
    - type: Container
      default:
        nvidia.com/gpu: 1
      defaultRequest:
        nvidia.com/gpu: 1
      max:
        nvidia.com/gpu: 2
```
### Model Multitenancy with Shared GPU
```python
# multi_tenant_inference.py
import time
from dataclasses import dataclass
from typing import Dict, Optional

import torch
@dataclass
class TenantConfig:
    """Serving settings for a single tenant."""

    tenant_id: str
    # Name of the model that requests from this tenant are served with.
    model_name: str
    max_batch_size: int
    # GPU memory budget; the server compares it (times 1e9) to allocated bytes.
    memory_limit_gb: int
    # Request ceiling the server checks before routing a request.
    rate_limit_rpm: int
class MultiTenantInferenceServer:
    """Route inference requests for several tenants sharing one process.

    Tenant settings are looked up in ``tenants``. Loaded models are cached by
    name in ``model_cache``, so tenants configured with the same model share
    one instance.

    ``_predict``, ``_load_from_disk`` and ``_evict_lru_models`` are called by
    this class but are not defined in the visible snippet; they have to be
    supplied elsewhere (for example by a subclass).
    """

    # Rate limits are expressed per minute, so usage is counted per 60 s window.
    _RATE_WINDOW_SECONDS = 60.0

    def __init__(self):
        self.tenants: Dict[str, TenantConfig] = {}
        # Requests admitted per tenant during that tenant's current window.
        self.active_requests: Dict[str, int] = {}
        # time.monotonic() value at which each tenant's current window began.
        self._window_started: Dict[str, float] = {}
        self.model_cache: Dict[str, torch.nn.Module] = {}

    async def route_request(
        self,
        tenant_id: str,
        request_data: dict
    ) -> dict:
        """Validate, rate-limit and serve one request for ``tenant_id``.

        Args:
            tenant_id: Key of the tenant in ``self.tenants``.
            request_data: Payload handed unchanged to ``_predict``.

        Returns:
            Whatever ``_predict`` returns for the tenant's model.

        Raises:
            ValueError: If the tenant is unknown, or if it has already used
                its ``rate_limit_rpm`` requests in the current window.
        """
        tenant = self.tenants.get(tenant_id)
        if tenant is None:
            raise ValueError(f"Unknown tenant: {tenant_id}")

        # Count the request *before* awaiting, so concurrent requests cannot
        # all slip past the check while earlier ones are still in flight.
        self._admit(tenant_id, tenant.rate_limit_rpm)

        # Memory enforcement.
        # NOTE(review): torch.cuda.memory_allocated() reports the whole
        # process's allocation on the current device, not this tenant's
        # share, so this is a coarse guard rather than per-tenant accounting.
        gpu_memory = torch.cuda.memory_allocated()
        if gpu_memory > (tenant.memory_limit_gb * 1e9):
            self._evict_lru_models(tenant_id)

        # Process with the tenant's assigned model.
        model = self._load_model(tenant.model_name)
        return await self._predict(model, request_data)

    def _admit(self, tenant_id: str, limit_rpm: int) -> None:
        """Record one request for the tenant or raise if it is over its limit.

        Uses a fixed window: the per-tenant counter restarts from zero once
        ``_RATE_WINDOW_SECONDS`` have elapsed since the window began.
        """
        now = time.monotonic()
        started = self._window_started.get(tenant_id)
        if started is None:
            self._window_started[tenant_id] = now
        elif now - started >= self._RATE_WINDOW_SECONDS:
            self._window_started[tenant_id] = now
            self.active_requests[tenant_id] = 0

        # .get() so a tenant's first request does not raise KeyError.
        used = self.active_requests.get(tenant_id, 0)
        if used >= limit_rpm:
            raise ValueError(f"Rate limit exceeded for {tenant_id}")
        self.active_requests[tenant_id] = used + 1

    def _load_model(self, model_name: str) -> torch.nn.Module:
        """Return the cached model for ``model_name``, loading it on a miss."""
        if model_name not in self.model_cache:
            self.model_cache[model_name] = self._load_from_disk(model_name)
        return self.model_cache[model_name]
```
### Isolated Inference with Model Partitioning
```python
# partitioned_inference.py
from typing import Dict, Tuple


class PartitionedInference:
    """GPU memory partitioning for isolated tenant workloads."""

    @staticmethod
    def calculate_partition_sizes(
        total_memory_gb: float,
        tenant_allocations: Dict[str, float]
    ) -> Dict[str, Tuple[float, float]]:
        """Lay tenants out back-to-back in GPU memory by percentage share.

        Tenants are placed in descending order of share, starting at offset 0.

        Args:
            total_memory_gb: Size of the memory being divided, in GB.
            tenant_allocations: Maps tenant id to its share in percent (0-100).

        Returns:
            Maps tenant id to ``(offset_gb, size_gb)``.

        Raises:
            ValueError: If ``total_memory_gb`` or any share is negative, or
                if the shares add up to more than 100 percent (partitions
                would extend past the end of the memory).
        """
        if total_memory_gb < 0:
            raise ValueError(
                f"total_memory_gb must be non-negative, got {total_memory_gb}"
            )
        negative = [t for t, pct in tenant_allocations.items() if pct < 0]
        if negative:
            raise ValueError(f"Negative allocation for tenants: {negative}")
        total_pct = sum(tenant_allocations.values())
        # Small tolerance so shares like 33.3 + 33.3 + 33.4 are not rejected
        # because of floating-point rounding.
        if total_pct > 100 + 1e-9:
            raise ValueError(
                f"Tenant allocations sum to {total_pct}%, which exceeds 100%"
            )

        partitions: Dict[str, Tuple[float, float]] = {}
        current_offset_gb = 0.0
        for tenant_id, allocation_pct in sorted(
            tenant_allocations.items(),
            key=lambda item: item[1],
            reverse=True
        ):
            partition_size = total_memory_gb * allocation_pct / 100
            partitions[tenant_id] = (current_offset_gb, partition_size)
            current_offset_gb += partition_size
        return partitions

    def allocate_tenant_memory(
        self,
        tenant_id: str,
        partition_start: float,
        partition_size: float
    ):
        """Placeholder for binding a tenant to its memory partition.

        Currently a no-op: nothing is allocated or enforced. In production,
        use a custom CUDA memory allocator that respects tenant boundaries.
        """
```
### Tenant Billing Metrics
```python
# tenant_billing.py
from prometheus_client import Counter
# GPU compute time, labelled by tenant and model.
tenant_compute_usage = Counter(
    name='tenant_gpu_compute_seconds_total',
    documentation='Total GPU compute time per tenant',
    labelnames=['tenant_id', 'model_name'],
)

# Request count, labelled by tenant and outcome status.
tenant_request_count = Counter(
    name='tenant_requests_total',
    documentation='Total requests per tenant',
    labelnames=['tenant_id', 'status'],
)
def generate_tenant_invoice(
    tenant_id: str,
    period_days: int,
    *,
    compute_rate_per_second: float = 0.0001,
    rate_per_request: float = 0.0002,
) -> dict:
    """Generate a billing report for one tenant.

    Usage is read through ``get_metric_sum`` for the trailing
    ``period_days`` days and priced at flat per-unit rates.

    Args:
        tenant_id: Value of the ``tenant_id`` label to bill for.
        period_days: Length of the billing window in days; must be positive.
        compute_rate_per_second: Price per GPU-second. The default of
            0.0001 is equivalent to $0.36 per GPU-hour.
        rate_per_request: Price per request. The default of 0.0002 is
            equivalent to $0.20 per 1000 requests.

    Returns:
        A dict with ``tenant_id``, ``compute_seconds``, ``request_count``,
        ``compute_cost``, ``request_cost`` and ``total_cost``.

    Raises:
        ValueError: If ``period_days`` is not positive.
    """
    # A zero or negative window would produce a meaningless '0d' / '-3d' query.
    if period_days <= 0:
        raise ValueError(f"period_days must be positive, got {period_days}")

    period = f'{period_days}d'
    compute_seconds = get_metric_sum(
        'tenant_gpu_compute_seconds_total',
        labels={'tenant_id': tenant_id},
        period=period
    )
    requests = get_metric_sum(
        'tenant_requests_total',
        labels={'tenant_id': tenant_id},
        period=period
    )

    # Flat per-unit pricing (no tiers are applied here).
    compute_cost = compute_seconds * compute_rate_per_second
    request_cost = requests * rate_per_request
    return {
        'tenant_id': tenant_id,
        'compute_seconds': compute_seconds,
        'request_count': requests,
        'compute_cost': compute_cost,
        'request_cost': request_cost,
        'total_cost': compute_cost + request_cost
    }
```