23. Scale Planning

Chapter 23 of 24 · 20 min

KEY INSIGHT

# scale_planning.py
"""Scale planning for local AI products.

Scale for local AI products means different things than cloud scale. You're
not scaling API calls—you're scaling the diversity of hardware configurations
your product runs on, the efficiency of model distribution, and the
reliability of offline-first architecture. True scale planning for local AI
covers three things: model compression techniques that expand your reachable
market, efficient update mechanisms that work on limited bandwidth, and
fallback strategies when constraints are extreme.
"""
# math, field and Optional are currently unused in this module.
import math
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple


@dataclass
class MarketSegment:
    """A hardware tier of the addressable market."""

    name: str
    # (min, max) system RAM in GB for machines in this tier.
    ram_range_gb: Tuple[float, float]
    # (min, max) GPU VRAM in GB; (0, 0) means no dedicated GPU memory.
    vram_range_gb: Tuple[float, float]
    # Share of the total addressable market, as a fraction (0.15 == 15%).
    target_percentage: float
    primary_use_cases: List[str]


@dataclass
class ScalingRequirement:
    """Infrastructure and support needs for serving one market segment.

    Not populated by ``LocalModelScalePlanner`` in this module; it is
    available as a record type for per-segment plans.
    """

    segment: MarketSegment
    recommended_model: str
    estimated_users: int
    disk_storage_tb: float
    bandwidth_requirements_gbps: float
    support_load_tickets_per_1000_users: float


@dataclass
class ModelCompressionTarget:
    """A planned compression of one model and the segments it serves."""

    original_model: str
    # e.g. 'quantization', 'pruning', 'distillation'
    compression_type: str
    target_size_gb: float
    original_size_gb: float
    quality_retention_pct: float
    supported_segments: List[str]


class LocalModelScalePlanner:
    """Back-of-the-envelope scale planning for a locally run AI product.

    All figures are rough planning estimates built from the constants
    below; override the class attributes in a subclass to plug in your own
    numbers.
    """

    # Weighted-average download size of a model, in GB.
    AVERAGE_MODEL_SIZE_GB = 15
    # Model downloads per user per month (0.3 is roughly one per quarter).
    MONTHLY_DOWNLOADS_PER_USER = 0.3
    # Number of CDN regions the content is replicated to.
    CDN_REGIONS = 4
    # Length of the 30-day month used to turn monthly volume into a rate.
    SECONDS_PER_MONTH = 30 * 86400
    # Estimated value of feedback from one beta user.
    FEEDBACK_VALUE_PER_BETA_USER = 0.5
    # Estimated support cost avoided per beta user by finding issues early.
    SUPPORT_SAVINGS_PER_BETA_USER = 2

    def __init__(self, current_user_count: int, model_configs: Dict):
        """Store the current user base and model configs, and build segments.

        Args:
            current_user_count: Number of users the product has today.
            model_configs: Model configuration mapping; stored as-is and not
                interpreted by this class.
        """
        self.current_users = current_user_count
        self.model_configs = model_configs
        self.market_segments = self._define_segments()

    def _define_segments(self) -> List[MarketSegment]:
        """Define market segments based on hardware tiers.

        This informs compression strategy and model recommendations. The
        four ``target_percentage`` values sum to 1.0.
        """
        return [
            MarketSegment(
                name="enthusiast",
                ram_range_gb=(16, 128),
                vram_range_gb=(8, 80),
                target_percentage=0.15,
                primary_use_cases=[
                    "code generation",
                    "creative writing",
                    "analysis",
                ],
            ),
            MarketSegment(
                name="professional",
                ram_range_gb=(8, 16),
                vram_range_gb=(0, 8),
                target_percentage=0.35,
                primary_use_cases=[
                    "document processing",
                    "summarization",
                    "classification",
                ],
            ),
            MarketSegment(
                name="mobile_professional",
                ram_range_gb=(4, 8),
                vram_range_gb=(0, 4),
                target_percentage=0.40,
                primary_use_cases=["drafting", "extraction", "formatting"],
            ),
            MarketSegment(
                name="constrained",
                ram_range_gb=(2, 4),
                vram_range_gb=(0, 0),
                target_percentage=0.10,
                primary_use_cases=["simple queries", "short generations"],
            ),
        ]

    def calculate_distribution_scale(self, target_users: int) -> Dict:
        """Calculate infrastructure requirements for reaching target users.

        Args:
            target_users: The user count being planned for.

        Returns:
            A dict with ``content_delivery``, ``support`` and
            ``model_updates`` sections. Every user-dependent figure is
            derived from ``target_users``.
        """
        model_size_gb = self.AVERAGE_MODEL_SIZE_GB
        downloads_per_user = self.MONTHLY_DOWNLOADS_PER_USER
        regions = self.CDN_REGIONS
        # Compute the ticket rate once; it feeds two entries below.
        ticket_rate = self.estimate_support_tickets()
        monthly_transfer_gb = target_users * model_size_gb * downloads_per_user

        return {
            "content_delivery": {
                "average_model_size_gb": model_size_gb,
                "monthly_downloads": target_users * downloads_per_user,
                # GB/month -> Gbit/s. This spreads the month's transfer
                # evenly over 30 days, so it is the sustained average rate;
                # real peaks will be higher. Key name kept for callers.
                "peak_bandwidth_gbps": (
                    monthly_transfer_gb / self.SECONDS_PER_MONTH * 8
                ),
                "cdn_regions_needed": regions,
            },
            "support": {
                "ticket_rate_per_1000": ticket_rate,
                "monthly_tickets_at_scale": (target_users / 1000) * ticket_rate,
                "knowledge_base_articles_needed": 25,
            },
            "model_updates": {
                "update_frequency_monthly": 1,
                # Sized from target_users (not the current user count) so
                # the figure scales with the plan, like everything else.
                "total_storage_tb_per_region": (
                    target_users * model_size_gb / 1000 / regions
                ),
                "download_pacing_recommended": True,
            },
        }

    def estimate_support_tickets(self) -> float:
        """Estimate the support ticket rate per 1000 users.

        Local AI products have higher initial support (installation issues)
        but lower ongoing support (no server-side errors to troubleshoot).
        """
        base_rate = 5  # Per 1000 users
        # Installation complexity adds to initial tickets.
        complexity_factor = 1.5
        # Good onboarding reduces ongoing tickets.
        onboarding_quality_factor = 0.7
        return base_rate * complexity_factor * onboarding_quality_factor

    def plan_compression_roadmap(
        self, target_segments: List[str]
    ) -> List[ModelCompressionTarget]:
        """Plan compression investments to expand addressable market.

        Each compression unlocks new segments. A 7b target is planned when
        "constrained" is requested and a 13b target when "professional" is
        requested; other segment names add nothing.

        Args:
            target_segments: Names of the segments to reach.

        Returns:
            The planned targets, 7b first, or an empty list.
        """
        compression_targets = []

        if "constrained" in target_segments:
            compression_targets.append(
                ModelCompressionTarget(
                    original_model="7b",
                    compression_type="int4_quantization",
                    target_size_gb=1.8,
                    original_size_gb=4.0,
                    quality_retention_pct=85,
                    supported_segments=["constrained", "mobile_professional"],
                )
            )

        if "professional" in target_segments:
            compression_targets.append(
                ModelCompressionTarget(
                    original_model="13b",
                    compression_type="int4_quantization",
                    target_size_gb=8.0,
                    original_size_gb=26.0,
                    quality_retention_pct=90,
                    supported_segments=["professional", "enthusiast"],
                )
            )

        return compression_targets

    def calculate_beta_program_roi(
        self, beta_users: int, fte_cost_per_user: float
    ) -> Dict:
        """Calculate ROI of beta program vs production scaling.

        Args:
            beta_users: Number of users in the beta program.
            fte_cost_per_user: Staff cost of supporting one beta user.

        Returns:
            A dict with the program cost, the two benefit estimates, the
            net ROI as a fraction of cost (0 when the cost is not positive)
            and a "recommended" / "reconsider" verdict.
        """
        beta_costs = beta_users * fte_cost_per_user
        beta_value = beta_users * self.FEEDBACK_VALUE_PER_BETA_USER
        support_savings = beta_users * self.SUPPORT_SAVINGS_PER_BETA_USER
        total_benefit = beta_value + support_savings

        # Guard the division: a free (or empty) program has no ROI ratio.
        net_roi = (total_benefit - beta_costs) / beta_costs if beta_costs > 0 else 0

        return {
            "beta_program_cost": beta_costs,
            "beta_value_generated": beta_value,
            "support_cost_savings": support_savings,
            "net_roi": net_roi,
            # Judge on the same total benefit that net_roi uses, so a
            # program with positive ROI is never labelled "reconsider".
            "recommendation": (
                "recommended" if total_benefit > beta_costs else "reconsider"
            ),
        }

    def create_caching_strategy(self) -> Dict:
        """Design caching strategy for model distribution.

        Key insight: local models are static content—perfect for CDN
        caching.
        """
        return {
            "model_artifacts": {
                "cache_ttl_days": 365,  # Models are immutable
                "strategy": "cacheforever_hashbased",
                "regional_replication": True,
            },
            "application_updates": {
                "cache_ttl_days": 7,
                "strategy": "stalewhilerevalidate",
                "force_update_threshold": 0.8,
            },
            "benchmark_results": {
                "cache_ttl_hours": 24,
                "strategy": "runtimedepends",
                "dynamic_calculation": True,
            },
        }


# Scale planning reveals product architecture decisions. If your model sizes
# require 30GB downloads, you've segmented out mobile professionals.
# Compression investments directly expand market reach.

EXERCISE

Create a scale planning model that calculates infrastructure requirements for 1,000x user growth. Include CDN costs, support scaling, model update bandwidth, and compression investment ROI. Define a phased rollout plan that matches infrastructure investment to user growth.