Cost Modeling — Enterprise-Scale RAG (Chapter 17)

Enterprise RAG costs break down into compute, storage, vector database licensing, and LLM API calls. Accurate modeling prevents budget overruns and identifies optimization opportunities.

The cost breakdown is modeled with the following structure, with one entry per cost center:

from dataclasses import dataclass
from typing import Optional

@dataclass
class CostCenter:
    """Pricing and capacity parameters describing one cost component.

    Instances are plain records; the class defines no behavior beyond what
    ``@dataclass`` generates. Units are not enforced anywhere in the visible
    code, so the unit notes below are assumptions to confirm.
    """

    # Identifier of the cost component.
    name: str
    # Compute cost per hour (presumably USD per hour; confirm).
    hourly_compute_cost: float
    # Storage cost (presumably USD per GB per month; confirm).
    storage_gb_cost_monthly: float
    # Operations per hour; whether this is a capacity limit or an observed
    # volume is not shown here (TODO confirm).
    vector_ops_per_hour: int
    # Name of the model associated with this cost component.
    embedding_model: str
    # Width of the vectors produced by the model.
    embedding_dimension: int

# Registry of known cost centers, keyed by each entry's own ``name`` so the
# lookup key and the record's name cannot drift apart.
COST_CENTERS = {
    center.name: center
    for center in (
        CostCenter(
            name="embeddings",
            hourly_compute_cost=0.50,  # g4dn.xlarge spot
            storage_gb_cost_monthly=0.023,
            vector_ops_per_hour=10000,
            embedding_model="all-MiniLM-L6-v2",
            embedding_dimension=384,
        ),
        CostCenter(
            name="llm_inference",
            hourly_compute_cost=1.20,  # overhead for managed endpoints
            storage_gb_cost_monthly=0.0,
            vector_ops_per_hour=5000,
            # The generation model is recorded in the embedding_model field,
            # with a dimension of 0.
            embedding_model="gpt-4o",
            embedding_dimension=0,
        ),
    )
}

def calculate_monthly_cost(
    num_docs: int,
    avg_doc_length_tokens: int,
    embeddings_per_doc: int,
    monthly_queries: int,
    cache_hit_rate: float,
    vector_db_instances: int,
    *,
    embedding_dimension: int = 384,
) -> dict:
    """Estimate the monthly cost of a RAG system, broken down by component.

    Args:
        num_docs: Number of documents in the corpus.
        avg_doc_length_tokens: Average document length in tokens. Accepted
            for interface compatibility; no returned figure depends on it.
        embeddings_per_doc: Number of embedding vectors produced per document.
        monthly_queries: Number of queries served per month.
        cache_hit_rate: Fraction of queries answered from cache, in [0, 1].
            Cached queries incur no LLM inference cost.
        vector_db_instances: Accepted for interface compatibility; no
            returned figure depends on it.
        embedding_dimension: Number of values per embedding vector, each
            stored as 4 bytes. Defaults to 384.

    Returns:
        A dict with the keys ``embedding_compute``,
        ``llm_uncached_inference``, ``vector_db_storage``, ``vector_db_ops``
        and ``total_monthly`` (the sum of the other four).

    Raises:
        ValueError: If ``cache_hit_rate`` is outside [0, 1]; such a value
            would otherwise yield negative or inflated inference costs.
    """
    if not 0.0 <= cache_hit_rate <= 1.0:
        raise ValueError(
            f"cache_hit_rate must be between 0 and 1, got {cache_hit_rate!r}"
        )

    compute_rate_per_hour = 0.50
    llm_cost_per_query = 0.015  # $0.015/query
    bytes_per_value = 4
    bytes_per_gb = 1024**3
    storage_rate_per_gb = 25  # $25/GB/month (example: Qdrant Cloud pricing)
    ops_unit_price = 0.000006
    ops_multiplier = 1000

    # Embedding generation.
    # NOTE(review): dividing by 3600 models one embedding operation per
    # second. The original comment cited "~50 docs/sec", which would give a
    # much smaller figure; confirm the intended throughput.
    embedding_operations = num_docs * embeddings_per_doc
    embedding_compute_hours = embedding_operations / 3600

    # Only queries that miss the cache reach the LLM.
    uncached_queries = monthly_queries * (1 - cache_hit_rate)

    # Raw vector payload: one fixed-width value per dimension per embedding.
    vector_bytes = embedding_operations * embedding_dimension * bytes_per_value
    vector_storage_gb = vector_bytes / bytes_per_gb

    costs = {
        "embedding_compute": embedding_compute_hours * compute_rate_per_hour,
        "llm_uncached_inference": uncached_queries * llm_cost_per_query,
        "vector_db_storage": vector_storage_gb * storage_rate_per_gb,
        # NOTE(review): this works out to $0.006 per query. The original
        # comment labelled the rate "per 1000 ops", so the intended rate may
        # differ; confirm against current pricing.
        "vector_db_ops": (monthly_queries * ops_unit_price) * ops_multiplier,
    }
    # Derive the total from the components so the two can never disagree.
    costs["total_monthly"] = sum(costs.values())
    return costs

Failure Modes:

Underestimating embedding recalculation: Document updates require re-embedding, which often gets excluded from initial models.
Ignoring network egress: Large result sets pushed to clients across regions incur significant egress costs.
Fixed vs. burst pricing: Production spikes significantly alter actual costs. Model p95 usage, not p50.
Model dimension explosion: Switching embedding models from 384-dim to 1536-dim quadruples vector storage and memory costs (1536 / 384 = 4), because both scale linearly with the embedding dimension.

def cost_per_query_sensitivity(
    base_monthly_queries: int,
    base_cache_hit_rate: float,
    target_cost_reduction: float
) -> tuple[float, float]:
    """Calculate the cache hit rate required to achieve a cost target.

    The baseline is computed with ``calculate_monthly_cost`` for a fixed
    scenario (100,000 documents, 500 tokens each, 20 embeddings per document,
    3 vector database instances). Only LLM inference cost is treated as
    depending on the cache hit rate; every other component is held constant.

    Args:
        base_monthly_queries: Monthly query volume of the baseline.
        base_cache_hit_rate: Cache hit rate of the baseline, in [0, 1].
        target_cost_reduction: Desired fractional reduction of the baseline
            total (for example ``0.2`` for 20% cheaper).

    Returns:
        ``(required_hit_rate, base_cache_hit_rate)``. The required rate is
        clamped to [0, 1], so an unreachable target yields ``1.0``. When
        there are no queries the hit rate cannot affect cost, and the base
        rate is returned unchanged.
    """
    base_cost = calculate_monthly_cost(
        num_docs=100000, avg_doc_length_tokens=500,
        embeddings_per_doc=20, monthly_queries=base_monthly_queries,
        cache_hit_rate=base_cache_hit_rate, vector_db_instances=3
    )
    target_total = base_cost["total_monthly"] * (1 - target_cost_reduction)

    # Costs that scale with cache: LLM inference only.
    llm_cost_per_query = 0.015
    # The baseline total already contains only the *uncached* share of LLM
    # spend, so that is the amount to remove to isolate the fixed costs.
    fixed_costs = base_cost["total_monthly"] - base_cost["llm_uncached_inference"]
    # LLM spend if every query missed the cache (hit rate 0).
    llm_cost_without_cache = base_monthly_queries * llm_cost_per_query

    if llm_cost_without_cache <= 0:
        # No cache-sensitive spend exists; avoid dividing by zero.
        return base_cache_hit_rate, base_cache_hit_rate

    # target_total = fixed_costs + llm_cost_without_cache * (1 - hit_rate)
    # Solve for the uncached fraction first, then invert it.
    required_uncached_fraction = (target_total - fixed_costs) / llm_cost_without_cache
    required_hit_rate = 1 - required_uncached_fraction
    required_hit_rate = max(0.0, min(1.0, required_hit_rate))

    return required_hit_rate, base_cache_hit_rate