21. Model Sizing

Chapter 21 of 24 · 30 min

Model sizing means choosing the balance between parameter count, compute, and quality. A model that is too large for its budget wastes resources; one that is too small underperforms.

Determining Optimal Model Size

def estimate_model_size(d_model, n_layers, vocab_size, n_kv_heads=None, n_heads=32):
    """
    Estimate total parameters for a decoder-only transformer.

    Counts the token embedding table, the four attention projections
    (Q, K, V, O), a two-matrix feed-forward block with 4x expansion, and
    one gain vector per normalization layer (two per transformer layer plus
    one final norm). Biases and a separate output head are not counted.

    Args:
        d_model: Hidden size of the model.
        n_layers: Number of transformer layers.
        vocab_size: Number of rows in the embedding table.
        n_kv_heads: Number of key/value heads. A falsy value (None or 0)
            falls back to n_heads, i.e. standard multi-head attention.
        n_heads: Number of query heads.

    Returns:
        Dict with the per-component counts under 'embedding', 'attention',
        'ffn' and 'layer_norms', plus their sum under 'total'.
    """
    # Floor division: if d_model is not a multiple of n_heads the attention
    # count below is based on the truncated head size.
    head_dim = d_model // n_heads
    kv_heads = n_kv_heads if n_kv_heads else n_heads

    # Embedding
    embedding_params = vocab_size * d_model

    # Attention projections, per layer:
    #   Q: d_model -> n_heads * head_dim
    #   K: d_model -> kv_heads * head_dim
    #   V: d_model -> kv_heads * head_dim
    #   O: n_heads * head_dim -> d_model
    q_params = n_heads * d_model * head_dim
    k_params = kv_heads * d_model * head_dim
    v_params = kv_heads * d_model * head_dim
    o_params = n_heads * head_dim * d_model
    attn_params = (q_params + k_params + v_params + o_params) * n_layers

    # FFN: up- and down-projection with 4x expansion
    d_ff = d_model * 4
    ffn_params = (d_model * d_ff + d_ff * d_model) * n_layers

    # Norms: 2 per layer + 1 final, each holding d_model gains. The final
    # norm is counted the same way as the per-layer norms.
    ln_params = (2 * n_layers + 1) * d_model

    return {
        'embedding': embedding_params,
        'attention': attn_params,
        'ffn': ffn_params,
        'layer_norms': ln_params,
        'total': embedding_params + attn_params + ffn_params + ln_params
    }

def format_params(n):
    """Render a parameter count compactly, using a T, B or M suffix."""
    # Checked from largest to smallest so the first match is the right unit.
    for scale, suffix in ((1e12, "T"), (1e9, "B"), (1e6, "M")):
        if n >= scale:
            return f"{n / scale:.1f}{suffix}"
    # Below one million: print the count itself, rounded to a whole number.
    return f"{n:.0f}"

# Example: Llama 7B-style shape (4096 wide, 32 layers, 32k vocabulary).
# With 8 KV heads against 32 query heads this is a grouped-query setup.
size = estimate_model_size(4096, 32, 32000, n_kv_heads=8, n_heads=32)
print(f"Total parameters: {format_params(size['total'])}")

Compute Budget Allocation

def allocate_compute_budget(total_flops: float, config: dict) -> dict:
    """
    Allocate compute budget across architecture components.

    Splits total_flops between attention and feed-forward work in
    proportion to a rough per-token cost model.

    Args:
        total_flops: Total FLOPs to distribute.
        config: Architecture settings. Must contain 'n_layers' and
            'd_model'; 'd_ff' is optional and defaults to 4 * d_model.

    Returns:
        Dict with 'attention_flops' and 'ffn_flops' (which sum to
        total_flops) and 'ratio', the attention-to-FFN cost ratio.

    Raises:
        KeyError: If 'n_layers' or 'd_model' is missing from config.
    """
    n_layers = config['n_layers']
    d_model = config['d_model']
    d_ff = config.get('d_ff', d_model * 4)

    # Approximate costs summed over all layers:
    #   attention ~ 4 * d_model * d_model * seq_len * n_layers
    #   ffn       ~ 6 * d_model * d_ff    * seq_len * n_layers
    # seq_len multiplies both terms equally, so it is left out: only the
    # relative weight of the two terms matters for the split.
    attn_cost = 4 * d_model * d_model * n_layers
    ffn_cost = 6 * d_model * d_ff * n_layers

    # Share of the total attributed to attention
    attn_fraction = attn_cost / (attn_cost + ffn_cost)

    return {
        'attention_flops': total_flops * attn_fraction,
        'ffn_flops': total_flops * (1 - attn_fraction),
        'ratio': attn_fraction / (1 - attn_fraction)
    }

def optimize_size_for_budget(compute_budget: float,
                             target_tokens: int,
                             quality_weight: float = 0.5) -> list:
    """
    Rank candidate model shapes that fit within a training compute budget.

    Every (n_layers, d_model) pair from a fixed grid is sized with
    estimate_model_size (vocabulary 32000). Shapes whose estimated training
    compute, 6 * params * target_tokens, exceeds compute_budget are dropped.

    Args:
        compute_budget: Available training FLOPs.
        target_tokens: Number of training tokens; must be positive.
        quality_weight: Accepted for interface compatibility.
            NOTE(review): not used in the scoring yet.

    Returns:
        List of candidate dicts sorted by 'efficiency', best first. Each
        dict has 'n_layers', 'd_model', 'params', 'quality_estimate',
        'efficiency' and 'compute_utilization'. Empty if nothing fits.

    Raises:
        ValueError: If target_tokens is not positive (the utilization
            ratio used for scoring would be zero or negative).
    """
    if target_tokens <= 0:
        raise ValueError("target_tokens must be positive")

    candidates = []

    for n_layers in [8, 12, 16, 24, 32, 40]:
        for d_model in [2048, 2560, 3072, 4096, 5120]:
            # Estimate parameters
            params = estimate_model_size(d_model, n_layers, 32000)['total']

            # Training compute: 6 * params * tokens
            required_compute = 6 * params * target_tokens
            if required_compute > compute_budget:
                continue

            # Fraction of the budget this shape would consume
            utilization = required_compute / compute_budget

            # Simplified quality proxy: grows with size, with strongly
            # diminishing returns.
            quality = (params / 1e9) ** 0.2

            candidates.append({
                'n_layers': n_layers,
                'd_model': d_model,
                'params': params,
                'quality_estimate': quality,
                'efficiency': quality / utilization,
                'compute_utilization': utilization
            })

    return sorted(candidates, key=lambda x: x['efficiency'], reverse=True)

Depth vs Width Tradeoffs

def compare_depth_width(total_params: float,
                        tokenizer_vocab: int = 32000) -> list:
    """
    Compare several depth/width configurations of a decoder-only transformer.

    The shapes are derived from a 4096-wide, 32-layer baseline: ratios up
    to 1.0 keep the width and scale only the depth, larger ratios scale
    both width and depth by sqrt(ratio).

    Args:
        total_params: Intended parameter budget.
            NOTE(review): not used; the generated shapes are fixed and do
            not share a parameter count.
        tokenizer_vocab: Vocabulary size passed to estimate_model_size.

    Returns:
        List of dicts, one per ratio, with 'd_model', 'n_layers', 'params',
        'depth_ratio', 'quality_estimate' and 'memory_per_token_gb'.
    """
    base_d_model = 4096
    base_layers = 32
    configurations = []

    for depth_ratio in [0.5, 1.0, 1.5, 2.0]:
        # Rule of thumb behind the grid: transformer params scale roughly as
        # 12 * d_model^2 * n_layers, with the vocabulary term comparatively
        # small. Practical shapes are used instead of solving that exactly.
        if depth_ratio <= 1.0:
            d_model = base_d_model
            n_layers = int(base_layers * depth_ratio)
        else:
            scale = depth_ratio ** 0.5
            d_model = int(base_d_model * scale)
            n_layers = int(base_layers * scale)

        params = estimate_model_size(d_model, n_layers, tokenizer_vocab)['total']

        # Rough quality proxy: depth is weighted more heavily than width.
        quality_depth = n_layers ** 0.1
        quality_width = d_model ** 0.05

        configurations.append({
            'd_model': d_model,
            'n_layers': n_layers,
            'params': params,
            'depth_ratio': depth_ratio,
            'quality_estimate': quality_depth * quality_width,
            # Per-token figure, so the sequence length is 1. The vocabulary
            # size is unrelated to KV-cache size and must not be passed here.
            'memory_per_token_gb': estimate_kv_memory(d_model, n_layers, seq_len=1)
        })

    return configurations

def estimate_kv_memory(d_model, n_layers, seq_len, n_kv_heads=8, head_dim=128,
                       bytes_per_element=2):
    """
    Estimate KV cache memory in GB (1e9 bytes) for seq_len tokens.

    Args:
        d_model: Unused; kept so existing callers keep working.
        n_layers: Number of transformer layers.
        seq_len: Number of cached tokens. Pass 1 for a per-token figure.
        n_kv_heads: Number of key/value heads.
        head_dim: Dimension of each head.
        bytes_per_element: Storage size of one cached value; the default
            of 2 corresponds to bf16/fp16.

    Returns:
        Estimated cache size in GB.
    """
    # Each token stores one key and one value vector (the factor 2) of
    # n_kv_heads * head_dim elements in every layer. K/V is counted once.
    bytes_per_token = 2 * n_layers * n_kv_heads * head_dim * bytes_per_element
    return bytes_per_token / 1e9 * seq_len

Failure Mode: Ignoring Vocabulary Size

# BUG: Forgetting embedding parameters
def incorrect_param_estimate(d_model, n_layers):
    """
    Deliberately incomplete estimate: transformer layers only.

    Counts 12 * d_model^2 weights per layer (4 * d_model^2 for the
    attention projections plus 8 * d_model^2 for a 4x-expansion FFN) and
    leaves out the embedding parameters, which is the failure mode this
    example illustrates.
    """
    # Only counting transformer params; attention and FFN are each counted
    # exactly once inside the factor 12.
    return 12 * d_model * d_model * n_layers

# For vocab=100k and d_model=4096:
# Incorrect: ~1.9B params (without embeddings)
# Correct: ~2.5B params (with embeddings)
# 25% underestimate!

# At scale:
# 70B parameters might actually be 75B if vocab is larger
# This affects compute calculations significantly

# FIX: Always include all parameter sources
def complete_param_estimate(d_model, n_layers, vocab_size):
    """
    Estimate parameters including every major source.

    Args:
        d_model: Hidden size of the model.
        n_layers: Number of transformer layers.
        vocab_size: Vocabulary size.

    Returns:
        Sum of the attention, FFN, layer-norm, embedding and lm_head
        parameter counts.
    """
    d_ff = 4 * d_model  # 4x FFN expansion

    attention = 4 * d_model * d_model * n_layers  # Q, K, V, O projections
    ffn = 2 * d_model * d_ff * n_layers  # up- and down-projection
    layer_norms = 4 * d_model * n_layers  # 2 norms per layer, weight + bias
    transformer = attention + ffn + layer_norms

    embeddings = vocab_size * d_model  # embedding
    embeddings += vocab_size * d_model  # lm_head (often tied, but allocated)

    return transformer + embeddings
EXERCISE

Given a compute budget of 10²⁴ FLOPs, generate all model configurations that fit within this budget. Rank by estimated quality and create a table showing the tradeoff between parameter count and model depth.