Model Sizing — Custom LLM Architecture Design (Chapter 21)

Model sizing determines the optimal balance between parameters, compute, and quality. Wrong sizing wastes resources or produces underperforming models.

Determining Optimal Model Size

def estimate_model_size(d_model, n_layers, vocab_size, n_kv_heads=None, n_heads=32):
    """
    Approximate the parameter count of a decoder-only transformer.

    Args:
        d_model: Hidden size of the model.
        n_layers: Number of transformer layers.
        vocab_size: Number of rows in the token embedding table.
        n_kv_heads: Number of key/value heads; a falsy value (None or 0)
            falls back to ``n_heads``.
        n_heads: Number of query heads. The per-head width is
            ``d_model // n_heads`` (floor division).

    Returns:
        A dict with the keys 'embedding', 'attention', 'ffn', 'layer_norms'
        and 'total' (the sum of the other four).
    """
    per_head = d_model // n_heads
    num_kv = n_kv_heads or n_heads

    breakdown = {}

    # Token embedding table: one d_model-wide row per vocabulary entry.
    breakdown['embedding'] = vocab_size * d_model

    # Attention projections for a single layer. K and V share one shape,
    # which shrinks relative to Q when fewer KV heads are used.
    q_proj = n_heads * d_model * per_head
    kv_proj = num_kv * d_model * per_head
    o_proj = n_heads * per_head * d_model
    breakdown['attention'] = (q_proj + kv_proj + kv_proj + o_proj) * n_layers

    # Feed-forward block: up- and down-projection with a 4x hidden expansion.
    hidden = d_model * 4
    up_proj = d_model * hidden
    down_proj = hidden * d_model
    breakdown['ffn'] = (up_proj + down_proj) * n_layers

    # Norms: 2 * d_model per layer, plus 2 * d_model for the final norm.
    breakdown['layer_norms'] = (2 * d_model * n_layers) + (2 * d_model)

    # 'total' is inserted last so it only sums the four component counts.
    breakdown['total'] = sum(breakdown.values())
    return breakdown

def format_params(n):
    """Render a parameter count compactly, e.g. 5.8B, 2.5M or 999."""
    # Largest unit first; the first threshold that n reaches wins.
    units = ((1e12, "T"), (1e9, "B"), (1e6, "M"))
    for scale, suffix in units:
        if n >= scale:
            return f"{n/scale:.1f}{suffix}"
    # Below one million: print the count itself with no decimals.
    return f"{n:.0f}"

# Example: Llama 7B (d_model=4096, 32 layers, 32 attention heads, 32k vocab).
# Llama 7B uses standard multi-head attention, so the number of KV heads
# equals the number of query heads (32). Passing 8 KV heads would describe a
# grouped-query-attention variant and under-count the K/V projections.
size = estimate_model_size(
    d_model=4096,
    n_layers=32,
    vocab_size=32000,
    n_kv_heads=32,
    n_heads=32,
)
print(f"Total parameters: {format_params(size['total'])}")

Compute Budget Allocation

def allocate_compute_budget(total_flops: float, config: dict) -> dict:
    """
    Split a total FLOP budget between the attention and FFN blocks.

    The split uses a fixed heuristic cost model: per layer and per token,
    attention is weighted as ``4 * d_model * d_model`` and the FFN as
    ``6 * d_model * d_ff``.

    Args:
        total_flops: Total compute budget to divide.
        config: Mapping with required keys 'n_layers' and 'd_model' and an
            optional 'd_ff' (defaults to ``4 * d_model``).

    Returns:
        A dict with:
            'attention_flops': share of ``total_flops`` assigned to attention,
            'ffn_flops': share assigned to the FFN,
            'ratio': attention share divided by FFN share.

    Raises:
        KeyError: If 'n_layers' or 'd_model' is missing from ``config``.
        ZeroDivisionError: If both cost terms are zero (e.g. ``d_model`` or
            ``n_layers`` is 0) or the FFN term is zero (``d_ff`` is 0).
    """
    n_layers = config['n_layers']
    d_model = config['d_model']
    d_ff = config.get('d_ff', d_model * 4)

    # Relative cost of each component summed over all layers. Sequence length
    # and any forward/backward training multiplier scale both terms equally,
    # so they cancel in the fraction below and are left out. n_layers cancels
    # as well; it is kept so the two terms read as whole-model totals.
    attn_cost = 4 * d_model * d_model * n_layers
    ffn_cost = 6 * d_model * d_ff * n_layers

    attn_fraction = attn_cost / (attn_cost + ffn_cost)
    ffn_fraction = 1 - attn_fraction

    return {
        'attention_flops': total_flops * attn_fraction,
        'ffn_flops': total_flops * ffn_fraction,
        'ratio': attn_fraction / ffn_fraction
    }

def optimize_size_for_budget(compute_budget: float,
                             target_tokens: int,
                             quality_weight: float = 0.5) -> list:
    """
    Rank candidate model shapes that fit within a training compute budget.

    A fixed grid of (n_layers, d_model) pairs is evaluated with a 32,000-token
    vocabulary. Training compute is estimated as ``6 * params * tokens``;
    shapes that exceed ``compute_budget`` are dropped.

    Args:
        compute_budget: Available training FLOPs.
        target_tokens: Number of training tokens; must be positive.
        quality_weight: Accepted for interface compatibility; it is not
            currently used in the ranking.

    Returns:
        A list of dicts (keys: 'n_layers', 'd_model', 'params',
        'quality_estimate', 'efficiency', 'compute_utilization'), sorted by
        'efficiency' from highest to lowest. The list is empty when no
        candidate fits the budget.

    Raises:
        ValueError: If ``target_tokens`` is not positive (the utilisation
            ratio would otherwise be zero or negative).
    """
    if target_tokens <= 0:
        raise ValueError("target_tokens must be positive")

    candidates = []

    for n_layers in [8, 12, 16, 24, 32, 40]:
        for d_model in [2048, 2560, 3072, 4096, 5120]:
            params = estimate_model_size(d_model, n_layers, 32000)['total']

            # Training compute: 6 * params * tokens
            required_compute = 6 * params * target_tokens

            if required_compute <= compute_budget:
                # Simplified quality proxy with diminishing returns in size.
                quality = (params / 1e9) ** 0.2

                # Fraction of the budget this candidate would consume.
                utilization = required_compute / compute_budget

                candidates.append({
                    'n_layers': n_layers,
                    'd_model': d_model,
                    'params': params,
                    'quality_estimate': quality,
                    'efficiency': quality / utilization,
                    'compute_utilization': utilization
                })

    return sorted(candidates, key=lambda x: x['efficiency'], reverse=True)

Depth vs Width Tradeoffs

def compare_depth_width(total_params: float,
                        tokenizer_vocab: int = 32000) -> list:
    """
    Build a table of depth/width configurations for side-by-side comparison.

    Four preset shapes are derived from a 4096-wide, 32-layer baseline:
    ratios <= 1.0 keep the width and scale only the layer count, ratios
    above 1.0 scale both width and depth by ``sqrt(ratio)``.

    Args:
        total_params: Accepted for interface compatibility; it is not
            currently used, so the returned configurations do NOT share a
            common parameter budget (see each entry's 'params').
        tokenizer_vocab: Vocabulary size used for the parameter estimate.

    Returns:
        A list of dicts, one per preset, with keys 'd_model', 'n_layers',
        'params', 'depth_ratio', 'quality_estimate' and
        'memory_per_token_gb'.
    """
    configurations = []

    for depth_ratio in [0.5, 1.0, 1.5, 2.0]:
        if depth_ratio <= 1.0:
            # Shallower variants of the baseline: same width, fewer layers.
            d_model = 4096
            n_layers = int(32 * depth_ratio)
        else:
            # Larger variants: grow width and depth together.
            d_model = int(4096 * (depth_ratio ** 0.5))
            n_layers = int(32 * depth_ratio ** 0.5)

        params = estimate_model_size(d_model, n_layers, tokenizer_vocab)['total']

        # Rough heuristic: depth is weighted more strongly than width.
        quality_depth = n_layers ** 0.1
        quality_width = d_model ** 0.05

        configurations.append({
            'd_model': d_model,
            'n_layers': n_layers,
            'params': params,
            'depth_ratio': depth_ratio,
            'quality_estimate': quality_depth * quality_width,
            # KV-cache cost of a single token: seq_len must be 1 here, not the
            # vocabulary size. The helper's default KV-head settings are used.
            'memory_per_token_gb': estimate_kv_memory(d_model, n_layers, 1)
        })

    return configurations

def estimate_kv_memory(d_model, n_layers, seq_len, n_kv_heads=8, head_dim=128,
                       bytes_per_element=2):
    """
    Estimate KV-cache memory in GB (1e9 bytes) for ``seq_len`` cached tokens.

    Pass ``seq_len=1`` to get the per-token cost.

    Args:
        d_model: Accepted for signature compatibility; the cache size is
            computed from ``n_kv_heads * head_dim`` and does not use it.
        n_layers: Number of transformer layers.
        seq_len: Number of tokens held in the cache.
        n_kv_heads: Number of key/value heads per layer.
        head_dim: Width of each head.
        bytes_per_element: Storage size of one cached value (2 for bf16/fp16).

    Returns:
        Cache size in GB as a float.
    """
    # Each token stores one K and one V vector (the factor 2) of
    # n_kv_heads * head_dim elements in every layer.
    bytes_per_token = 2 * n_layers * n_kv_heads * head_dim * bytes_per_element
    return bytes_per_token * seq_len / 1e9

Failure Mode: Ignoring Vocabulary Size

# BUG: Forgetting embedding parameters
def incorrect_param_estimate(d_model, n_layers):
    """
    Deliberately incomplete estimate: transformer blocks only, no embeddings.

    Kept as the counter-example for this failure mode. The only omission is
    the vocabulary-sized matrices (token embedding and lm_head).
    """
    # Per layer: 4 * d_model^2 for the Q, K, V, O projections plus
    # 8 * d_model^2 for a 4x-expansion FFN = 12 * d_model^2. The FFN is
    # already included in the factor 12, so it must not be added again.
    return 12 * d_model * d_model * n_layers

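# For vocab=100k and d_model=4096, each vocab-sized matrix adds
# 100,000 * 4096 ≈ 0.41B parameters (twice that with an untied lm_head).
# For a small model with ~1.9B transformer parameters:
# Incorrect: ~1.9B params (without embeddings)
# Correct: ~2.3B-2.7B params (with embeddings; tied vs. untied lm_head)
# Roughly an 18-30% underestimate!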

# At scale:
# A nominal 70B-parameter model might actually have closer to 75B parameters if its vocabulary is large
# This significantly affects compute calculations such as 6 * params * tokens

# FIX: Always include all parameter sources
def complete_param_estimate(d_model, n_layers, vocab_size):
    """
    Estimate total parameters including embeddings and the output head.

    Assumes standard multi-head attention and a two-matrix FFN with 4x
    expansion.

    Args:
        d_model: Hidden size of the model.
        n_layers: Number of transformer layers.
        vocab_size: Vocabulary size.

    Returns:
        Total parameter count: transformer blocks + layer norms + token
        embedding + lm_head.
    """
    # Q, K, V and O projections: four d_model x d_model matrices per layer.
    transformer = 4 * d_model * d_model * n_layers  # attention
    # Up- and down-projection with d_ff = 4 * d_model: 2 * d_model * 4 * d_model.
    transformer += 8 * d_model * d_model * n_layers  # ffn
    transformer += 4 * d_model * n_layers  # layer norms

    embeddings = vocab_size * d_model  # embedding
    embeddings += vocab_size * d_model  # lm_head (often tied, but allocated)

    return transformer + embeddings