21. Model Sizing
Chapter 21 of 24 · 30 min
Model sizing determines the optimal balance between parameters, compute, and quality. Wrong sizing wastes resources or produces underperforming models.
Determining Optimal Model Size
def estimate_model_size(d_model, n_layers, vocab_size, n_kv_heads=None, n_heads=32,
                        d_ff=None):
    """
    Estimate total parameters for a decoder-only transformer.

    Counts the token embedding, bias-free Q/K/V/O attention projections, a
    two-matrix FFN and weight-only norms. An output (LM head) projection is
    not counted.

    Args:
        d_model: Hidden size.
        n_layers: Number of transformer layers.
        vocab_size: Number of rows in the embedding table.
        n_kv_heads: Number of key/value heads. ``None`` (or 0) means one
            per query head, i.e. standard multi-head attention.
        n_heads: Number of query heads. ``head_dim`` is
            ``d_model // n_heads``, so a ``d_model`` that is not a multiple
            of ``n_heads`` is floored.
        d_ff: FFN hidden width. Defaults to ``4 * d_model``.

    Returns:
        Dict with the keys 'embedding', 'attention', 'ffn', 'layer_norms'
        and 'total' (the sum of the other four).
    """
    head_dim = d_model // n_heads
    kv_heads = n_kv_heads if n_kv_heads else n_heads
    ffn_width = d_ff if d_ff is not None else d_model * 4

    # Embedding table
    embedding_params = vocab_size * d_model

    # Attention projections, per layer:
    #   Q: d_model -> n_heads * head_dim
    #   K: d_model -> kv_heads * head_dim
    #   V: d_model -> kv_heads * head_dim
    #   O: n_heads * head_dim -> d_model
    q_params = n_heads * d_model * head_dim
    kv_params = 2 * kv_heads * d_model * head_dim
    o_params = n_heads * head_dim * d_model
    attn_params = (q_params + kv_params + o_params) * n_layers

    # FFN: up projection + down projection
    ffn_params = (d_model * ffn_width + ffn_width * d_model) * n_layers

    # Norms: 2 per layer + 1 final, d_model weights each. The final norm must
    # use the same weight-only convention as the per-layer norms.
    ln_params = (2 * d_model * n_layers) + d_model

    return {
        'embedding': embedding_params,
        'attention': attn_params,
        'ffn': ffn_params,
        'layer_norms': ln_params,
        'total': embedding_params + attn_params + ffn_params + ln_params,
    }
def format_params(n):
    """Format a parameter count as a short human-readable string.

    Uses one decimal place with a T / B / M suffix (1e12 / 1e9 / 1e6).
    Values whose magnitude is below one million are printed as a rounded
    integer with no suffix. Negative values keep their sign and are scaled
    by magnitude.

    Args:
        n: Parameter count (int or float).

    Returns:
        The formatted string, e.g. ``"6.7B"``.
    """
    units = ((1e12, "T"), (1e9, "B"), (1e6, "M"))
    magnitude = abs(n)
    sign = "-" if n < 0 else ""

    for idx, (scale, suffix) in enumerate(units):
        if magnitude < scale:
            continue
        scaled = round(magnitude / scale, 1)
        # A value just under the next unit would otherwise print as
        # "1000.0M" / "1000.0B"; promote it to the larger unit instead.
        if scaled >= 1000 and idx > 0:
            scale, suffix = units[idx - 1]
            scaled = round(magnitude / scale, 1)
        return f"{sign}{scaled:.1f}{suffix}"

    return f"{n:.0f}"
# Example: Llama-7B-style dimensions (d_model=4096, 32 layers, 32 query heads)
# with 8 KV heads, i.e. grouped-query attention.
# NOTE(review): the released Llama 7B uses one KV head per query head (32);
# confirm whether n_kv_heads=8 is intended here.
size = estimate_model_size(4096, 32, 32000, n_kv_heads=8, n_heads=32)
print(f"Total parameters: {format_params(size['total'])}")
Compute Budget Allocation
def allocate_compute_budget(total_flops: float, config: dict) -> dict:
    """
    Allocate compute budget across architecture components.

    The budget is split between attention and FFN in proportion to a rough
    cost model: ``4 * d_model**2`` per layer for attention and
    ``6 * d_model * d_ff`` per layer for the FFN. Sequence length and the
    forward/backward multiplier scale both terms equally, so they drop out
    of the split and are not modelled here.

    Args:
        total_flops: Total FLOPs to distribute.
        config: Architecture description. Must contain 'n_layers' and
            'd_model'; 'd_ff' is optional and defaults to ``4 * d_model``.

    Returns:
        Dict with 'attention_flops', 'ffn_flops' and 'ratio'
        (attention cost divided by FFN cost).

    Raises:
        KeyError: If 'n_layers' or 'd_model' is missing from ``config``.
        ZeroDivisionError: If the modelled costs are zero (for example
            ``n_layers == 0`` or ``d_ff == 0``).
    """
    n_layers = config['n_layers']
    d_model = config['d_model']
    d_ff = config.get('d_ff', d_model * 4)

    # Whole-model relative costs (already multiplied by n_layers).
    attn_cost = 4 * d_model * d_model * n_layers
    ffn_cost = 6 * d_model * d_ff * n_layers

    attn_fraction = attn_cost / (attn_cost + ffn_cost)

    return {
        'attention_flops': total_flops * attn_fraction,
        'ffn_flops': total_flops * (1 - attn_fraction),
        'ratio': attn_cost / ffn_cost,
    }
def optimize_size_for_budget(compute_budget: float,
                             target_tokens: int,
                             quality_weight: float = 0.5,
                             vocab_size: int = 32000) -> list:
    """
    Rank candidate model sizes that fit within a training compute budget.

    A fixed grid of (n_layers, d_model) pairs is evaluated. A candidate is
    kept when ``6 * params * target_tokens`` does not exceed
    ``compute_budget``.

    Args:
        compute_budget: Available training FLOPs.
        target_tokens: Number of training tokens; must be positive.
        quality_weight: Currently unused by the ranking; accepted for
            interface compatibility.
        vocab_size: Vocabulary size used for the parameter estimate.

    Returns:
        List of candidate dicts ('n_layers', 'd_model', 'params',
        'quality_estimate', 'efficiency', 'compute_utilization'), sorted by
        'efficiency' in descending order. Empty if nothing fits.

    Raises:
        ValueError: If ``target_tokens`` is not positive.
    """
    if target_tokens <= 0:
        # A zero token count would make utilization 0 and divide by zero.
        raise ValueError("target_tokens must be positive")

    candidates = []
    for n_layers in [8, 12, 16, 24, 32, 40]:
        for d_model in [2048, 2560, 3072, 4096, 5120]:
            # Estimate parameters
            params = estimate_model_size(d_model, n_layers, vocab_size)['total']

            # Training compute: 6 * params * tokens
            required_compute = 6 * params * target_tokens
            if required_compute > compute_budget:
                continue

            utilization = required_compute / compute_budget
            # Quality estimate (simplified): diminishing returns in params.
            quality = (params / 1e9) ** 0.2
            # NOTE(review): quality grows as params**0.2 while utilization
            # grows linearly in params, so this score favours the smallest
            # candidate that fits.
            efficiency = quality / utilization

            candidates.append({
                'n_layers': n_layers,
                'd_model': d_model,
                'params': params,
                'quality_estimate': quality,
                'efficiency': efficiency,
                'compute_utilization': utilization,
            })

    return sorted(candidates, key=lambda x: x['efficiency'], reverse=True)
Depth vs Width Tradeoffs
def compare_depth_width(total_params: float,
                        tokenizer_vocab: int = 32000) -> list:
    """
    Compare a fixed set of depth/width configurations.

    Four configurations are derived from a 4096-wide, 32-layer baseline.
    For ``depth_ratio <= 1`` the width stays at 4096 and the layer count is
    scaled by the ratio; above 1, width and depth are both scaled by
    ``sqrt(depth_ratio)``.

    Args:
        total_params: Currently unused; the configurations are not solved
            to match a parameter budget. Accepted for interface
            compatibility.
        tokenizer_vocab: Vocabulary size used for the parameter estimate.

    Returns:
        List of dicts with 'd_model', 'n_layers', 'params', 'depth_ratio',
        'quality_estimate' and 'memory_per_token_gb'.
    """
    configurations = []

    for depth_ratio in [0.5, 1.0, 1.5, 2.0]:
        if depth_ratio <= 1.0:
            d_model = 4096
            n_layers = int(32 * depth_ratio)
        else:
            d_model = int(4096 * (depth_ratio ** 0.5))
            n_layers = int(32 * depth_ratio ** 0.5)

        params = estimate_model_size(d_model, n_layers, tokenizer_vocab)['total']

        # Rough quality estimate: depth helps more for reasoning,
        # width helps more for knowledge storage.
        quality_depth = n_layers ** 0.1
        quality_width = d_model ** 0.05

        configurations.append({
            'd_model': d_model,
            'n_layers': n_layers,
            'params': params,
            'depth_ratio': depth_ratio,
            'quality_estimate': quality_depth * quality_width,
            # Per-token figure: request a single position. The vocabulary
            # size must not be passed here; the third argument is seq_len.
            'memory_per_token_gb': estimate_kv_memory(d_model, n_layers, seq_len=1),
        })

    return configurations
def estimate_kv_memory(d_model, n_layers, seq_len, n_kv_heads=8, head_dim=128,
                       bytes_per_element=2):
    """Estimate KV cache memory in GB (1e9 bytes) for ``seq_len`` tokens.

    Pass ``seq_len=1`` for the per-token footprint.

    Args:
        d_model: Unused; the cache size depends only on the KV head layout.
        n_layers: Number of transformer layers.
        seq_len: Number of cached token positions.
        n_kv_heads: Key/value heads per layer.
        head_dim: Dimension of each head.
        bytes_per_element: Storage size of one cached value (2 for bf16).

    Returns:
        Cache size in GB as a float.
    """
    # Per token: 2 tensors (K and V) per layer, each n_kv_heads * head_dim
    # elements. The K+V factor of 2 is applied exactly once.
    bytes_per_token = 2 * n_layers * n_kv_heads * head_dim * bytes_per_element
    return bytes_per_token * seq_len / 1e9
Failure Mode: Ignoring Vocabulary Size
# BUG: Forgetting embedding parameters
def incorrect_param_estimate(d_model, n_layers):
    """Anti-example: count transformer-block weights only.

    Deliberately omits the embedding table and LM head to illustrate the
    failure mode. Do not use this for real sizing.
    """
    # Only counting transformer params:
    # 4 * d^2 (attention) + 8 * d^2 (FFN with 4x expansion) = 12 * d^2 per layer
    return 12 * d_model * d_model * n_layers


# For vocab=100k, d_model=4096 and n_layers=12:
# Incorrect: ~2.4B params (without embeddings)
# Correct: ~3.2B params (with embeddings and LM head)
# 25% underestimate!
# At scale:
# 70B parameters might actually be 75B if vocab is larger
# This affects compute calculations significantly
# FIX: Always include all parameter sources
def complete_param_estimate(d_model, n_layers, vocab_size):
    """Estimate parameters including embeddings and the LM head.

    Args:
        d_model: Hidden size.
        n_layers: Number of transformer layers.
        vocab_size: Vocabulary size.

    Returns:
        Total parameter count: attention + FFN + layer norms + input
        embedding + LM head.
    """
    transformer = 4 * d_model * d_model * n_layers  # attention: Q, K, V, O
    transformer += 2 * d_model * (d_model * 4) * n_layers  # ffn: up + down, 4x expansion
    transformer += 4 * d_model * n_layers  # layer norms: 2 per layer, weight + bias
    embeddings = vocab_size * d_model  # embedding
    embeddings += vocab_size * d_model  # lm_head (often tied, but allocated)
    return transformer + embeddings
EXERCISE
Given a compute budget of 10²⁴ FLOPs, generate all model configurations that fit within this budget. Rank by estimated quality and create a table showing the tradeoff between parameter count and model depth.