End-to-End Optimization Project — Model Optimization for Local Inference (Chapter 18)

This chapter combines all of the optimization techniques into a single end-to-end pipeline. Starting from a baseline model, we apply the techniques one at a time and measure the impact of each change.

Project setup:

# Environment setup
conda create -n optimization python=3.10
conda activate optimization

# Install dependencies
# optimum is needed for transformers to load GPTQ checkpoints via from_pretrained
pip install torch transformers accelerate optimum
# The AutoAWQ project is published on PyPI as "autoawq"
pip install auto-gptq autoawq vllm
# flash-attn builds against the torch that is already installed,
# so it must be installed without pip's build isolation
pip install flash-attn --no-build-isolation

# Verify GPU access
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
# Expected: CUDA available: True

Baseline measurement:

# baseline_measurement.py
import time

import torch  # required for torch.float16 below and torch.no_grad() in benchmark()
from transformers import AutoModelForCausalLM, AutoTokenizer

# Unoptimized reference model: fp16 weights, placement decided by device_map="auto".
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def benchmark(prompt, max_new_tokens=128):
    """Time one greedy generation of *prompt* with the module-level model.

    Args:
        prompt: Text passed to the module-level ``tokenizer``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A ``(elapsed_seconds, text)`` tuple. ``elapsed_seconds`` covers
        tokenization plus generation; ``text`` is the decoded first
        output sequence.
    """
    start = time.perf_counter()  # monotonic, high-resolution clock
    # Follow the model's own device: it was placed with device_map="auto",
    # so hard-coding "cuda" is not guaranteed to match.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # CUDA kernels run asynchronously; wait for them before stopping the
    # clock so the reading includes all queued GPU work.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    elapsed = time.perf_counter() - start
    return elapsed, tokenizer.decode(outputs[0])

# Measure baseline
prompt = "Explain the concept of attention mechanisms in transformers."

# Warm-up: the first generate() call pays one-off costs (CUDA context and
# kernel initialization), which would otherwise inflate the baseline.
benchmark(prompt, max_new_tokens=8)

latency, _ = benchmark(prompt)
print(f"Baseline latency: {latency:.2f}s")

Step 1: Quantization with GPTQ:

# step1_quantize.py
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "meta-llama/Llama-2-7b-hf"
quantized_dir = "llama-2-7b-gptq"

quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=True,
)

# AutoGPTQ takes the quantization config when the full-precision model is
# loaded, not when quantize() is called.
model = AutoGPTQForCausalLM.from_pretrained(
    base_model,
    quantize_config,
    torch_dtype=torch.float16,
)

# get_calibration_samples() is not defined in this listing. quantize()
# expects a list of tokenized examples (dicts with "input_ids" and
# "attention_mask").
model.quantize(get_calibration_samples())
model.save_quantized(quantized_dir)
# Store the tokenizer next to the weights so the directory can be loaded
# by path later on (Step 3 passes this directory to vLLM).
AutoTokenizer.from_pretrained(base_model).save_pretrained(quantized_dir)

# Measure improved latency
# Loading a GPTQ checkpoint through transformers requires the `optimum`
# package in addition to auto-gptq.
model = AutoModelForCausalLM.from_pretrained(
    quantized_dir,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Expected: 2-3x speedup, ~70% memory reduction

Step 2: Flash Attention integration:

# step2_flash_attn.py
import torch
from transformers import AutoModelForCausalLM

# Reload the quantized checkpoint with the FlashAttention-2 attention backend.
model = AutoModelForCausalLM.from_pretrained(
    "llama-2-7b-gptq",
    device_map="auto",
    torch_dtype=torch.float16,  # FlashAttention-2 only supports fp16/bf16
    attn_implementation="flash_attention_2",  # Enable Flash Attention
)
# Expected: Additional 10-20% speedup, lower memory for long contexts

Step 3: vLLM serving:

# step3_vllm.py
import time

from vllm import LLM, SamplingParams

llm = LLM(
    model="llama-2-7b-gptq",
    gpu_memory_utilization=0.90,
    # Llama-2 has a 4096-token context window; vLLM rejects a max_model_len
    # larger than the model's own limit.
    max_model_len=4096,
    enable_prefix_caching=True,
    max_num_seqs=256,
)

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)

# Batch throughput test
prompt = "Explain the concept of attention mechanisms in transformers."
prompts = [prompt] * 64
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.perf_counter() - start

# Count the tokens that were actually generated: a request may stop at EOS
# before max_tokens, so 64 * 512 would overstate the throughput.
generated_tokens = sum(len(request.outputs[0].token_ids) for request in outputs)
throughput = generated_tokens / elapsed  # tokens per second
print(f"Batch throughput: {throughput:.0f} tokens/s")

Step 4: Speculative decoding:

# step4_speculative.py
from vllm import LLM, SamplingParams

# The 7B model drafts tokens that the 70B target model then verifies.
# NOTE(review): speculative_model / num_speculative_tokens are accepted as
# LLM keyword arguments only by some vLLM releases; others expect a
# speculative_config dict instead. Confirm against the installed version.
llm = LLM(
    model="meta-llama/Llama-2-70b-hf",
    speculative_model="meta-llama/Llama-2-7b-hf",
    num_speculative_tokens=4,
    tensor_parallel_size=2,
)

# Same prompt and sampling settings as the Step 3 throughput test, repeated
# here so this script runs on its own.
prompt = "Explain the concept of attention mechanisms in transformers."
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)

outputs = llm.generate([prompt] * 16, sampling_params)
# Expected: 2-3x speedup if draft model has > 60% acceptance rate

Optimization results tracking:

# optimization_log.py
import json

# Measurements recorded after each optimization step, keyed by step name.
# NOTE(review): units are not stated here; presumably latency is in seconds
# and throughput in tokens/s, matching the earlier print statements. Confirm.
results = {
    "baseline": {"latency": 4.2, "memory_gb": 14},
    "gptq_4bit": {"latency": 1.8, "memory_gb": 4.5},
    "flash_attn": {"latency": 1.4, "memory_gb": 4.2},
    "vllm_serving": {"latency": 1.2, "throughput": 1500},
    "speculative_decoding": {"latency": 0.6, "throughput": 3500},
}

# Calculate total improvement
baseline = results["baseline"]
baseline_latency = baseline["latency"]
final_latency = results["speculative_decoding"]["latency"]
improvement = baseline_latency / final_latency

# Read the baseline footprint from the table rather than repeating the
# literal 14, so the ratio stays correct if the baseline entry is updated.
memory_reduction = baseline["memory_gb"] / results["gptq_4bit"]["memory_gb"]

print(f"Total improvement: {improvement:.1f}x faster")
print(f"Memory reduction: {memory_reduction:.1f}x smaller")

Production deployment checklist:

# deployment_checklist.yaml
# Sign-off checklist for deploying the optimized model. Checklist items are
# boolean flags; performance targets are quoted strings.

# One flag per optimization step applied in this chapter (Steps 1-4).
optimization_complete:
  - quantization_applied: true
  - flash_attention_enabled: true
  - serving_engine_configured: true
  - speculative_decoding_tested: true
  
# Latency and throughput targets, stored as quoted strings rather than numbers.
# NOTE(review): it is not stated whether the latency targets are per token
# or per request; confirm before comparing against measured latencies.
performance_targets:
  latency_p50: "< 50ms"
  latency_p99: "< 200ms"
  throughput: "> 1000 tokens/s"
  
# Quality and resource checks relative to the unoptimized baseline.
validation:
  - perplexity_within_5%_of_baseline: true
  - task_accuracy_within_3%_of_baseline: true
  - memory_within_budget: true
  
# Runtime signals that should be tracked once the model is deployed.
monitoring:
  - gpu_utilization_tracked: true
  - cache_hit_rates_monitored: true
  - latency_percentiles_logged: true