18. End-to-End Optimization Project

Chapter 18 of 18 · 25 min

This chapter combines the optimization techniques covered in the previous chapters into a single end-to-end pipeline. Starting from a baseline model, we apply each technique in sequence and measure the impact of every change.

Project setup:

# Environment setup
conda create -n optimization python=3.10
conda activate optimization

# Install dependencies
pip install torch transformers accelerate
# The AWQ toolkit is published on PyPI as "autoawq" (not "awq").
# optimum is required for loading GPTQ checkpoints through transformers (Step 2).
pip install auto-gptq autoawq optimum vllm
# flash-attn compiles against the torch that is already installed,
# so build isolation has to be disabled.
pip install flash-attn --no-build-isolation

# Verify GPU access
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
# Expected output: CUDA available: True

Baseline measurement:

# baseline_measurement.py
import time

import torch  # needed for torch.float16 below and torch.no_grad() in benchmark()
from transformers import AutoModelForCausalLM, AutoTokenizer

# The FP16 checkpoint is the unoptimized reference that later steps are
# compared against.
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def benchmark(prompt, max_new_tokens=128):
    """Time one greedy generation for ``prompt``.

    The timed region covers tokenization, the host-to-device copy and
    ``model.generate``; decoding the output ids back to text is excluded.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A ``(elapsed_seconds, decoded_text)`` tuple.
    """
    use_cuda = torch.cuda.is_available()

    # CUDA kernels run asynchronously: drain pending work before starting the
    # clock, and again before stopping it, so only this call is measured.
    if use_cuda:
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start = time.perf_counter()

    # Follow the model's placement instead of hard-coding "cuda", since the
    # model is loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding keeps runs comparable
        )

    if use_cuda:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    return elapsed, tokenizer.decode(outputs[0])

# Measure baseline
prompt = "Explain the concept of attention mechanisms in transformers."

# Warm-up: the first generate() call pays one-time costs (CUDA context and
# kernel initialization, allocator growth) that would inflate the baseline.
benchmark(prompt, max_new_tokens=8)

latency, _ = benchmark(prompt)
print(f"Baseline latency: {latency:.2f}s")

Step 1: Quantization with GPTQ:

# step1_quantize.py
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

base_model = "meta-llama/Llama-2-7b-hf"
quantized_dir = "llama-2-7b-gptq"

tokenizer = AutoTokenizer.from_pretrained(base_model)


def get_calibration_samples():
    """Return tokenized calibration examples for GPTQ.

    Placeholder data: replace these sentences with a few hundred samples
    that are representative of your real workload, because the quantization
    error is minimized on exactly this data.
    """
    texts = [
        "Attention mechanisms let a transformer weigh every token against every other token.",
        "Quantization stores model weights in fewer bits to reduce memory use.",
    ]
    return [tokenizer(text) for text in texts]


quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=True,
)

# AutoGPTQ takes the quantization config when the model is loaded;
# quantize() itself only receives the calibration examples.
model = AutoGPTQForCausalLM.from_pretrained(
    base_model,
    quantize_config,
    torch_dtype=torch.float16,
)

model.quantize(get_calibration_samples())
model.save_quantized(quantized_dir)
# Store the tokenizer next to the weights so the directory can be loaded
# on its own by later steps (transformers, vLLM).
tokenizer.save_pretrained(quantized_dir)

# Measure improved latency: reload the quantized checkpoint, then re-run the
# benchmark from baseline_measurement.py against this model.
model = AutoGPTQForCausalLM.from_quantized(quantized_dir, device="cuda:0")
# Expected: 2-3x speedup, ~70% memory reduction

Step 2: Flash Attention integration:

# step2_flash_attn.py
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "llama-2-7b-gptq",
    device_map="auto",
    torch_dtype=torch.float16,  # FlashAttention-2 kernels only support fp16/bf16
    attn_implementation="flash_attention_2",  # Enable Flash Attention
)
# Expected: Additional 10-20% speedup, lower memory for long contexts

Step 3: vLLM serving:

# step3_vllm.py
import time

from vllm import LLM, SamplingParams

llm = LLM(
    model="llama-2-7b-gptq",
    gpu_memory_utilization=0.90,
    # Llama-2 was trained with a 4096-token context window; vLLM rejects a
    # max_model_len larger than what the model config supports.
    max_model_len=4096,
    enable_prefix_caching=True,
    max_num_seqs=256,
)

sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)

# Batch throughput test
prompt = "Explain the concept of attention mechanisms in transformers."
prompts = [prompt] * 64
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.perf_counter() - start

# Count the tokens that were actually produced: a request can stop at EOS
# before reaching max_tokens, so 64 * 512 would overstate throughput.
generated_tokens = sum(len(request.outputs[0].token_ids) for request in outputs)
throughput = generated_tokens / elapsed  # tokens per second
print(f"Batch throughput: {throughput:.0f} tokens/s")

Step 4: Speculative decoding:

# step4_speculative.py
from vllm import LLM, SamplingParams

# NOTE(review): `speculative_model` / `num_speculative_tokens` are the keyword
# names used by older vLLM releases; newer releases take a single
# `speculative_config` dict instead - confirm against the installed version.
# NOTE(review): the FP16 70B target alone needs roughly 140 GB of weights, so
# check that tensor_parallel_size=2 leaves room for the draft model and the
# KV cache on your GPUs.
llm = LLM(
    model="meta-llama/Llama-2-70b-hf",
    speculative_model="meta-llama/Llama-2-7b-hf",  # small draft model that proposes tokens
    num_speculative_tokens=4,  # tokens proposed per verification step
    tensor_parallel_size=2,
)

# Same prompt and sampling settings as the vLLM serving step, repeated here
# so the script runs on its own.
prompt = "Explain the concept of attention mechanisms in transformers."
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)

outputs = llm.generate([prompt] * 16, sampling_params)
# Expected: 2-3x speedup if draft model has > 60% acceptance rate

Optimization results tracking:

# optimization_log.py
import json

# Measurements recorded after each optimization step. Not every step reports
# the same metrics: the serving steps track throughput instead of memory.
results = {
    "baseline": {"latency": 4.2, "memory_gb": 14},
    "gptq_4bit": {"latency": 1.8, "memory_gb": 4.5},
    "flash_attn": {"latency": 1.4, "memory_gb": 4.2},
    "vllm_serving": {"latency": 1.2, "throughput": 1500},
    "speculative_decoding": {"latency": 0.6, "throughput": 3500},
}

# Calculate total improvement
baseline_latency = results["baseline"]["latency"]
final_latency = results["speculative_decoding"]["latency"]
improvement = baseline_latency / final_latency

# Read the baseline footprint from the table instead of repeating the literal,
# so the ratio stays correct when the baseline entry is updated.
baseline_memory_gb = results["baseline"]["memory_gb"]
memory_reduction = baseline_memory_gb / results["gptq_4bit"]["memory_gb"]

print(f"Total improvement: {improvement:.1f}x faster")
print(f"Memory reduction: {memory_reduction:.1f}x smaller")

Production deployment checklist:

# deployment_checklist.yaml
# Sign-off checklist to complete before the optimized model goes to production.
# Note: the checklist sections are lists of single-key mappings, while
# performance_targets is a plain mapping.

# One entry per optimization step applied in this chapter.
optimization_complete:
  - quantization_applied: true
  - flash_attention_enabled: true
  - serving_engine_configured: true
  - speculative_decoding_tested: true
  
# Targets are stored as human-readable strings, not machine-checkable values.
# NOTE(review): it is not stated whether the latency targets are per token or
# per request - confirm before comparing them with measured latencies.
performance_targets:
  latency_p50: "< 50ms"
  latency_p99: "< 200ms"
  throughput: "> 1000 tokens/s"
  
# Quality and resource checks relative to the unoptimized baseline.
validation:
  - perplexity_within_5%_of_baseline: true
  - task_accuracy_within_3%_of_baseline: true
  - memory_within_budget: true
  
# Runtime signals that should be tracked once the system is deployed.
monitoring:
  - gpu_utilization_tracked: true
  - cache_hit_rates_monitored: true
  - latency_percentiles_logged: true
EXERCISE

Build a complete optimization pipeline for a model relevant to your use case. Document each step's impact on latency, memory, and accuracy. Deploy the optimized system and validate performance under realistic load. This course provides the complete foundation for optimizing local LLM inference. The techniques build upon each other: quantization reduces memory requirements, enabling longer contexts; Flash Attention and PagedAttention improve memory efficiency; speculative decoding and batching maximize hardware utilization. Combined, these optimizations transform impractical deployments into production-ready systems.