18. End-to-End Optimization Project
This chapter combines the optimization techniques covered so far into a single end-to-end pipeline. Starting from a baseline model, we apply each technique in sequence and measure the impact of every change.
Project setup:
# Environment setup
conda create -n optimization python=3.10
conda activate optimization
# Install dependencies
pip install torch transformers accelerate
# The AWQ library is published on PyPI as "autoawq" ("awq" is only its import name).
# optimum is required by transformers to load GPTQ checkpoints.
pip install optimum auto-gptq autoawq vllm
# flash-attn builds against the torch that is already installed, so build isolation must be off
pip install flash-attn --no-build-isolation
# Verify GPU access
python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
# Expected: CUDA available: True
Baseline measurement:
# baseline_measurement.py
# Measures single-prompt generation latency of the unoptimized FP16 model.
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def benchmark(prompt, max_new_tokens=128):
    """Time one greedy generation and return ``(seconds, decoded_text)``.

    Args:
        prompt: Text to feed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        A tuple of the wall-clock seconds spent on tokenization plus
        generation, and the decoded first output sequence.
    """
    start = time.perf_counter()
    # Send the inputs to wherever device_map="auto" placed the model
    # instead of assuming a hard-coded "cuda" device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    # CUDA kernels run asynchronously; wait for them before stopping the clock.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    return elapsed, tokenizer.decode(outputs[0])


# Measure baseline
prompt = "Explain the concept of attention mechanisms in transformers."
# Warm-up run so one-time CUDA initialization is not counted as latency.
benchmark(prompt, max_new_tokens=8)
latency, _ = benchmark(prompt)
print(f"Baseline latency: {latency:.2f}s")
Step 1: Quantization with GPTQ:
# step1_quantize.py
# Quantizes the FP16 checkpoint to 4-bit GPTQ, saves it, and reloads it.
import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

base_model = "meta-llama/Llama-2-7b-hf"
quantized_dir = "llama-2-7b-gptq"

quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=True,
)
# AutoGPTQ receives the quantization settings when the FP16 model is loaded;
# quantize() itself only takes the calibration examples.
model = AutoGPTQForCausalLM.from_pretrained(
    base_model,
    quantize_config,
    torch_dtype=torch.float16,
    device_map="auto",
)
# NOTE(review): get_calibration_samples() is not defined in this snippet.
# AutoGPTQ presumably expects a list of tokenized examples
# (dicts with "input_ids" and "attention_mask") - confirm before running.
model.quantize(get_calibration_samples())
model.save_quantized(quantized_dir, use_safetensors=True)
# Store the tokenizer next to the weights so the directory is self-contained
# for any engine that loads it by path.
AutoTokenizer.from_pretrained(base_model).save_pretrained(quantized_dir)

# Measure improved latency
model = AutoGPTQForCausalLM.from_quantized(
    quantized_dir,
    device="cuda:0",
    use_safetensors=True,
)
# Expected: 2-3x speedup, ~70% memory reduction
Step 2: Flash Attention integration:
# step2_flash_attn.py
# Reloads the quantized checkpoint with FlashAttention-2 as the attention backend.
import torch
from transformers import AutoModelForCausalLM

# NOTE(review): loading a GPTQ checkpoint through transformers presumably
# needs the optimum and auto-gptq packages installed - confirm.
model = AutoModelForCausalLM.from_pretrained(
    "llama-2-7b-gptq",
    device_map="auto",
    # FlashAttention-2 only supports fp16/bf16, so pin the dtype explicitly.
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # Enable Flash Attention
)
# Expected: Additional 10-20% speedup, lower memory for long contexts
Step 3: vLLM serving:
# step3_vllm.py
# Serves the quantized model with vLLM and measures batched generation throughput.
import time

from vllm import LLM, SamplingParams

llm = LLM(
    model="llama-2-7b-gptq",
    gpu_memory_utilization=0.90,
    # Llama-2 has a 4096-token context window; vLLM rejects a larger
    # max_model_len than the model config supports.
    max_model_len=4096,
    enable_prefix_caching=True,
    max_num_seqs=256,
)
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)

# Batch throughput test
prompt = "Explain the concept of attention mechanisms in transformers."
prompts = [prompt] * 64
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
elapsed = time.perf_counter() - start
# Count the tokens actually produced: sequences may stop at EOS before
# reaching max_tokens, so 64 * 512 would overstate throughput.
generated_tokens = sum(
    len(completion.token_ids)
    for request_output in outputs
    for completion in request_output.outputs
)
throughput = generated_tokens / elapsed  # tokens per second
print(f"Batch throughput: {throughput:.0f} tokens/s")
Step 4: Speculative decoding:
# step4_speculative.py
# Runs a 70B target model with a 7B draft model for speculative decoding.
from vllm import LLM, SamplingParams

prompt = "Explain the concept of attention mechanisms in transformers."
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=512,
)

# NOTE(review): speculative_model / num_speculative_tokens are keyword names
# from older vLLM releases; newer releases may expect a speculative_config
# dict instead - confirm against the installed version.
llm = LLM(
    model="meta-llama/Llama-2-70b-hf",
    speculative_model="meta-llama/Llama-2-7b-hf",
    num_speculative_tokens=4,
    tensor_parallel_size=2,
)
outputs = llm.generate([prompt] * 16, sampling_params)
# Expected: 2-3x speedup if draft model has > 60% acceptance rate
Optimization results tracking:
# optimization_log.py
# Records the measurements from each optimization step and prints the overall gains.
import json

results = {
    "baseline": {"latency": 4.2, "memory_gb": 14},
    "gptq_4bit": {"latency": 1.8, "memory_gb": 4.5},
    "flash_attn": {"latency": 1.4, "memory_gb": 4.2},
    "vllm_serving": {"latency": 1.2, "throughput": 1500},
    "speculative_decoding": {"latency": 0.6, "throughput": 3500},
}

# Calculate total improvement
baseline_latency = results["baseline"]["latency"]
final_latency = results["speculative_decoding"]["latency"]
improvement = baseline_latency / final_latency

# Read the baseline memory from the table instead of repeating the literal,
# so the ratio stays correct when the baseline entry is updated.
baseline_memory_gb = results["baseline"]["memory_gb"]
memory_reduction = baseline_memory_gb / results["gptq_4bit"]["memory_gb"]

print(f"Total improvement: {improvement:.1f}x faster")
print(f"Memory reduction: {memory_reduction:.1f}x smaller")
Production deployment checklist:
# deployment_checklist.yaml
# Sign-off checklist for shipping the optimized model.

# Each optimization step from the pipeline has been applied and tested.
optimization_complete:
  - quantization_applied: true
  - flash_attention_enabled: true
  - serving_engine_configured: true
  - speculative_decoding_tested: true

# Targets the deployed system is expected to meet; nested under this key
# so they load as one mapping rather than as unrelated top-level entries.
performance_targets:
  latency_p50: "< 50ms"
  latency_p99: "< 200ms"
  throughput: "> 1000 tokens/s"

# Quality and resource checks against the unoptimized baseline.
validation:
  - perplexity_within_5%_of_baseline: true
  - task_accuracy_within_3%_of_baseline: true
  - memory_within_budget: true

# Runtime signals that must be collected in production.
monitoring:
  - gpu_utilization_tracked: true
  - cache_hit_rates_monitored: true
  - latency_percentiles_logged: true
Build a complete optimization pipeline for a model relevant to your use case. Document each step's impact on latency, memory, and accuracy. Deploy the optimized system and validate performance under realistic load. This course provides the complete foundation for optimizing local LLM inference. The techniques build upon each other: quantization reduces memory requirements, enabling longer contexts; Flash Attention and PagedAttention improve memory efficiency; speculative decoding and batching maximize hardware utilization. Combined, these optimizations transform impractical deployments into production-ready systems.