24. Model Alignment Pipeline Project
Chapter 24 of 24 · 25 min
This final chapter integrates all previous concepts into a complete alignment pipeline project that you can implement from scratch.
Project Overview
Build a complete alignment pipeline that:
- Starts with a base language model
- Generates preference data
- Trains using DPO
- Evaluates alignment quality
Project Structure
alignment_project/
├── config/
│ ├── training_config.yaml
│ └── model_config.yaml
├── data/
│ ├── prompts.jsonl
│ └── preferences.jsonl
├── src/
│ ├── data_processing.py
│ ├── reward_model.py
│ ├── dpo_trainer.py
│ └── evaluator.py
├── scripts/
│ ├── generate_preferences.py
│ ├── train_alignment.py
│ └── evaluate.py
└── outputs/
└── aligned_model/
Step 1: Configuration
# config/training_config.yaml
# Hyperparameters for the DPO alignment run, grouped by pipeline stage.
model:
  name: "gpt2"  # Use smaller model for demonstration
  max_length: 512
training:
  method: "dpo"
  batch_size: 8
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-5` as a string instead of a float.
  learning_rate: 1.0e-5
  epochs: 3
  beta: 0.1  # DPO temperature parameter
  gradient_accumulation: 4
data:
  train_ratio: 0.9
  max_prompt_length: 256
  max_response_length: 256
evaluation:
  benchmarks:
    - "preference_agreement"
    - "safety_check"
    - "helpfulness_rating"
  sample_size: 100
Step 2: Data Generation
# scripts/generate_preferences.py
import json
from transformers import pipeline
from tqdm import tqdm
def _completion_text(output, prompt):
    """Return the generated continuation as a plain string.

    Accepts either a plain string or the list-of-dicts structure returned by
    a Hugging Face text-generation pipeline (``[{"generated_text": ...}]``).
    """
    text = output if isinstance(output, str) else output[0]["generated_text"]
    # Text-generation pipelines echo the prompt by default; keep only the
    # continuation so "chosen"/"rejected" hold the response alone.
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text


def generate_preference_data(prompts, model, num_samples=1000):
    """Generate preference pairs by sampling two responses per prompt.

    For each prompt, two responses are sampled at different temperatures,
    both are scored, and the higher-scoring one is recorded as "chosen".

    Args:
        prompts: Sequence of prompt strings; only the first ``num_samples``
            entries are used.
        model: Callable invoked as ``model(prompt, temperature=...,
            max_new_tokens=..., do_sample=True)``. It may return a string or
            a text-generation pipeline result.
        num_samples: Maximum number of prompts to process.

    Returns:
        A list of dicts with the keys "prompt", "chosen" and "rejected",
        each holding plain text so the result can be written as JSON lines.
    """
    preference_data = []
    for prompt in tqdm(prompts[:num_samples]):
        # Sample two different responses using temperature variation
        raw_a = model(
            prompt,
            temperature=0.7,
            max_new_tokens=150,
            do_sample=True,
        )
        raw_b = model(
            prompt,
            temperature=1.2,
            max_new_tokens=150,
            do_sample=True,
        )
        response_a = _completion_text(raw_a, prompt)
        response_b = _completion_text(raw_b, prompt)

        # Score both responses.
        # NOTE(review): reward_model_score is not defined in this snippet; it
        # is assumed to accept the response text - confirm against its
        # definition.
        score_a = reward_model_score(response_a)
        score_b = reward_model_score(response_b)

        # Create preference pair; a tie keeps response A as the chosen one.
        if score_a >= score_b:
            chosen, rejected = response_a, response_b
        else:
            chosen, rejected = response_b, response_a
        preference_data.append({
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
        })
    return preference_data
if __name__ == "__main__":
    import argparse

    # Expose the paths as flags so the invocation in run_pipeline.sh
    # (--prompts / --output) takes effect; the defaults keep the original
    # hard-coded paths.
    parser = argparse.ArgumentParser(
        description="Generate preference pairs for DPO training."
    )
    parser.add_argument("--prompts", default="data/prompts.jsonl")
    parser.add_argument("--output", default="data/preferences.jsonl")
    cli_args = parser.parse_args()

    # NOTE(review): load_prompts and save_preferences are not defined in this
    # snippet; they are expected to be provided elsewhere in the project.
    prompts = load_prompts(cli_args.prompts)
    generator = pipeline("text-generation", model="gpt2")
    # The generator must be passed explicitly: generate_preference_data has
    # no default for its `model` parameter.
    preferences = generate_preference_data(prompts, generator)
    save_preferences(preferences, cli_args.output)
Step 3: DPO Training Implementation
# src/dpo_trainer.py
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import Trainer
class DPOTrainer(Trainer):
    """Trainer subclass that optimizes the Direct Preference Optimization loss.

    The loss compares how strongly the policy prefers the chosen response
    over the rejected one with the same preference under a reference model.
    """

    def __init__(self, *args, ref_model=None, beta=0.1, **kwargs):
        """Set up the trainer.

        Args:
            *args: Positional arguments forwarded to ``Trainer``.
            ref_model: Reference model whose log-probabilities provide the
                baseline in ``compute_loss``.
            beta: Scale applied to the difference between the policy and
                reference log-ratios.
            **kwargs: Keyword arguments forwarded to ``Trainer``.
        """
        # ref_model is captured here instead of being forwarded: the base
        # Trainer constructor does not take it, and compute_loss reads
        # self.ref_model.
        super().__init__(*args, **kwargs)
        self.ref_model = ref_model
        self.beta = beta

    def compute_loss(self, model, batch, return_outputs=False,
                     num_items_in_batch=None):
        """Compute the DPO loss for one batch.

        Args:
            model: Policy model being trained.
            batch: Mapping with the keys "prompt", "chosen" and "rejected".
            return_outputs: When true, also return a dict with the mean
                policy log-ratio.
            num_items_in_batch: Accepted so that Trainer versions which pass
                this keyword can call the method; it is not used.

        Returns:
            The scalar loss, or ``(loss, {"policy_ratio": ...})`` when
            ``return_outputs`` is true.

        Raises:
            ValueError: If the trainer was built without a reference model.
        """
        if self.ref_model is None:
            raise ValueError(
                "DPOTrainer requires ref_model to compute the DPO loss"
            )

        prompt = batch["prompt"]
        chosen = batch["chosen"]
        rejected = batch["rejected"]

        # Policy log probs
        # NOTE(review): assumes model(prompt, response) returns an object
        # exposing `.log_probs`; stock Hugging Face causal LMs do not -
        # confirm against the model wrapper in use.
        policy_chosen = model(prompt, chosen).log_probs
        policy_rejected = model(prompt, rejected).log_probs

        # Reference log probs; no gradients flow into the reference model.
        with torch.no_grad():
            ref_chosen = self.ref_model(prompt, chosen).log_probs
            ref_rejected = self.ref_model(prompt, rejected).log_probs

        # DPO loss: reward the policy for preferring "chosen" over
        # "rejected" more strongly than the reference does.
        policy_ratio = policy_chosen - policy_rejected
        ref_ratio = ref_chosen - ref_rejected
        loss = -F.logsigmoid(self.beta * (policy_ratio - ref_ratio)).mean()

        if return_outputs:
            return loss, {"policy_ratio": policy_ratio.mean()}
        return loss
def train_alignment():
    """Fine-tune GPT-2 with DPO on data/preferences.jsonl and save the result.

    Loads the preference pairs, creates a trainable policy plus a second copy
    of the same checkpoint as the reference, trains with ``DPOTrainer``, and
    writes the model to outputs/aligned_model.
    """
    from datasets import load_dataset
    # These names are used below but were never imported in this module.
    from transformers import AutoModelForCausalLM, TrainingArguments

    # Load data
    dataset = load_dataset("json", data_files="data/preferences.jsonl")

    # Initialize model and reference
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    ref_model = AutoModelForCausalLM.from_pretrained("gpt2")
    ref_model.eval()

    # Initialize trainer
    trainer = DPOTrainer(
        model=model,
        ref_model=ref_model,
        train_dataset=dataset["train"],
        args=TrainingArguments(
            output_dir="outputs/aligned_model",
            num_train_epochs=3,
            per_device_train_batch_size=8,
            # Mirrors `gradient_accumulation: 4` in
            # config/training_config.yaml.
            gradient_accumulation_steps=4,
            learning_rate=1e-5,
            warmup_steps=100,
            # Keep the prompt/chosen/rejected columns: by default Trainer
            # drops every dataset column that is not an argument of the
            # model's forward method.
            remove_unused_columns=False,
        ),
        beta=0.1,
    )

    # Train
    trainer.train()

    # Save
    trainer.save_model("outputs/aligned_model")
Step 4: Evaluation
# scripts/evaluate.py
def evaluate_alignment(model, eval_data):
    """Score a model on preference agreement, safety and helpfulness.

    Each metric loads its own evaluation set through a loader helper and
    passes it, together with ``model``, to the matching scoring helper.

    Args:
        model: Model under evaluation, forwarded to every scoring helper.
        eval_data: Not read by this function; the loader helpers supply the
            evaluation data.

    Returns:
        A dict with the keys "preference_agreement", "safety_metrics" and
        "helpfulness_metrics", each holding the value returned by the
        corresponding scoring helper.
    """
    # Preference agreement
    pairs = load_eval_pairs()
    agreement_result = measure_preference_agreement(model, pairs)

    # Safety
    unsafe_prompts = load_safety_prompts()
    safety_result = evaluate_safety(model, unsafe_prompts)

    # Helpfulness
    task_prompts = load_helpful_prompts()
    helpfulness_result = evaluate_helpfulness(model, task_prompts)

    return {
        "preference_agreement": agreement_result,
        "safety_metrics": safety_result,
        "helpfulness_metrics": helpfulness_result,
    }
def print_evaluation_report(results):
    """Print a summary and a per-category breakdown of evaluation results.

    Args:
        results: Mapping of category name to a mapping of metric values. It
            must contain "preference_agreement" with a "rate" entry, and
            "safety_metrics" and "helpfulness_metrics" each with a "score"
            entry; those three values are printed as percentages.
    """
    rule = "=" * 60
    print(rule)
    print("ALIGNMENT EVALUATION REPORT")
    print(rule)

    # Headline numbers, formatted as percentages with one decimal place.
    print(f"\nPreference Agreement: {results['preference_agreement']['rate']:.1%}")
    print(f"Safety Score: {results['safety_metrics']['score']:.1%}")
    print(f"Helpfulness Score: {results['helpfulness_metrics']['score']:.1%}")

    # Every metric of every category, in mapping order.
    print("\nDetailed Metrics:")
    for section_name in results:
        print(f"\n {section_name}:")
        for metric_name, metric_value in results[section_name].items():
            print(f" {metric_name}: {metric_value}")
Step 5: Running the Pipeline
#!/bin/bash
# run_pipeline.sh
# Runs the alignment pipeline end to end: preference generation, DPO
# training, evaluation, and a printed report. All paths are relative, so
# run this from the project root.
set -e

echo "Step 1: Generate preference data..."
python scripts/generate_preferences.py \
    --prompts data/prompts.jsonl \
    --output data/preferences.jsonl

echo "Step 2: Train alignment with DPO..."
python scripts/train_alignment.py \
    --config config/training_config.yaml \
    --output outputs/aligned_model

echo "Step 3: Evaluate alignment..."
python scripts/evaluate.py \
    --model outputs/aligned_model \
    --eval-data data/eval_pairs.jsonl \
    --output outputs/evaluation_report.json

echo "Step 4: Generate report..."
# `python -c` starts with an empty namespace, so print_evaluation_report
# (defined in scripts/evaluate.py) has to be imported explicitly.
# NOTE(review): assumes scripts/evaluate.py can be imported without running
# its command-line entry point - confirm it is guarded by `__main__`.
python -c "
import json
from scripts.evaluate import print_evaluation_report
with open('outputs/evaluation_report.json') as f:
    results = json.load(f)
print_evaluation_report(results)
"
Expected Outcomes
After completing the pipeline, you should see results in roughly these ranges:
| Metric | Base Model | Aligned Model | Target |
|---|---|---|---|
| Preference agreement | ~50% | 65-75% | >70% |
| Safety score | Variable | >85% | >80% |
| Helpfulness score | Variable | >75% | >70% |
Troubleshooting
Common Issues:
- Training instability: Reduce the learning rate or the beta parameter
- Mode collapse: Check data quality; ensure diverse prompts
- Over-refusal: Adjust the ratio of refusal to non-refusal examples in the preference data
- Capability loss: Mix in capability-preserving examples
EXERCISE
Run the complete pipeline with a small model (gpt2 or equivalent). Measure the alignment improvement over the base model and identify one weakness in the final model. Then implement a fix and re-run the evaluation to measure the improvement.