KEY INSIGHT
This chapter synthesizes the course content through a complete domain adaptation project: fine-tuning a model for legal document analysis. The project demonstrates data preparation, LoRA configuration, training with gradient checkpointing, evaluation, and deployment preparation.
**Project goal**: Fine-tune a 7B-parameter model to summarize contracts and extract their key provisions.
### Phase 1: Data Preparation
```python
import json
import subprocess
import sys

import torch
from datasets import load_dataset, load_from_disk
from peft import PeftModel, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
def prepare_contract_dataset(output_path="./data/contracts", raw_path="contracts_raw.jsonl"):
    """
    Load raw contracts, wrap each one in the prompt template, and save to disk.

    Expected raw record format: {"text": contract_text, "summary": summary}

    Args:
        output_path: Directory the formatted dataset is written to with
            ``save_to_disk``.
        raw_path: JSON Lines file holding the raw records. Defaults to the
            previously hard-coded ``contracts_raw.jsonl``.

    Returns:
        The formatted dataset with two columns: ``text`` (the prompt, ending
        in the ``### Summary`` header) and ``summary`` (the target text).
    """
    # Load raw contracts
    raw_contracts = load_dataset("json", data_files=raw_path, split="train")

    def format_contract(example):
        # The prompt ends right after the summary header so the target
        # summary can be appended directly at tokenization time.
        return {
            "text": f"### Contract\n{example['text']}\n\n### Summary\n",
            "summary": example["summary"],
        }

    # Drop every raw column; only the two keys returned above are kept.
    contracts = raw_contracts.map(
        format_contract, remove_columns=raw_contracts.column_names
    )
    contracts.save_to_disk(output_path)
    return contracts
def tokenize_contracts(examples, tokenizer, max_length=2048):
    """Tokenize prompt + summary pairs for causal language modeling.

    Works for a single example (``examples["text"]`` is a string) and for a
    batch as passed by ``Dataset.map(..., batched=True)`` (``examples["text"]``
    is a list of strings).

    Args:
        examples: Mapping with ``"text"`` (prompt) and ``"summary"`` (target).
        tokenizer: Callable tokenizer. It is invoked with ``truncation=True``,
            ``max_length`` and ``padding="max_length"``, so it must have a
            padding token configured.
        max_length: Sequence length every example is truncated/padded to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: a copy of
        ``input_ids`` in which padding positions are replaced by -100.
    """
    prompts = examples["text"]
    summaries = examples["summary"]
    is_single = isinstance(prompts, str)

    if is_single:
        full_texts = prompts + summaries
    else:
        # In batched mode both columns are lists; `prompts + summaries` would
        # concatenate the lists (2N rows) rather than join each pair.
        full_texts = [prompt + summary for prompt, summary in zip(prompts, summaries)]

    result = tokenizer(
        full_texts,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    def mask_padding(token_ids, mask):
        # -100 is the index ignored by the cross-entropy loss, so padded
        # positions do not contribute to training.
        return [token if keep else -100 for token, keep in zip(token_ids, mask)]

    input_ids = result["input_ids"]
    attention_mask = result.get("attention_mask")

    if attention_mask is None:
        # No mask available: fall back to plain copies of input_ids.
        labels = list(input_ids) if is_single else [list(ids) for ids in input_ids]
    elif is_single:
        labels = mask_padding(input_ids, attention_mask)
    else:
        labels = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]

    result["labels"] = labels
    return result
```
### Phase 2: LoRA Configuration
```python
from peft import LoraConfig, get_peft_model, TaskType
def create_contract_model(model_name="meta-llama/Llama-2-7b-hf"):
    """Initialize a 4-bit quantized base model with LoRA adapters attached.

    Args:
        model_name: Hub id or local path of the causal LM to load.

    Returns:
        The PEFT-wrapped model. A summary of trainable vs. total parameter
        counts is printed as a side effect.
    """
    # Pass quantization settings through an explicit config object rather
    # than the bare `load_in_4bit` keyword.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        # Match the fp16 dtype used for the rest of the model.
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load base model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        quantization_config=quantization_config,
    )

    # Prepare the quantized model for training. This also turns on gradient
    # checkpointing and makes the inputs require grad, which checkpointing
    # needs when the base weights are frozen.
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    # LoRA on all attention and MLP projection layers.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=[
            "q_proj", "v_proj", "k_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        bias="none",
    )
    model = get_peft_model(model, lora_config)

    trainable_params, total_params = get_trainable_stats(model)
    print(f"Trainable: {trainable_params:,} / {total_params:,} ({100*trainable_params/total_params:.2f}%)")
    return model
def get_trainable_stats(model):
    """Return ``(trainable, total)`` parameter element counts for ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    trainable = 0
    total = 0
    for param in model.parameters():
        count = param.numel()
        total += count
        if param.requires_grad:
            trainable += count
    return trainable, total
```
### Phase 3: Training Loop
```python
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from accelerate import Accelerator
import torch.nn.functional as F
def train_contract_model(
    model,
    train_dataset,
    eval_dataset,
    output_dir="./contract_model",
    epochs=3,
    batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
):
    """Complete training pipeline for contract model.

    Runs ``epochs`` passes over ``train_dataset`` with manual gradient
    accumulation, evaluates on ``eval_dataset`` after each epoch, and saves
    the final model to ``output_dir``.

    Args:
        model: Model whose forward pass accepts a batch as keyword arguments
            and returns an object with a ``.loss`` attribute.
        train_dataset: Training examples. Each collated batch is passed as
            ``model(**batch)``, so it should contain only model inputs.
        eval_dataset: Held-out examples in the same format.
        output_dir: Directory passed to ``save_pretrained``.
        epochs: Number of passes over the training data.
        batch_size: Per-step micro-batch size.
        gradient_accumulation_steps: Micro-batches accumulated per optimizer
            step.
        learning_rate: Peak AdamW learning rate (linear warmup over the first
            10% of steps, then linear decay).
    """
    accelerator = Accelerator(mixed_precision="fp16")

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True
    )
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    # Only hand the optimizer parameters that actually receive gradients;
    # frozen weights need no optimizer state.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=learning_rate, weight_decay=0.01)

    # Round up: the trailing partial accumulation window in each epoch also
    # produces an optimizer step (see the flush condition below).
    steps_per_epoch = (
        len(train_loader) + gradient_accumulation_steps - 1
    ) // gradient_accumulation_steps
    total_steps = steps_per_epoch * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
    )

    # Prepare with accelerator
    model, optimizer, train_loader, eval_loader, scheduler = accelerator.prepare(
        model, optimizer, train_loader, eval_loader, scheduler
    )

    # Training loop
    for epoch in range(epochs):
        model.train()
        num_batches = len(train_loader)
        for step, batch in enumerate(train_loader):
            with accelerator.autocast():
                outputs = model(**batch)
                # Scale so the accumulated gradient averages the window.
                loss = outputs.loss / gradient_accumulation_steps
            accelerator.backward(loss)

            window_full = (step + 1) % gradient_accumulation_steps == 0
            last_batch = (step + 1) == num_batches
            # Also step on the final batch so leftover gradients are applied
            # instead of leaking into the next epoch. A short final window is
            # still divided by the full accumulation count.
            if window_full or last_batch:
                accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if step % 100 == 0:
                # Report the unscaled per-batch loss, not the value divided
                # for accumulation.
                print(f"Epoch {epoch}, Step {step}, Loss: {outputs.loss.item():.4f}")

        # Evaluation
        model.eval()
        eval_loss = 0.0
        eval_batches = 0
        with torch.no_grad():
            for batch in eval_loader:
                outputs = model(**batch)
                eval_loss += outputs.loss.item()
                eval_batches += 1
        if eval_batches:
            print(f"Epoch {epoch}: eval_loss={eval_loss/eval_batches:.4f}")
        else:
            # Avoid dividing by zero when the eval set is empty.
            print(f"Epoch {epoch}: eval_loss=n/a (no evaluation batches)")

    # Save checkpoint: wait for all processes, then write once from the main
    # process only.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        unwrapped = accelerator.unwrap_model(model)
        unwrapped.save_pretrained(output_dir)
# Execute training
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    # tokenize_contracts pads to max_length, which needs a pad token; reuse
    # EOS when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    contracts = prepare_contract_dataset()
    tokenized = contracts.map(
        lambda x: tokenize_contracts(x, tokenizer), batched=True
    )

    # prepare_contract_dataset returns a single Dataset, so create the
    # train/test split explicitly. Expose only the model inputs, as tensors,
    # so each batch can be passed straight to model(**batch).
    splits = tokenized.train_test_split(test_size=0.1, seed=42).with_format(
        "torch", columns=["input_ids", "attention_mask", "labels"]
    )

    model = create_contract_model()
    train_contract_model(model, splits["train"], splits["test"])
```
### Phase 4: Evaluation
```python
def evaluate_contract_model(model, test_cases, tokenizer=None, max_new_tokens=200):
    """
    Evaluate on held-out contract summaries.

    Metrics: mean ROUGE-L F1 and mean generated length in whitespace-separated
    words. Factual consistency and completeness are not computed here.

    Args:
        model: Model exposing ``generate``.
        test_cases: Iterable of ``(contract, reference_summary)`` string pairs.
        tokenizer: Optional tokenizer. When given, the prompt is tokenized,
            moved to ``model.device``, and only the newly generated tokens are
            decoded. When omitted, ``model.generate`` is called with the raw
            prompt string and must return a string.
        max_new_tokens: Generation budget per summary.

    Returns:
        ``{"rouge-l": float, "mean_length": float}``; both are 0.0 when
        ``test_cases`` is empty.
    """
    cases = list(test_cases)
    if not cases:
        # Nothing to score; avoid dividing by zero below.
        return {"rouge-l": 0.0, "mean_length": 0.0}

    from rouge import Rouge

    rouge = Rouge()
    rouge_l_scores = []
    generated_lengths = []

    for contract, reference in cases:
        # Generate summary
        prompt = f"### Contract\n{contract}\n\n### Summary\n"
        if tokenizer is None:
            generated = model.generate(prompt, max_new_tokens=max_new_tokens)
        else:
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
            # Skip the echoed prompt tokens and keep only the continuation.
            prompt_length = inputs["input_ids"].shape[1]
            generated = tokenizer.decode(
                output_ids[0][prompt_length:], skip_special_tokens=True
            )

        # Track the length per case; the mean must cover every generation,
        # not just the last one.
        generated_lengths.append(len(generated.split()))

        # ROUGE score
        if generated.strip() and reference.strip():
            scores = rouge.get_scores(generated, reference)
            rouge_l_scores.append(scores[0]["rouge-l"]["f"])
        else:
            # NOTE(review): the rouge package appears to reject empty
            # strings; score such cases as 0 rather than aborting. Confirm.
            rouge_l_scores.append(0.0)

    return {
        "rouge-l": sum(rouge_l_scores) / len(rouge_l_scores),
        "mean_length": sum(generated_lengths) / len(cases),
    }
```
### Phase 5: Export for Deployment
```python
def export_contract_model(
    adapter_path,
    output_path,
    base_model_name="meta-llama/Llama-2-7b-hf",
    convert_script="llama.cpp/convert.py",
):
    """Convert fine-tuned model to deployment-ready format.

    Merges the LoRA adapter into the base model, saves the merged model and
    tokenizer to ``output_path``, then converts that directory to GGUF.

    Args:
        adapter_path: Directory containing the trained LoRA adapter.
        output_path: Directory for the merged model; the GGUF file is written
            next to it as ``f"{output_path}.gguf"``.
        base_model_name: Base model the adapter was trained on. Defaults to
            the previously hard-coded Llama-2 7B checkpoint.
        convert_script: Path to the llama.cpp conversion script.
            NOTE(review): newer llama.cpp checkouts may name this script
            differently; confirm against the version in use.

    Returns:
        Path of the GGUF file that was written.

    Raises:
        subprocess.CalledProcessError: If the conversion script exits with a
            non-zero status.
    """
    # Merge adapter. Load in fp16: the export below is f16 as well, and this
    # halves the memory needed compared with the default dtype.
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name, torch_dtype=torch.float16
    )
    model = PeftModel.from_pretrained(base_model, adapter_path)
    merged = model.merge_and_unload()

    # Save in standard format
    merged.save_pretrained(output_path)
    # Save the tokenizer alongside the weights so the output directory is
    # self-contained for downstream tools.
    AutoTokenizer.from_pretrained(base_model_name).save_pretrained(output_path)

    # Export to GGUF for CPU inference
    gguf_path = f"{output_path}.gguf"
    subprocess.run(
        [
            # Use the running interpreter, not whatever "python" is on PATH.
            sys.executable, convert_script, output_path,
            "--outfile", gguf_path,
            "--outtype", "f16",
        ],
        # Raise on failure instead of silently continuing without a GGUF file.
        check=True,
    )
    return gguf_path
```