15. Data Augmentation
Chapter 15 of 18 · 20 min
Data augmentation expands training sets by generating modified versions of existing examples. For NLP, augmentation techniques range from simple token replacement to sophisticated generative models.
Back-Translation Augmentation
Translating text to another language and back creates paraphrase-like variants:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
class BackTranslationAugmenter:
    """Create paraphrase-like variants by round-tripping text through a pivot language.

    Two seq2seq translation checkpoints are loaded: one translating
    ``src_lang -> pivot_lang`` and one translating ``pivot_lang -> src_lang``.
    """

    def __init__(self, src_lang="en", pivot_lang="fr"):
        """Load the forward and backward translation models and tokenizers.

        Args:
            src_lang: Language code of the input text.
            pivot_lang: Language code of the intermediate language.
        """
        self.src_lang = src_lang
        self.pivot_lang = pivot_lang

        # Build each checkpoint id once so the model and its tokenizer can
        # never disagree (the forward model id previously had a space where
        # the "/" separator belongs and could not be resolved).
        to_pivot_name = self._model_name(src_lang, pivot_lang)
        to_src_name = self._model_name(pivot_lang, src_lang)

        # Source to pivot language model
        self.to_pivot = AutoModelForSeq2SeqLM.from_pretrained(to_pivot_name)
        self.to_pivot_tokenizer = AutoTokenizer.from_pretrained(to_pivot_name)

        # Pivot to source language model
        self.to_src = AutoModelForSeq2SeqLM.from_pretrained(to_src_name)
        self.to_src_tokenizer = AutoTokenizer.from_pretrained(to_src_name)

    @staticmethod
    def _model_name(source: str, target: str) -> str:
        """Return the hub id of the translation checkpoint for ``source -> target``."""
        return f"Helsinki-NLP/opus-mt-{source}-{target}"

    def translate(self, text: str, model, tokenizer) -> str:
        """Translate ``text`` with the given model/tokenizer pair.

        Args:
            text: Text to translate.
            model: Seq2seq model exposing ``generate``.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            The decoded first generated sequence, without special tokens.
        """
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=512)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    def augment(self, text: str) -> str:
        """Back-translate through pivot language.

        Args:
            text: Text in the source language.

        Returns:
            The text translated to the pivot language and back. Empty or
            whitespace-only input is returned unchanged, since there is
            nothing to translate.
        """
        if not text or not text.strip():
            return text
        pivot_text = self.translate(text, self.to_pivot, self.to_pivot_tokenizer)
        return self.translate(pivot_text, self.to_src, self.to_src_tokenizer)
LLM-Based Paraphrase Generation
Modern LLMs generate diverse paraphrases with controlled attributes:
from transformers import AutoModelForCausalLM, AutoTokenizer
class LLMParaphraseAugmenter:
    """Generate paraphrases of a text by prompting a causal language model."""

    def __init__(self, model_path="./models/llama-2-13b-chat-hf"):
        """Load the tokenizer and causal LM from ``model_path``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

    def paraphrase(self, text: str, n_variants: int = 3,
                   diversity: float = 0.7) -> list[str]:
        """Generate paraphrased variants.

        Args:
            text: Text to paraphrase.
            n_variants: Maximum number of paraphrases to return; also the
                number of sequences sampled from the model.
            diversity: Sampling temperature passed to ``generate``.

        Returns:
            Up to ``n_variants`` distinct paraphrases, in generation order.
            An empty list when ``n_variants`` is less than 1.
        """
        if n_variants < 1:
            return []

        prompt = (
            f"Generate {n_variants} diverse paraphrases of this text.\n"
            "Vary sentence structure and word choice while preserving meaning.\n"
            f"Text: {text}\n"
            "Paraphrases:"
        )
        # max_length only takes effect together with truncation=True.
        inputs = self.tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=512
        )
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=diversity,
            do_sample=True,
            top_p=0.9,
            num_return_sequences=n_variants,
        )

        # A causal LM returns prompt + continuation; drop the prompt tokens so
        # the instruction lines are not mistaken for paraphrases.
        prompt_len = inputs["input_ids"].shape[1]

        responses = []
        for output in outputs:
            text_response = self.tokenizer.decode(
                output[prompt_len:], skip_special_tokens=True
            )
            responses.extend(self._extract_paraphrases(text_response))

        # Different samples can repeat a sentence; keep first occurrences only.
        unique = list(dict.fromkeys(responses))
        return unique[:n_variants]

    def _extract_paraphrases(self, response: str) -> list[str]:
        """Parse paraphrases from LLM output.

        Each non-empty line is treated as one candidate. Lines starting with
        ``Paraphrase`` (e.g. a ``Paraphrases:`` header) are skipped, leading
        list numbering/bullets are removed, and candidates of 10 characters
        or fewer are discarded as noise.
        """
        paraphrases = []
        for line in response.split('\n'):
            line = line.strip()
            # Skip blanks and header lines (the previous condition was
            # inverted and kept *only* the header lines).
            if not line or line.startswith('Paraphrase'):
                continue
            # Remove numbering if present
            clean = line.lstrip('0123456789.-) ')
            if len(clean) > 10:
                paraphrases.append(clean)
        return paraphrases
Synonym Replacement with FastText
Efficient word-level augmentation that treats a word's nearest neighbors in embedding space as approximate synonyms:
import fasttext
import random
class FastTextAugmenter:
    """Word-level augmenter that swaps words for embedding-space neighbors."""

    def __init__(self, model_path="cc.en.300.bin"):
        """Load the FastText model at ``model_path`` and set up the stopword list."""
        self.model = fasttext.load_model(model_path)
        # Function words that are never selected for replacement.
        self.stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be',
                          'been', 'being', 'have', 'has', 'had', 'do', 'does',
                          'did', 'will', 'would', 'could', 'should', 'may', 'might'}

    def augment(self, text: str, aug_ratio: float = 0.15, top_k: int = 5) -> str:
        """Replace words with similar words based on FastText embeddings.

        Args:
            text: Whitespace-separated input text.
            aug_ratio: Fraction of the words to try to replace; the count is
                ``int(len(words) * aug_ratio)``, so short texts may get none.
            top_k: Maximum number of neighbors considered per word.

        Returns:
            The words re-joined with single spaces, with the selected words
            replaced where a sufficiently similar neighbor exists. The input
            is returned unchanged when every word is a stopword.
        """
        words = text.split()
        n_aug = int(len(words) * aug_ratio)

        # Select random words to augment (excluding stopwords)
        candidates = [
            i for i, w in enumerate(words) if w.lower() not in self.stopwords
        ]
        if not candidates:
            return text

        for idx in random.sample(candidates, min(n_aug, len(candidates))):
            similar = self._get_similar_words(words[idx], top_k)
            if similar:
                words[idx] = random.choice(similar)
        return ' '.join(words)

    def _get_similar_words(self, word: str, top_k: int) -> list[str]:
        """Get similar words from FastText model.

        Returns at most ``top_k`` neighbors whose similarity score exceeds 0.7.
        """
        neighbors = self.model.get_nearest_neighbors(word)
        # fastText yields (score, word) pairs; unpacking them the other way
        # round compares a string with 0.7 and raises TypeError.
        return [w for score, w in neighbors if score > 0.7][:top_k]
EXERCISE
Implement a data augmentation pipeline that combines multiple techniques (back-translation, synonym replacement, random insertion/deletion) with quality filtering to select only high-quality augmented samples.