Data Augmentation — Advanced NLP with Local Models (Chapter 15)

Data augmentation expands training sets by generating modified versions of existing examples. For NLP, augmentation techniques range from simple token replacement to sophisticated generative models.

Back-Translation Augmentation

Translating text to another language and back creates paraphrase-like variants:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

class BackTranslationAugmenter:
    """Create paraphrase-like variants by round-tripping text through a pivot language.

    Two ``Helsinki-NLP/opus-mt-*`` translation checkpoints are loaded at
    construction time: source -> pivot and pivot -> source.
    """

    def __init__(self, src_lang="en", pivot_lang="fr"):
        """Load the models and tokenizers for both translation directions.

        Args:
            src_lang: Language code of the text to augment.
            pivot_lang: Language code of the intermediate language.
        """
        self.src_lang = src_lang
        self.pivot_lang = pivot_lang

        # Build each checkpoint name once so a model and its tokenizer can
        # never end up pointing at different (or malformed) repository ids.
        to_pivot_name = self._model_name(src_lang, pivot_lang)
        to_src_name = self._model_name(pivot_lang, src_lang)

        # Source to pivot language model
        self.to_pivot = AutoModelForSeq2SeqLM.from_pretrained(to_pivot_name)
        self.to_pivot_tokenizer = AutoTokenizer.from_pretrained(to_pivot_name)

        # Pivot to source language model
        self.to_src = AutoModelForSeq2SeqLM.from_pretrained(to_src_name)
        self.to_src_tokenizer = AutoTokenizer.from_pretrained(to_src_name)

    @staticmethod
    def _model_name(source: str, target: str) -> str:
        """Return the ``Helsinki-NLP/opus-mt-<source>-<target>`` repository id."""
        return f"Helsinki-NLP/opus-mt-{source}-{target}"

    def translate(self, text: str, model, tokenizer) -> str:
        """Translate ``text`` with the given model/tokenizer pair.

        Args:
            text: The string to translate.
            model: Seq2seq model used for generation.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            The decoded first generated sequence, without special tokens.
        """
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=512)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    def augment(self, text: str) -> str:
        """Back-translate through the pivot language.

        Args:
            text: Source-language text to augment.

        Returns:
            The text after source -> pivot -> source translation. Blank
            input is returned unchanged, since there is nothing to translate.
        """
        if not text.strip():
            return text
        pivot_text = self.translate(text, self.to_pivot, self.to_pivot_tokenizer)
        return self.translate(pivot_text, self.to_src, self.to_src_tokenizer)

LLM-Based Paraphrase Generation

Modern LLMs generate diverse paraphrases with controlled attributes:

from transformers import AutoModelForCausalLM, AutoTokenizer

class LLMParaphraseAugmenter:
    """Generate paraphrases of a text by prompting a local causal language model."""

    def __init__(self, model_path="./models/llama-2-13b-chat-hf"):
        """Load the tokenizer and causal LM from ``model_path``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

    def paraphrase(self, text: str, n_variants: int = 3,
                   diversity: float = 0.7) -> list[str]:
        """Generate paraphrased variants.

        Args:
            text: The text to paraphrase.
            n_variants: Maximum number of paraphrases to return.
            diversity: Sampling temperature passed to ``generate``.

        Returns:
            Up to ``n_variants`` paraphrases parsed from the model output.
            Empty when ``n_variants`` is not positive or ``text`` is blank.
        """
        if n_variants <= 0 or not text.strip():
            return []

        prompt = f"""Generate {n_variants} diverse paraphrases of this text.
Vary sentence structure and word choice while preserving meaning.

Text: {text}

Paraphrases:"""

        inputs = self.tokenizer(prompt, return_tensors="pt",
                                truncation=True, max_length=512)
        # Sampled sequences start with the prompt tokens; remember how many
        # there are so only the continuation is decoded and parsed below.
        prompt_len = inputs["input_ids"].shape[1]
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=diversity,
            do_sample=True,
            top_p=0.9,
            num_return_sequences=n_variants
        )

        responses = []
        for output in outputs:
            completion = self.tokenizer.decode(output[prompt_len:],
                                               skip_special_tokens=True)
            # Extract paraphrases from response
            responses.extend(self._extract_paraphrases(completion))

        return responses[:n_variants]

    def _extract_paraphrases(self, response: str) -> list[str]:
        """Parse paraphrases from LLM output.

        Each non-empty line is treated as one candidate. A leading list
        marker ("1.", "2)", "-", "*") is removed, a "Paraphrase...:" label
        is dropped (a bare "Paraphrases:" header yields nothing), and
        candidates of 10 characters or fewer are discarded as noise.
        """
        import re  # local import keeps this snippet self-contained

        paraphrases = []
        for raw_line in response.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            # Strip only a real list marker (digits + punctuation, or a
            # bullet) so sentences that begin with a number stay intact.
            line = re.sub(r'^(?:\d+[.):\-]|[-*\u2022])\s+', '', line)
            if line.startswith('Paraphrase'):
                # "Paraphrases:" header or "Paraphrase 2: ..." label:
                # keep whatever follows the colon, if anything.
                line = line.partition(':')[2].strip()
            if len(line) > 10:
                paraphrases.append(line)
        return paraphrases

Synonym Replacement with FastText

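Efficient word-level augmentation that replaces words with their nearest neighbors in FastText embedding space (distributionally similar words, which are often but not always true synonyms):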

import fasttext
import random

class FastTextAugmenter:
    """Word-level augmentation that swaps words for FastText nearest neighbours."""

    def __init__(self, model_path="cc.en.300.bin"):
        """Load the FastText model and define the stopwords that are never replaced."""
        self.model = fasttext.load_model(model_path)
        self.stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be',
                          'been', 'being', 'have', 'has', 'had', 'do', 'does',
                          'did', 'will', 'would', 'could', 'should', 'may', 'might'}

    def augment(self, text: str, aug_ratio: float = 0.15, top_k: int = 5) -> str:
        """Replace words with similar words based on FastText embeddings.

        Args:
            text: Whitespace-tokenised input text.
            aug_ratio: Fraction of the words to try to replace.
            top_k: Number of nearest neighbours to choose a replacement from.

        Returns:
            The text with the selected words replaced, joined by single
            spaces. The original ``text`` is returned untouched when it has
            no non-stopword tokens.
        """
        words = text.split()
        n_aug = int(len(words) * aug_ratio)
        if aug_ratio > 0:
            # Short texts would otherwise round down to zero replacements
            # and come back unchanged.
            n_aug = max(1, n_aug)

        # Select random words to augment (excluding stopwords)
        candidates = [i for i, w in enumerate(words) if w.lower() not in self.stopwords]
        if not candidates:
            return text

        aug_indices = random.sample(candidates, min(n_aug, len(candidates)))

        for idx in aug_indices:
            similar = self._get_similar_words(words[idx], top_k)
            if similar:
                words[idx] = random.choice(similar)

        return ' '.join(words)

    def _get_similar_words(self, word: str, top_k: int) -> list[str]:
        """Get similar words from FastText model.

        Returns at most ``top_k`` neighbours whose similarity score is
        above 0.7, in the order the model returns them.
        """
        # fastText yields (score, word) pairs. Ask for at least its default
        # of 10 neighbours, or more when top_k requires it.
        neighbors = self.model.get_nearest_neighbors(word, k=max(top_k, 10))
        return [neighbor for score, neighbor in neighbors if score > 0.7][:top_k]