Safety Guardrails — AI Safety and Alignment (Chapter 14)

Safety guardrails intercept harmful content before it reaches the model or the user. Effective guardrails operate at multiple levels: prompt filtering (before generation), generation constraints (during decoding), and response validation (after generation).

Prompt Classification Pipeline

import re

from transformers import pipeline

class SafetyGuardrail:
    """Multi-stage safety classification pipeline.

    Stage 1 (``classify_prompt``) scores an incoming prompt with a
    text-classification model and maps the score to ``'BLOCK'``,
    ``'REVIEW'`` or ``'ALLOW'``. Stage 2 (``apply_guardrail``) scans a
    generated response for known harm patterns and substitutes a refusal
    message on the first match.

    NOTE: the default checkpoint is an SST-2 sentiment model, so the risk
    score is the probability of the ``NEGATIVE`` label, i.e. negative
    sentiment, not harmfulness. It keeps the example small; use a purpose-
    built safety classifier for anything beyond illustration.
    """

    # Hub id of the DistilBERT SST-2 checkpoint (labels: NEGATIVE / POSITIVE).
    MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

    # (category, compiled pattern) pairs, checked in this order. Compiled
    # once at class creation instead of on every call. These are simple
    # keyword patterns and will also match benign text that merely
    # contains the keywords.
    HARM_PATTERNS = (
        ('PI', re.compile(r'\b(weapon|explosive|bomb)\b', re.IGNORECASE)),
        ('CSAM', re.compile(r'decade.*younger|minor.*sexual', re.IGNORECASE)),
        ('HATE', re.compile(r'\bhate\b|\bdehumaniz', re.IGNORECASE)),
    )

    def __init__(self, harmful_threshold=0.7, uncertain_threshold=0.4):
        """Load the classifier and store the decision thresholds.

        Args:
            harmful_threshold: Scores at or above this value are blocked.
            uncertain_threshold: Scores at or above this value (but below
                ``harmful_threshold``) are flagged for review. If it is
                set higher than ``harmful_threshold`` the review band is
                empty.
        """
        self.classifier = pipeline(
            "text-classification",
            model=self.MODEL_NAME,
        )
        self.harmful_threshold = harmful_threshold
        self.uncertain_threshold = uncertain_threshold

    def classify_prompt(self, prompt: str) -> tuple[str, float]:
        """Classify prompt risk level and return category + confidence.

        Args:
            prompt: The user prompt to score.

        Returns:
            A ``(decision, score)`` pair where ``decision`` is ``'BLOCK'``,
            ``'REVIEW'`` or ``'ALLOW'`` and ``score`` is the probability
            assigned to the ``NEGATIVE`` label.
        """
        result = self.classifier(prompt)[0]
        # The pipeline reports only the winning label and its probability;
        # for a two-label model the other label's probability is 1 - score.
        if result['label'] == 'NEGATIVE':
            score = result['score']
        else:
            score = 1 - result['score']

        if score >= self.harmful_threshold:
            return 'BLOCK', score
        if score >= self.uncertain_threshold:
            return 'REVIEW', score
        return 'ALLOW', score

    def apply_guardrail(self, prompt: str, response: str) -> str:
        """Validate response against known harm patterns.

        Args:
            prompt: The originating prompt. Currently unused; kept so the
                call signature stays stable.
            response: The generated text to validate.

        Returns:
            ``response`` unchanged when no pattern matches, otherwise a
            refusal message naming the first matching category.
        """
        for category, pattern in self.HARM_PATTERNS:
            if pattern.search(response):
                return f"I can't complete this request. [Category: {category}]"

        return response

Token-Level Blocking

class TokenBlocklist:
    """Block generation of specific token sequences.

    Blocked phrases are stored in a character-level trie. During
    generation, ``should_block_token`` decodes the ids produced so far and
    reports whether any blocked phrase occurs in the decoded text.
    """

    # Phrases blocked when the caller does not supply its own list.
    DEFAULT_BLOCKED_PHRASES = (
        "how to create", "instructions for making",
        "step-by-step guide to building",
    )

    # Trie key marking the end of a complete phrase. It is longer than one
    # character, so it can never collide with a single-character edge key.
    _END = '$END$'

    def __init__(self, vocab, blocked_phrases=None):
        """Store the decoder and build the phrase trie.

        Args:
            vocab: Object exposing ``decode(ids) -> str``.
            blocked_phrases: Optional iterable of phrases to block.
                Defaults to ``DEFAULT_BLOCKED_PHRASES``.
        """
        self.vocab = vocab
        if blocked_phrases is None:
            blocked_phrases = self.DEFAULT_BLOCKED_PHRASES
        self.blocked_phrases = tuple(blocked_phrases)
        self.blocked_prefixes = self._build_prefix_tree()

    @staticmethod
    def _normalize(text):
        """Collapse whitespace runs to single spaces and lowercase."""
        return ' '.join(text.split()).lower()

    def _build_prefix_tree(self):
        """Build a prefix trie of blocked phrases."""
        trie = {}
        for phrase in self.blocked_phrases:
            # Normalize phrases exactly like decoded text so the two can
            # be compared character by character.
            normalized = self._normalize(phrase)
            if not normalized:
                # An empty phrase would mark the root as terminal and
                # block every sequence.
                continue
            node = trie
            for char in normalized:
                node = node.setdefault(char, {})
            node[self._END] = True
        return trie

    def should_block_token(self, generated_ids: list[int]) -> bool:
        """Check whether the decoded sequence contains a blocked phrase.

        The trie walk is restarted at every character offset, so a phrase
        is detected wherever it occurs, not only when the whole text is
        exactly equal to it. Text before the phrase or characters after
        it (e.g. from a multi-character token) do not hide a match.

        Args:
            generated_ids: Token ids generated so far.

        Returns:
            True if any blocked phrase occurs in the decoded text.
        """
        text = self._normalize(self.vocab.decode(generated_ids))
        root = self.blocked_prefixes
        end = self._END
        length = len(text)
        for start in range(length):
            node = root
            pos = start
            while pos < length and text[pos] in node:
                node = node[text[pos]]
                if end in node:
                    return True
                pos += 1
        return False

Local verification checkpoint

Run the smallest example from this chapter in a local workspace and record the package version, runtime, data path, and observed output. If the result depends on model size, vector count, CPU/GPU backend, or available memory, note that constraint beside the exercise so the lesson remains reproducible.