Output Filtering — AI Safety and Alignment (Chapter 16)

Output filtering validates generated content against safety policies before delivery. Unlike guardrails, which constrain generation, filtering operates on completed outputs and enables more nuanced policy application.

Rule-Based and ML Hybrid Filtering

import re
from typing import Optional

class OutputFilter:
    """Layered output filtering system.

    Runs three checks on a completed output, in this order: an NSFW
    classifier, a regex blocklist, and PII detection with redaction.
    Blocking checks run before redaction so that text containing PII
    cannot skip the policy blocklist.

    Args:
        nsfw_classifier: Callable invoked as ``nsfw_classifier(text)``; its
            result is indexed with ``'label'`` and ``'score'``.
        pii_detector: Object whose ``run(text)`` returns the PII matches.
        nsfw_threshold: Unsafe-confidence level above which content is
            blocked outright. Defaults to the previous hard-coded 0.95.
    """

    # Placeholder substituted for every redacted PII span.
    REDACTION_TOKEN = "[REDACTED]"

    def __init__(self, nsfw_classifier, pii_detector, *, nsfw_threshold: float = 0.95):
        self.nsfw_classifier = nsfw_classifier
        self.pii_detector = pii_detector
        self.nsfw_threshold = nsfw_threshold
        self.blocklist = self._load_blocklist()

    def _load_blocklist(self) -> dict[str, str]:
        """Return the mapping of policy category to regex pattern."""
        return {
            'financial_advice': r'\$[\d,]+.*invest|\d+%.*return',
            'medical_advice': r'diagnosis|take.*mg.*without|prescription.*without',
        }

    def check_nsfw(self, text: str) -> tuple[bool, float]:
        """Classify content safety.

        Returns:
            ``(is_safe, safe_score)``. ``is_safe`` is True when the
            classifier label equals ``'SAFE'``. ``safe_score`` is the
            classifier score when safe and ``1 - score`` otherwise.

        NOTE(review): this reads as "probability the text is safe" only if
        ``score`` is the classifier's confidence in the label it reports;
        confirm against the classifier in use.
        """
        result = self.nsfw_classifier(text)
        is_safe = result['label'] == 'SAFE'
        confidence = result['score'] if is_safe else 1 - result['score']
        return is_safe, confidence

    def detect_pii(self, text: str) -> list[dict]:
        """Return the PII matches reported by the detector for ``text``."""
        return self.pii_detector.run(text)

    def check_blocklist(self, text: str) -> Optional[str]:
        """Return the first blocklist category whose pattern matches, else None.

        Matching is case-insensitive; categories are tried in the order the
        blocklist mapping defines them.
        """
        for category, pattern in self.blocklist.items():
            if re.search(pattern, text, re.IGNORECASE):
                return category
        return None

    def _redact_pii(self, text: str, pii_matches: list[dict]) -> str:
        """Replace every reported PII match in ``text`` with REDACTION_TOKEN.

        NOTE(review): the detector's match schema is not defined in this
        class. This assumes each match carries either integer ``'start'`` /
        ``'end'`` offsets into ``text`` or the matched string under
        ``'text'`` (or ``'value'``) -- confirm against the detector in use.

        Raises:
            ValueError: If a match provides neither usable offsets nor a
                literal string. Failing loudly is preferred to returning
                text that still contains the PII.
        """
        spans = []
        literals = []
        for match in pii_matches:
            start, end = match.get('start'), match.get('end')
            if (
                isinstance(start, int)
                and isinstance(end, int)
                and 0 <= start < end <= len(text)
            ):
                spans.append((start, end))
                continue
            literal = match.get('text') or match.get('value')
            if isinstance(literal, str) and literal:
                literals.append(literal)
                continue
            raise ValueError(f"Cannot redact PII match with unsupported shape: {match!r}")

        # Merge overlapping/adjacent spans so each region is replaced once.
        merged = []
        for start, end in sorted(spans):
            if merged and start <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])

        # Rebuild the string around the spans; offsets refer to the original
        # text, so slicing (rather than in-place replacement) keeps them valid.
        pieces = []
        cursor = 0
        for start, end in merged:
            pieces.append(text[cursor:start])
            pieces.append(self.REDACTION_TOKEN)
            cursor = end
        pieces.append(text[cursor:])
        redacted = "".join(pieces)

        # Longest literals first so a short match cannot split a longer one.
        for literal in sorted(literals, key=len, reverse=True):
            redacted = redacted.replace(literal, self.REDACTION_TOKEN)
        return redacted

    def filter(
        self, text: str, context: dict
    ) -> tuple[bool, str, Optional[dict]]:
        """
        Return (is_acceptable, filtered_text, violation_report).

        Args:
            text: The completed output to validate.
            context: Request options. A truthy ``'allow_pii'`` disables PII
                redaction; a missing key, ``False``, ``None`` or an empty
                context all mean PII is redacted.

        Returns:
            ``(False, placeholder, report)`` when the text is blocked by the
            NSFW check or the blocklist; ``(True, redacted_text, report)``
            when PII was redacted; ``(True, text, None)`` otherwise.
        """
        is_safe, safe_score = self.check_nsfw(text)
        # check_nsfw reports a safety score, so confidence in the *unsafe*
        # verdict is its complement.
        unsafe_confidence = 1 - safe_score
        if not is_safe and unsafe_confidence > self.nsfw_threshold:
            return False, "[Content filtered: safety violation]", {
                'reason': 'nsfw', 'confidence': unsafe_confidence
            }

        # Policy check runs before PII handling: redaction returns an
        # acceptable result and must not let blocked content through.
        blocklist_violation = self.check_blocklist(text)
        if blocklist_violation:
            return False, "[Content filtered: policy violation]", {
                'reason': blocklist_violation
            }

        pii_matches = self.detect_pii(text)
        if pii_matches and not (context or {}).get('allow_pii', False):
            filtered = self._redact_pii(text, pii_matches)
            return True, filtered, {'pii_redacted': len(pii_matches)}

        return True, text, None

Local verification checkpoint

Run the smallest example from this chapter in a local workspace and record the package version, runtime, data path, and observed output. If the result depends on model size, vector count, CPU/GPU backend, or available memory, note that constraint beside the exercise so the lesson remains reproducible.