16. Output Filtering
Chapter 16 of 18 · 15 min
Output filtering validates generated content against safety policies before delivery. Unlike guardrails that constrain generation, filtering operates on completed outputs and enables more nuanced policy application.
Rule-Based and ML Hybrid Filtering
import re
from typing import Optional
class OutputFilter:
    """Layered output filtering system.

    Completed model output is passed through three stages:

    1. a safety classifier (hard block on a confident unsafe verdict),
    2. a regex blocklist of domain-specific policy categories (hard block),
    3. PII detection followed by redaction (soft: text is still delivered).

    The hard-blocking stages run before redaction so that the presence of
    PII can never let a policy violation through.
    """

    def __init__(self, nsfw_classifier, pii_detector):
        """Store the injected components and load the blocklist.

        Args:
            nsfw_classifier: Callable taking the text and returning a mapping
                with ``'label'`` and ``'score'`` keys.
            pii_detector: Object exposing ``run(text)``, which returns a list
                of match dicts.
        """
        self.nsfw_classifier = nsfw_classifier
        self.pii_detector = pii_detector
        self.blocklist = self._load_blocklist()

    def _load_blocklist(self) -> dict[str, str]:
        """Load domain-specific blocked content patterns.

        Returns:
            Mapping of policy category name to a regular-expression source
            string. Patterns are matched case-insensitively by
            ``check_blocklist``.
        """
        return {
            'financial_advice': r'\$[\d,]+.*invest|\d+%.*return',
            'medical_advice': r'diagnosis|take.*mg.*without|prescription.*without',
        }

    def check_nsfw(self, text: str) -> tuple[bool, float]:
        """Classify content safety with confidence score.

        Returns:
            ``(is_safe, confidence)``. ``is_safe`` is true when the classifier
            label is exactly ``'SAFE'``.
        """
        result = self.nsfw_classifier(text)
        is_safe = result['label'] == 'SAFE'
        # NOTE(review): this arithmetic is only a "confidence in the unsafe
        # verdict" if ``score`` is the probability of the SAFE class. If the
        # classifier reports the probability of the *predicted* label instead,
        # a confident unsafe verdict yields a LOW value here and the hard
        # block in ``filter`` can never fire. Confirm the classifier contract.
        confidence = result['score'] if is_safe else 1 - result['score']
        return is_safe, confidence

    def detect_pii(self, text: str) -> list[dict]:
        """Extract personally identifiable information via the detector."""
        return self.pii_detector.run(text)

    def check_blocklist(self, text: str) -> Optional[str]:
        """Identify blocklist violations.

        Returns:
            The first category (in blocklist order) whose pattern is found in
            ``text``, or ``None`` when nothing matches.
        """
        for category, pattern in self.blocklist.items():
            if re.search(pattern, text, re.IGNORECASE):
                return category
        return None

    def _redact_pii(self, text: str, pii_matches: list[dict]) -> str:
        """Replace every detected PII span with a ``[REDACTED:<label>]`` marker.

        The match schema is not fixed by this class, so this helper makes a
        documented assumption (TODO confirm against the detector in use):
        each match carries either integer ``'start'``/``'end'`` character
        offsets into ``text``, or a ``'text'`` key holding the literal PII
        value. The label is taken from ``'type'`` or ``'entity_type'`` and
        defaults to ``'PII'``.

        Raises:
            ValueError: If a match cannot be located in ``text``. Failing
                loudly is deliberate: silently returning un-redacted text
                would leak the PII while reporting it as redacted.
        """
        spans = []
        for match in pii_matches:
            label = match.get('type') or match.get('entity_type') or 'PII'
            start, end = match.get('start'), match.get('end')
            if isinstance(start, int) and isinstance(end, int):
                if not 0 <= start < end <= len(text):
                    raise ValueError(f"PII match offsets out of range: {match!r}")
                spans.append((start, end, label))
                continue

            literal = match.get('text')
            if not isinstance(literal, str) or not literal:
                raise ValueError(f"PII match has no usable location: {match!r}")
            found = [m.span() for m in re.finditer(re.escape(literal), text)]
            if not found:
                raise ValueError(f"PII match not found in text: {match!r}")
            spans.extend((lo, hi, label) for lo, hi in found)

        # Walk left to right, merging overlapping spans into the marker
        # already emitted, so offsets never need to be re-based.
        spans.sort(key=lambda span: (span[0], span[1]))
        pieces = []
        cursor = 0
        for start, end, label in spans:
            if start < cursor:
                cursor = max(cursor, end)
                continue
            pieces.append(text[cursor:start])
            pieces.append(f'[REDACTED:{label}]')
            cursor = end
        pieces.append(text[cursor:])
        return ''.join(pieces)

    def filter(
        self, text: str, context: dict
    ) -> tuple[bool, str, Optional[dict]]:
        """
        Return (is_acceptable, filtered_text, violation_report).

        Args:
            text: Completed model output to validate.
            context: Per-request options. ``'allow_pii'`` (default falsy)
                disables PII redaction when truthy.

        Returns:
            * ``(False, placeholder, report)`` when the safety classifier is
              more than 0.95 confident the text is unsafe, or when a
              blocklist category matches.
            * ``(True, redacted_text, {'pii_redacted': n})`` when PII was
              found and redacted.
            * ``(True, text, None)`` otherwise.

        Raises:
            ValueError: Propagated from ``_redact_pii`` when a detected PII
                match cannot be located in ``text``.
        """
        is_safe, confidence = self.check_nsfw(text)
        if not is_safe and confidence > 0.95:
            return False, "[Content filtered: safety violation]", {
                'reason': 'nsfw', 'confidence': confidence
            }

        # Hard policy blocks are evaluated before the PII stage: returning
        # the redacted text first would let any response that happens to
        # contain PII bypass the blocklist entirely.
        blocklist_violation = self.check_blocklist(text)
        if blocklist_violation:
            return False, "[Content filtered: policy violation]", {
                'reason': blocklist_violation
            }

        # Truthiness, not ``is False``: a None/0 'allow_pii' must not be
        # read as permission to deliver PII.
        allow_pii = bool((context or {}).get('allow_pii', False))
        if not allow_pii:
            pii_matches = self.detect_pii(text)
            if pii_matches:
                filtered = self._redact_pii(text, pii_matches)
                return True, filtered, {'pii_redacted': len(pii_matches)}

        return True, text, None
Local verification checkpoint
Run the smallest example from this chapter in a local workspace and record the package version, runtime, data path, and observed output. If the result depends on model size, vector count, CPU/GPU backend, or available memory, note that constraint beside the exercise so the lesson remains reproducible.
EXERCISE
Build an output filtering system that processes a batch of model responses, flags violations across multiple categories, and produces a structured audit report.