De-identification of PHI — Healthcare AI with Local Models (Chapter 4)

De-identification permits healthcare data to be used outside direct care contexts without triggering full PHI protections. HIPAA recognizes two de-identification approaches: the Safe Harbor method, which removes 18 specified categories of identifiers, and the Expert Determination method, in which a qualified expert applies statistical and scientific principles to demonstrate that the risk of re-identification is very small.

Local LLMs are well suited to automated de-identification because they process data on-premises, so PHI is never sent to a third-party service during processing. The challenge is achieving sufficient accuracy to satisfy compliance requirements without stripping out clinically relevant content.

# phi_deidentifier.py
import re
from dataclasses import dataclass
from enum import Enum
from typing import Optional

class PHICategory(Enum):
    """Categories of protected health information (PHI) used to tag redactions.

    Each member's value is a lower-case, plural label for the category.
    """
    NAME = "names"
    DATE = "dates"
    AGE = "ages"
    # No regex patterns are registered for LOCATION in
    # LocalPHIDeidentifier.PATTERNS; only a replacement tag exists for it.
    LOCATION = "locations"
    CONTACT = "contact_info"
    # NOTE: mixed-case member name; kept as-is because other code refers to
    # it as ``PHICategory.IDs``.
    IDs = "identifiers"
    # No regex patterns are registered for PROFESSION in
    # LocalPHIDeidentifier.PATTERNS; only a replacement tag exists for it.
    PROFESSION = "professions"
    EMAIL = "email_addresses"

@dataclass
class PHIAnnotation:
    """Record of one span of text that was replaced during de-identification.

    NOTE: ``original_text`` holds the matched text verbatim, so a list of
    annotations contains the very PHI that was removed from the output and
    must be handled with the same care as the source document.
    """
    category: PHICategory  # PHI category whose pattern produced the match
    start_pos: int  # offset of the first matched character in the scanned text
    end_pos: int  # offset one past the last matched character (exclusive)
    original_text: str  # the matched text before it was replaced
    replacement: str  # placeholder tag substituted for the matched text

class LocalPHIDeidentifier:
    """De-identify clinical text using local LLM with regex fallback.

    The regex pass scans the input once with every pattern in ``PATTERNS``,
    resolves overlapping hits, and replaces each accepted span with a
    ``[REDACTED-...]`` tag.  The cleaned text is then handed to the local
    model client for a contextual second pass.

    Args:
        ollama_client: Client object exposing ``generate(prompt)``; it is
            stored as ``self.ollama`` and called once per ``deidentify`` call
            on non-empty text.
    """

    # Safe Harbor identifiers with regex patterns
    PATTERNS = {
        PHICategory.NAME: [
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Simple names
            r'\bDr\.\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Doctor names
        ],
        PHICategory.DATE: [
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        ],
        PHICategory.AGE: [
            r'\b(?:age|aged|yo|y\.?o\.?)\s*:?\s*\d{1,3}\b',
            r'\b\d{1,3}\s*(?:year|month|day|week)\s*(?:old)?\b',
        ],
        PHICategory.CONTACT: [
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # Phone
            r'\b\d{1,5}\s+\w+\s+(?:Street|St|Avenue|Ave|Road|Rd|Dr)\b',  # Address
        ],
        PHICategory.EMAIL: [
            r'\b[\w.-]+@[\w.-]+\.\w+\b',
        ],
        PHICategory.IDs: [
            r'\b(?:MRN|SSN|SS#|medical record|chart)\s*:?\s*#?\s*[A-Z0-9-]+\b',
            r'\b[A-Z]{2}\d{6,10}\b',  # Common ID patterns
        ],
    }

    # The name patterns recognise a name purely by its capitalisation, so
    # compiling them with re.IGNORECASE would make every pair of adjacent
    # words look like a name.  All other categories stay case-insensitive.
    _CASE_SENSITIVE_CATEGORIES = frozenset({PHICategory.NAME})

    # Placeholder tag per category; built once instead of on every lookup.
    _REPLACEMENTS = {
        PHICategory.NAME: "[REDACTED-NAME]",
        PHICategory.DATE: "[REDACTED-DATE]",
        PHICategory.AGE: "[REDACTED-AGE]",
        PHICategory.LOCATION: "[REDACTED-LOCATION]",
        PHICategory.CONTACT: "[REDACTED-CONTACT]",
        PHICategory.EMAIL: "[REDACTED-EMAIL]",
        PHICategory.IDs: "[REDACTED-ID]",
        PHICategory.PROFESSION: "[REDACTED-PROFESSION]",
    }

    def __init__(self, ollama_client):
        self.ollama = ollama_client
        self.regex_deidentifier = self._build_regex_deidentifier()

    def _build_regex_deidentifier(self) -> list[tuple[PHICategory, re.Pattern]]:
        """Compile every pattern in ``PATTERNS`` once.

        Returns:
            ``(category, compiled_pattern)`` pairs in ``PATTERNS`` declaration
            order.  Patterns are compiled with ``re.IGNORECASE`` unless their
            category is listed in ``_CASE_SENSITIVE_CATEGORIES``.
        """
        compiled = []
        for category, patterns in self.PATTERNS.items():
            flags = 0 if category in self._CASE_SENSITIVE_CATEGORIES else re.IGNORECASE
            compiled.extend((category, re.compile(pattern, flags)) for pattern in patterns)
        return compiled

    def deidentify(self, text: str, method: str = "safe_harbor") -> tuple[str, list[PHIAnnotation]]:
        """Remove PHI from clinical text.

        Args:
            text: Clinical text to clean.
            method: Accepted for interface compatibility; this implementation
                does not consult it.

        Returns:
            ``(clean_text, annotations)``.  ``annotations`` is ordered by
            position, and each ``start_pos``/``end_pos`` indexes into the
            *input* ``text``.  Annotations carry the original PHI and must be
            protected accordingly.
        """
        if not text:
            # Nothing to scan and nothing worth sending to the model.
            return text, []

        # Primary: regex-based identifier removal.
        # Every pattern is matched against the unmodified input so that all
        # offsets share one coordinate system; the text is rebuilt afterwards
        # in a single pass instead of being edited while it is being scanned.
        candidates = []
        for order, (category, pattern) in enumerate(self.regex_deidentifier):
            for match in pattern.finditer(text):
                if match.end() > match.start():  # ignore zero-width hits
                    candidates.append(
                        (match.start(), match.end(), order, category, match.group())
                    )

        # Earliest start wins; at the same start the longest span wins
        # (e.g. "Dr. Jane Doe" over "Jane Doe"); pattern order breaks ties.
        candidates.sort(key=lambda c: (c[0], -c[1], c[2]))

        annotations = []
        pieces = []
        cursor = 0  # end of the last accepted span in the input text
        for start, end, _order, category, original in candidates:
            if start < cursor:
                continue  # overlaps a span that has already been redacted
            replacement = self._get_replacement(category)
            annotations.append(
                PHIAnnotation(
                    category=category,
                    start_pos=start,
                    end_pos=end,
                    original_text=original,
                    replacement=replacement,
                )
            )
            pieces.append(text[cursor:start])
            pieces.append(replacement)
            cursor = end
        pieces.append(text[cursor:])
        clean_text = "".join(pieces)

        # Secondary: LLM-based contextual de-identification
        # Use local model for remaining PHI that regex might miss
        prompt = f"""Identify and replace any remaining PHI in this text.
        Replace with [REDACTED-<CATEGORY>] where <CATEGORY> is the upper-case PHI type.
        Only modify actual PHI, preserve clinical content.
        
        Text: {clean_text}"""

        # NOTE(review): the model response is not applied yet. Its return
        # type is not visible from this class, so parsing the redaction
        # markers and merging them into clean_text/annotations is still TODO.
        self.ollama.generate(prompt)

        return clean_text, annotations

    def _get_replacement(self, category: PHICategory) -> str:
        """Return the placeholder tag for ``category`` (``"[REDACTED]"`` if unknown)."""
        return self._REPLACEMENTS.get(category, "[REDACTED]")

A critical failure mode: context-dependent identifiers escape simple regex patterns. "The patient reported her husband experienced similar symptoms" contains no literal name, yet "her husband" is a relational reference to a family member that a name regex won't catch. LLM-based secondary processing catches these cases, but requires careful prompt engineering to avoid over-redaction that removes clinical content.