04. De-identification of PHI
De-identification permits healthcare data to be used outside direct care contexts without triggering full PHI protections. HIPAA recognizes two de-identification approaches: the Safe Harbor method, which removes 18 specified identifiers, and the Expert Determination method, in which a qualified expert applies statistical or scientific methods to show that the risk of re-identifying an individual is very small.
Local LLMs are well suited to automated de-identification because they process data on-premises, so PHI is never sent to an external service during processing. The challenge is achieving sufficient accuracy to satisfy compliance requirements without stripping out clinically relevant content.
# phi_deidentifier.py
import re
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class PHICategory(Enum):
    """Kinds of protected health information (PHI) that can be redacted.

    Each member's value is a lowercase string label for the category.
    """

    # Person-related identifiers
    NAME = "names"
    DATE = "dates"
    AGE = "ages"

    # Where the person is and how to reach them
    LOCATION = "locations"
    CONTACT = "contact_info"

    # Record numbers and similar; the mixed-case member name is kept as-is
    # because code elsewhere in this file refers to ``PHICategory.IDs``.
    IDs = "identifiers"

    PROFESSION = "professions"
    EMAIL = "email_addresses"
@dataclass
class PHIAnnotation:
    """Record of one detected PHI span and the text substituted for it."""

    # Which kind of PHI the span was classified as.
    category: PHICategory
    # Offset of the span's first character in the scanned text.
    start_pos: int
    # Offset just past the span's last character (exclusive end).
    end_pos: int
    # The exact text that was matched.
    original_text: str
    # The placeholder that stands in for the matched text.
    replacement: str
class LocalPHIDeidentifier:
    """De-identify clinical text using local LLM with regex fallback.

    Regex patterns for several identifier types are applied first. When a
    client was supplied, the regex-cleaned text is then sent to the local
    model for a second, contextual pass.
    """

    # Safe Harbor identifiers with regex patterns
    PATTERNS = {
        PHICategory.NAME: [
            # The titled form comes first so the match also covers "Dr.";
            # in the opposite order the plain-name pattern claims the name
            # and this pattern can never match.
            r'\bDr\.\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Doctor names
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',  # Simple names
        ],
        PHICategory.DATE: [
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        ],
        PHICategory.AGE: [
            r'\b(?:age|aged|yo|y\.?o\.?)\s*:?\s*\d{1,3}\b',
            # Allows a plural unit and hyphen/space separators so that
            # "45 years old" and "45-year-old" are both covered.
            r'\b\d{1,3}[\s-]*(?:year|month|day|week)s?(?:[\s-]*old)?\b',
        ],
        PHICategory.CONTACT: [
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # Phone
            r'\b\d{1,5}\s+\w+\s+(?:Street|St|Avenue|Ave|Road|Rd|Dr)\b',  # Address
        ],
        PHICategory.EMAIL: [
            r'\b[\w.-]+@[\w.-]+\.\w+\b',
        ],
        PHICategory.IDs: [
            r'\b(?:MRN|SSN|SS#|medical record|chart)\s*:?\s*#?\s*[A-Z0-9-]+\b',
            r'\b[A-Z]{2}\d{6,10}\b',  # Common ID patterns
        ],
    }

    # Categories whose patterns depend on capitalisation. Compiling them with
    # re.IGNORECASE would turn the name patterns into "any two adjacent
    # words" and redact ordinary clinical prose.
    _CASE_SENSITIVE = frozenset({PHICategory.NAME})

    def __init__(self, ollama_client):
        """Store the LLM client and pre-compile the regex patterns.

        Args:
            ollama_client: Object exposing ``generate(prompt)``. May be
                ``None``, in which case only the regex pass is run.
        """
        self.ollama = ollama_client
        self.regex_deidentifier = self._build_regex_deidentifier()

    def _build_regex_deidentifier(self) -> dict[PHICategory, list[re.Pattern]]:
        """Compile every entry of ``PATTERNS`` once, keyed by category."""
        compiled = {}
        for category, patterns in self.PATTERNS.items():
            flags = 0 if category in self._CASE_SENSITIVE else re.IGNORECASE
            compiled[category] = [re.compile(pattern, flags) for pattern in patterns]
        return compiled

    def deidentify(self, text: str, method: str = "safe_harbor") -> tuple[str, list[PHIAnnotation]]:
        """Remove PHI from clinical text.

        Args:
            text: The clinical text to clean.
            method: Accepted for interface compatibility; the current
                implementation does not consult it.

        Returns:
            A ``(clean_text, annotations)`` tuple. ``annotations`` lists the
            regex-detected spans sorted by ``start_pos``; offsets refer to
            ``text`` as passed in, so ``text[a.start_pos:a.end_pos]`` equals
            ``a.original_text``. Changes made by the LLM pass are not
            annotated.
        """
        # Primary: regex-based identifier removal
        annotations = self._find_phi_spans(text)

        # All spans are located on the untouched input and spliced in a
        # single left-to-right pass. Rewriting the string while still
        # iterating over matches would leave later offsets stale as soon as
        # a replacement differs in length from the text it replaces.
        pieces = []
        cursor = 0
        for annotation in annotations:
            pieces.append(text[cursor:annotation.start_pos])
            pieces.append(annotation.replacement)
            cursor = annotation.end_pos
        pieces.append(text[cursor:])
        clean_text = "".join(pieces)

        # Secondary: LLM-based contextual de-identification
        clean_text = self._llm_redact(clean_text)
        return clean_text, annotations

    def _find_phi_spans(self, text: str) -> list[PHIAnnotation]:
        """Return non-overlapping regex matches in ``text``, ordered by position."""
        claimed = []
        for category, compiled_patterns in self.regex_deidentifier.items():
            replacement = self._get_replacement(category)
            for pattern in compiled_patterns:
                for match in pattern.finditer(text):
                    start, end = match.span()
                    if start == end:
                        continue
                    # Earlier categories/patterns take priority: a span that
                    # overlaps one already claimed is dropped.
                    if any(start < a.end_pos and a.start_pos < end for a in claimed):
                        continue
                    claimed.append(
                        PHIAnnotation(
                            category=category,
                            start_pos=start,
                            end_pos=end,
                            original_text=match.group(),
                            replacement=replacement,
                        )
                    )
        claimed.sort(key=lambda a: a.start_pos)
        return claimed

    def _llm_redact(self, clean_text: str) -> str:
        """Ask the local model to redact PHI the regex pass missed."""
        if self.ollama is None or not clean_text.strip():
            return clean_text

        # The placeholder format is spelled out as literal text. Interpolating
        # it in an f-string would evaluate an expression on a leftover loop
        # variable instead of showing the format to the model.
        prompt = (
            "Identify and replace any remaining PHI in this text.\n"
            "Replace with [REDACTED-CATEGORY] where CATEGORY is the type.\n"
            "Only modify actual PHI, preserve clinical content.\n"
            "Return only the revised text, with no commentary.\n"
            f"Text: {clean_text}"
        )
        result = self.ollama.generate(prompt)

        # NOTE(review): the return type of ollama_client.generate() is not
        # visible in this file. Only a plain, non-empty string reply is
        # applied; any other shape leaves the regex output untouched.
        # Confirm against the client wrapper and extend if it returns a
        # structured response.
        if isinstance(result, str) and result.strip():
            return result.strip()
        return clean_text

    def _get_replacement(self, category: PHICategory) -> str:
        """Return the placeholder for ``category``, or ``"[REDACTED]"`` if unmapped."""
        replacements = {
            PHICategory.NAME: "[REDACTED-NAME]",
            PHICategory.DATE: "[REDACTED-DATE]",
            PHICategory.AGE: "[REDACTED-AGE]",
            PHICategory.LOCATION: "[REDACTED-LOCATION]",
            PHICategory.CONTACT: "[REDACTED-CONTACT]",
            PHICategory.EMAIL: "[REDACTED-EMAIL]",
            PHICategory.IDs: "[REDACTED-ID]",
            PHICategory.PROFESSION: "[REDACTED-PROFESSION]",
        }
        return replacements.get(category, "[REDACTED]")
A critical failure mode: context-dependent references escape simple regex patterns. "The patient reported her husband experienced similar symptoms" contains no literal name, yet it refers to a relative of the patient—an indirect reference that a name regex won't catch. LLM-based secondary processing can catch these cases, but requires careful prompt engineering to avoid over-redaction that removes clinical content.
Take a sample clinical note and run it through both regex and LLM de-identification. Compare results and identify where each approach fails. Measure false positive and false negative rates manually.