Medical Document Assistant Project — Healthcare AI with Local Models (Chapter 18)

This chapter integrates the concepts from previous chapters into a complete project: building a medical document assistant that processes clinical documents, extracts structured information, drafts clinical notes, and maintains HIPAA compliance throughout.

The system architecture separates concerns: document ingestion, PHI de-identification, clinical extraction, note drafting, and audit logging. Each component can be tested independently and scaled according to workload.

# medical_document_assistant/
# ├── config.py
# ├── models/
# │   ├── __init__.py
# │   ├── document_processor.py
# │   ├── phi_deidentifier.py
# │   ├── clinical_extractor.py
# │   ├── note_drafter.py
# │   └── audit_logger.py
# ├── api/
# │   ├── __init__.py
# │   ├── routes.py
# │   └── middleware.py
# └── tests/
#     ├── test_processor.py
#     ├── test_deidentifier.py
#     └── test_integration.py

# config.py
from dataclasses import dataclass, field
from typing import List

@dataclass
class AppConfig:
    """Application-wide settings for the medical document assistant.

    List-valued fields are declared with ``field(default_factory=...)`` so
    that every instance receives its own list. A bare list default is
    rejected by ``dataclass`` with ``ValueError`` when the class is defined.
    """

    ollama_base_url: str = "http://localhost:11434"
    default_model: str = "llama3.2"
    vision_model: str = "llama3.2-vision"

    # HIPAA compliance settings
    audit_log_path: str = "/var/log/healthcare-ai/audit.log"
    phi_retention_days: int = 0  # Don't retain PHI beyond processing

    # Processing settings
    max_concurrent_documents: int = 4
    deidentification_confidence_threshold: float = 0.95
    note_draft_review_required: bool = True

    # Allowed file types
    allowed_extensions: List[str] = field(
        default_factory=lambda: [".txt", ".pdf", ".docx"]
    )

    # Redaction categories
    redaction_categories: List[str] = field(
        default_factory=lambda: [
            "names", "dates", "ages", "locations",
            "contact_info", "identifiers", "professions",
        ]
    )

# models/phi_deidentifier.py
from typing import List, Tuple
import re

class PHIDeidentifier:
    """De-identify PHI from clinical documents using regular expressions.

    ``PATTERNS`` maps a redaction category to the regexes that detect it.
    Every detected span is replaced by a ``[CATEGORY]`` placeholder.
    """

    PATTERNS = {
        "names": [
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',
            r'\bDr\.\s+[A-Z][a-z]+\b',
        ],
        "dates": [
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b',
        ],
        "ages": [
            r'\b(?:age|yo|y\.o\.)\s*:?\s*\d{1,3}\b',
        ],
        "identifiers": [
            r'\b(?:MRN|SSN|medical record|chart)\s*:?\s*#?\s*[A-Z0-9-]+\b',
            r'\b[A-Z]{2}\d{6,10}\b',
        ],
        "contact_info": [
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            r'\b[\w.-]+@[\w.-]+\.\w+\b',
        ]
    }

    # The name patterns rely on capitalisation ("John Smith"); matching them
    # case-insensitively would flag every pair of adjacent words as a name.
    CASE_SENSITIVE_CATEGORIES = frozenset({"names"})

    def __init__(self, llm_client=None):
        """Store an optional LLM client (not used by the regex-based pass)."""
        self.llm = llm_client

    def deidentify(self, text: str, method: str = "safe_harbor") -> Tuple[str, List[dict]]:
        """Remove PHI from text, return cleaned text and annotations.

        All patterns are matched against the *original* ``text``; overlapping
        matches are merged into a single span so that no part of a detected
        item is left behind, and the cleaned text is assembled in one pass.

        Args:
            text: Document text to scrub.
            method: Accepted for interface compatibility; the regex pass
                does not currently vary by method.

        Returns:
            A ``(cleaned_text, annotations)`` tuple. Annotations are ordered
            by position; each is a dict with ``category``, ``start`` and
            ``end`` (offsets into the original ``text``), ``original`` (the
            redacted substring) and ``replacement`` (the placeholder). A
            merged span carries the category of its earliest-starting match.
        """
        spans = []
        for category, patterns in self.PATTERNS.items():
            flags = 0 if category in self.CASE_SENSITIVE_CATEGORIES else re.IGNORECASE
            for pattern in patterns:
                for match in re.finditer(pattern, text, flags):
                    spans.append((match.start(), match.end(), category))

        # Earliest start first; on ties the longest match wins. The sort is
        # stable, so remaining ties keep PATTERNS declaration order.
        spans.sort(key=lambda span: (span[0], -span[1]))

        # Merge overlapping spans rather than dropping one of them: dropping
        # could leave the non-overlapping tail of a PHI item in the output.
        merged = []
        for start, end, category in spans:
            if merged and start < merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end, category])

        annotations = []
        pieces = []
        cursor = 0
        for start, end, category in merged:
            replacement = f"[{category.upper()}]"
            pieces.append(text[cursor:start])
            pieces.append(replacement)
            annotations.append({
                "category": category,
                "start": start,
                "end": end,
                "original": text[start:end],
                "replacement": replacement,
            })
            cursor = end
        pieces.append(text[cursor:])

        return "".join(pieces), annotations

# models/clinical_extractor.py
import json

class ClinicalExtractor:
    """Extract structured clinical data from documents."""

    # Field name -> informal type description; serialised into the prompt.
    EXTRACTION_SCHEMA = {
        "chief_complaint": "str",
        "history_of_present_illness": "str",
        "diagnoses": "list[str]",
        "medications": "list[dict{name, dosage, frequency}]",
        "allergies": "list[str]",
        "procedures_performed": "list[str]",
        "plan": "str"
    }

    def __init__(self, llm_client):
        """Store the client; it must expose ``generate(prompt)``."""
        self.llm = llm_client

    @staticmethod
    def _strip_code_fence(response: str) -> str:
        """Return ``response`` without a surrounding Markdown code fence."""
        body = response.strip()
        if body.startswith("```"):
            # Drop the opening fence line, including an optional language tag.
            newline = body.find("\n")
            body = body[newline + 1:] if newline != -1 else body[3:]
            body = body.rstrip()
            if body.endswith("```"):
                body = body[:-3]
        return body.strip()

    def extract(self, document: str) -> dict:
        """Extract structured clinical data from document.

        Args:
            document: Text of the clinical document to analyse.

        Returns:
            The parsed JSON object produced by the model, or a dict with
            ``error`` and ``raw_response`` keys when the reply is not a
            string, is not valid JSON, or is valid JSON that is not an
            object (so callers always receive a dict).
        """

        prompt = f"""Extract structured clinical information from this document.
        Follow this schema exactly:
        {json.dumps(self.EXTRACTION_SCHEMA, indent=2)}
        
        Document:
        {document}
        
        Return valid JSON matching the schema.
        Use empty arrays/strings for missing fields.
        Do not infer or guess - only extract explicit information."""

        response = self.llm.generate(prompt)
        failure = {"error": "Failed to parse extraction", "raw_response": response}

        if not isinstance(response, str):
            return failure

        try:
            parsed = json.loads(self._strip_code_fence(response))
        except json.JSONDecodeError:
            return failure

        # json.loads also accepts arrays and scalars; only an object matches
        # the declared return type.
        return parsed if isinstance(parsed, dict) else failure

# models/note_drafter.py

class ClinicalNoteDrafter:
    """Turn extracted clinical data into a draft SOAP note via an LLM."""

    SOAP_SECTIONS = ["subjective", "objective", "assessment", "plan"]

    def __init__(self, llm_client):
        """Keep the client whose ``generate(prompt)`` produces the note text."""
        self.llm = llm_client

    def draft_soap_note(self, extracted_data: dict) -> dict:
        """Ask the model for a SOAP note and return it split by section.

        Args:
            extracted_data: JSON-serialisable clinical data to embed in the
                prompt.

        Returns:
            Dict keyed by the names in ``SOAP_SECTIONS``; sections the reply
            does not contain map to an empty string.
        """
        data_json = json.dumps(extracted_data, indent=2)

        prompt = f"""Generate a SOAP note from this clinical data.
        
        Extracted Data:
        {data_json}
        
        Format output as:
        S: [Subjective - patient complaints, history]
        O: [Objective - findings, vitals, exam results]
        A: [Assessment - diagnoses, clinical impressions]
        P: [Plan - treatment plan, follow-up, referrals]
        
        Use professional clinical language.
        Do not include any information not supported by the extracted data.
        Include [DRAFT] marker at start of note."""

        return self._parse_soap_response(self.llm.generate(prompt))

    def _parse_soap_response(self, response: str) -> dict:
        """Split a reply into SOAP sections keyed by section name.

        A line starting with ``S:``, ``O:``, ``A:`` or ``P:`` opens that
        section; following non-empty lines belong to it until the next
        header. Lines before the first header are discarded.
        """
        collected = {name: [] for name in self.SOAP_SECTIONS}
        active = None

        for raw_line in response.split("\n"):
            text = raw_line.strip()

            # First section (in declared order) whose "X:" prefix opens the line.
            header = next(
                (
                    name
                    for name in self.SOAP_SECTIONS
                    if text.startswith(f"{name[0].upper()}:")
                ),
                None,
            )
            if header is not None:
                active = header
                text = text[2:].strip()

            if active and text:
                collected[active].append(text)

        return {name: "\n".join(parts) for name, parts in collected.items()}

# models/audit_logger.py
import logging
from datetime import datetime
from pathlib import Path

class PHIAuditLogger:
    """Append pipe-delimited audit records to a log file.

    All instances share the process-wide ``"phi_audit"`` logger, so a file
    handler is attached only if that logger does not already write to the
    same file. Without this, each new instance would duplicate every record.
    """

    def __init__(self, log_path: str):
        """Create the log directory if needed and attach the file handler.

        Args:
            log_path: Path of the audit log file; parent directories are
                created when missing.
        """
        import os  # local import keeps this block self-contained

        self.log_path = Path(log_path)
        self.log_path.parent.mkdir(parents=True, exist_ok=True)

        self.logger = logging.getLogger("phi_audit")
        self.logger.setLevel(logging.INFO)

        # FileHandler stores its target as an absolute path in baseFilename.
        target = os.path.abspath(os.fspath(self.log_path))
        already_attached = any(
            isinstance(existing, logging.FileHandler)
            and existing.baseFilename == target
            for existing in self.logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(self.log_path)
            handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s'
            ))
            self.logger.addHandler(handler)

    @staticmethod
    def _clean(value) -> str:
        """Render ``value`` as text with CR/LF escaped.

        A raw line break inside a field would let a caller-supplied value
        start a forged audit record on a new line.
        """
        return str(value).replace("\r", "\\r").replace("\n", "\\n")

    def log_document_processed(self, document_id: str,
                                processing_time: float,
                                deidentification_applied: bool):
        """Log document processing event."""
        self.logger.info(
            "DOCUMENT_PROCESSED|%s|%s|deidentification=%s",
            self._clean(document_id), processing_time, deidentification_applied,
        )

    def log_phi_encountered(self, document_id: str,
                             phi_categories: list):
        """Log PHI categories encountered in document."""
        categories = ",".join(self._clean(category) for category in phi_categories)
        self.logger.info(
            "PHI_ENCOUNTERED|%s|%s", self._clean(document_id), categories
        )

    def log_note_drafted(self, document_id: str,
                         draft_id: str, requires_review: bool):
        """Log note drafting event."""
        self.logger.info(
            "NOTE_DRAFTED|%s|%s|review_required=%s",
            self._clean(document_id), self._clean(draft_id), requires_review,
        )

    def log_access(self, user_id: str, document_id: str, action: str):
        """Log document access."""
        self.logger.info(
            "ACCESS|%s|%s|%s",
            self._clean(user_id), self._clean(document_id), self._clean(action),
        )

The complete system requires integration testing, security hardening, and clinical validation before production deployment. This chapter provides the foundation; a production rollout additionally needs work on error handling, monitoring, and incident response.