18. Medical Document Assistant Project
This chapter integrates previous concepts into a complete project: building a medical document assistant that processes clinical documents, extracts structured information, drafts clinical notes, and ensures HIPAA compliance throughout.
The system architecture separates concerns: document ingestion, PHI de-identification, clinical extraction, note drafting, and audit logging. Each component can be tested independently and scaled according to workload.
# medical_document_assistant/
# ├── config.py
# ├── models/
# │   ├── __init__.py
# │   ├── document_processor.py
# │   ├── phi_deidentifier.py
# │   ├── clinical_extractor.py
# │   ├── note_drafter.py
# │   └── audit_logger.py
# ├── api/
# │   ├── __init__.py
# │   ├── routes.py
# │   └── middleware.py
# └── tests/
#     ├── test_processor.py
#     ├── test_deidentifier.py
#     └── test_integration.py
# config.py
from dataclasses import dataclass, field
from typing import List
@dataclass
class AppConfig:
    """Application-wide settings for the medical document assistant.

    Every field has a default, so ``AppConfig()`` yields a usable
    configuration and individual values can be overridden by keyword.

    The two list-valued fields are built with ``default_factory``: a bare
    list literal as a dataclass default is rejected with ``ValueError`` when
    the class is defined, and a factory also gives each instance its own
    list, so mutating one config never affects another.
    """

    # Model server endpoint and model names
    ollama_base_url: str = "http://localhost:11434"
    default_model: str = "llama3.2"
    vision_model: str = "llama3.2-vision"

    # HIPAA compliance settings
    audit_log_path: str = "/var/log/healthcare-ai/audit.log"
    phi_retention_days: int = 0  # Don't retain PHI beyond processing

    # Processing settings
    max_concurrent_documents: int = 4
    deidentification_confidence_threshold: float = 0.95
    note_draft_review_required: bool = True

    # Allowed file types
    allowed_extensions: List[str] = field(
        default_factory=lambda: [".txt", ".pdf", ".docx"]
    )

    # Redaction categories
    redaction_categories: List[str] = field(
        default_factory=lambda: [
            "names", "dates", "ages", "locations",
            "contact_info", "identifiers", "professions",
        ]
    )
# models/phi_deidentifier.py
from typing import List, Tuple
import re
class PHIDeidentifier:
    """Regex-based removal of PHI from clinical text.

    Only the categories listed in ``PATTERNS`` are handled. Each match is
    replaced by a bracketed placeholder such as ``[NAMES]`` or ``[DATES]``.

    NOTE(review): the patterns are heuristics. The name patterns match any
    two adjacent capitalised words (so a title-cased heading is redacted
    too), and the keyword-prefixed identifier pattern is matched
    case-insensitively, so it also accepts a lowercase word after the
    keyword. Validate against representative documents before relying on
    this for real PHI.
    """

    PATTERNS = {
        "names": [
            r'\b[A-Z][a-z]+\s+[A-Z][a-z]+\b',
            r'\bDr\.\s+[A-Z][a-z]+\b',
        ],
        "dates": [
            r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b',
        ],
        "ages": [
            r'\b(?:age|yo|y\.o\.)\s*:?\s*\d{1,3}\b',
        ],
        "identifiers": [
            r'\b(?:MRN|SSN|medical record|chart)\s*:?\s*#?\s*[A-Z0-9-]+\b',
            r'\b[A-Z]{2}\d{6,10}\b',
        ],
        "contact_info": [
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            r'\b[\w.-]+@[\w.-]+\.\w+\b',
        ]
    }

    # Categories whose patterns depend on capitalisation. Matching these with
    # re.IGNORECASE would turn "[A-Z][a-z]+ [A-Z][a-z]+" into "any two words"
    # and redact ordinary prose.
    CASE_SENSITIVE_CATEGORIES = frozenset({"names"})

    def __init__(self, llm_client=None):
        """Store an optional LLM client.

        The client is kept on ``self.llm`` but is not used by the regex
        pass implemented in this class.
        """
        self.llm = llm_client

    def deidentify(self, text: str, method: str = "safe_harbor") -> Tuple[str, List[dict]]:
        """Remove PHI from text, return cleaned text and annotations.

        Patterns are applied one after another, category by category, each
        to the output of the previous one.

        Args:
            text: The document text to clean.
            method: Accepted for interface compatibility; the current
                implementation does not branch on it.

        Returns:
            A ``(cleaned_text, annotations)`` tuple. Each annotation is a
            dict with ``category``, ``start``, ``end``, ``original`` and
            ``replacement``. ``start``/``end`` are offsets into the text as
            it stood when that pattern was applied (after the replacements
            made by earlier patterns), not necessarily into the original
            input.
        """
        annotations: List[dict] = []
        cleaned = text
        for category, patterns in self.PATTERNS.items():
            flags = 0 if category in self.CASE_SENSITIVE_CATEGORIES else re.IGNORECASE
            redact = self._make_redactor(category, annotations)
            for pattern in patterns:
                # re.sub rebuilds the string in a single pass, so every match
                # offset is valid for the text being scanned. Splicing the
                # placeholder into the string while iterating finditer()
                # would shift later offsets and corrupt the output.
                cleaned = re.sub(pattern, redact, cleaned, flags=flags)
        return cleaned, annotations

    @staticmethod
    def _make_redactor(category: str, annotations: List[dict]):
        """Return a ``re.sub`` callback that records each match and yields the placeholder."""
        replacement = f"[{category.upper()}]"

        def _redact(match):
            annotations.append({
                "category": category,
                "start": match.start(),
                "end": match.end(),
                "original": match.group(),
                "replacement": replacement,
            })
            return replacement

        return _redact
# models/clinical_extractor.py
import json
class ClinicalExtractor:
    """Turn a clinical document into a structured dict by prompting an LLM.

    The client passed to the constructor only needs a ``generate(prompt)``
    method whose return value can be handed to ``json.loads``.
    """

    # Field name -> informal type description; serialised into the prompt.
    EXTRACTION_SCHEMA = {
        "chief_complaint": "str",
        "history_of_present_illness": "str",
        "diagnoses": "list[str]",
        "medications": "list[dict{name, dosage, frequency}]",
        "allergies": "list[str]",
        "procedures_performed": "list[str]",
        "plan": "str"
    }

    def __init__(self, llm_client):
        """Keep a reference to the LLM client used by ``extract``."""
        self.llm = llm_client

    def extract(self, document: str) -> dict:
        """Ask the LLM for schema-shaped JSON describing ``document``.

        Returns:
            The decoded JSON on success. If the reply is not valid JSON, a
            dict with an ``error`` message and the untouched reply under
            ``raw_response``.
        """
        schema_json = json.dumps(self.EXTRACTION_SCHEMA, indent=2)
        prompt = f"""Extract structured clinical information from this document.
Follow this schema exactly:
{schema_json}
Document:
{document}
Return valid JSON matching the schema.
Use empty arrays/strings for missing fields.
Do not infer or guess - only extract explicit information."""
        reply = self.llm.generate(prompt)
        try:
            parsed = json.loads(reply)
        except json.JSONDecodeError:
            # Hand the raw reply back so the caller can inspect or retry.
            return {"error": "Failed to parse extraction", "raw_response": reply}
        return parsed
# models/note_drafter.py
class ClinicalNoteDrafter:
    """Produce a draft SOAP note from already-extracted clinical data.

    The client passed to the constructor only needs a ``generate(prompt)``
    method returning the note as text.
    """

    # Section names; the upper-cased first letter plus ":" is the header
    # the parser looks for at the start of a line.
    SOAP_SECTIONS = ["subjective", "objective", "assessment", "plan"]

    def __init__(self, llm_client):
        """Keep a reference to the LLM client used for drafting."""
        self.llm = llm_client

    def draft_soap_note(self, extracted_data: dict) -> dict:
        """Prompt the LLM for a SOAP note and return it split by section."""
        data_json = json.dumps(extracted_data, indent=2)
        prompt = f"""Generate a SOAP note from this clinical data.
Extracted Data:
{data_json}
Format output as:
S: [Subjective - patient complaints, history]
O: [Objective - findings, vitals, exam results]
A: [Assessment - diagnoses, clinical impressions]
P: [Plan - treatment plan, follow-up, referrals]
Use professional clinical language.
Do not include any information not supported by the extracted data.
Include [DRAFT] marker at start of note."""
        return self._parse_soap_response(self.llm.generate(prompt))

    def _parse_soap_response(self, response: str) -> dict:
        """Split a SOAP-formatted reply into a section-name -> text dict.

        A line beginning with ``S:``, ``O:``, ``A:`` or ``P:`` opens the
        matching section; the remainder of that line and every following
        non-blank line belong to it until the next header. Text that
        appears before the first header is discarded.
        """
        collected = {name: [] for name in self.SOAP_SECTIONS}
        active = None
        for raw_line in response.split("\n"):
            text = raw_line.strip()
            header = next(
                (
                    name for name in self.SOAP_SECTIONS
                    if text.startswith(f"{name[0].upper()}:")
                ),
                None,
            )
            if header is not None:
                active = header
                # Drop the two-character "X:" prefix, keep any inline content.
                text = text[2:].strip()
            if active and text:
                collected[active].append(text)
        return {name: "\n".join(lines) for name, lines in collected.items()}
# models/audit_logger.py
import logging
from datetime import datetime
from pathlib import Path
class PHIAuditLogger:
    """Append-only audit trail for document processing events.

    Events are written through the shared ``"phi_audit"`` logger as
    pipe-delimited records, one per line, prefixed with timestamp and level.
    """

    def __init__(self, log_path: str):
        """Create the log directory if needed and attach a file handler.

        ``logging.getLogger("phi_audit")`` returns the same logger object on
        every call, so a handler is attached only if none already writes to
        this file. Without that check each additional instance would add
        another handler and every event would be written once per instance.

        NOTE(review): because the logger name is fixed, instances created
        with different paths share one logger and each event goes to every
        attached file.

        Args:
            log_path: Path of the audit log file; parent directories are
                created when missing.
        """
        # Local import: only needed to compare against FileHandler.baseFilename,
        # which the logging module stores as os.path.abspath(filename).
        import os

        self.log_path = Path(log_path)
        self.log_path.parent.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger("phi_audit")
        self.logger.setLevel(logging.INFO)

        target = os.path.abspath(self.log_path)
        already_attached = any(
            isinstance(existing, logging.FileHandler)
            and existing.baseFilename == target
            for existing in self.logger.handlers
        )
        if not already_attached:
            handler = logging.FileHandler(self.log_path)
            handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(levelname)s - %(message)s'
            ))
            self.logger.addHandler(handler)

    def log_document_processed(self, document_id: str,
                               processing_time: float,
                               deidentification_applied: bool):
        """Log document processing event.

        Record: ``DOCUMENT_PROCESSED|<document_id>|<processing_time>|deidentification=<bool>``
        """
        self.logger.info(f"DOCUMENT_PROCESSED|{document_id}|{processing_time}|deidentification={deidentification_applied}")

    def log_phi_encountered(self, document_id: str,
                            phi_categories: list):
        """Log PHI categories encountered in document.

        Only the category names are written, joined by commas.
        Record: ``PHI_ENCOUNTERED|<document_id>|<cat1,cat2,...>``
        """
        self.logger.info(f"PHI_ENCOUNTERED|{document_id}|{','.join(phi_categories)}")

    def log_note_drafted(self, document_id: str,
                         draft_id: str, requires_review: bool):
        """Log note drafting event.

        Record: ``NOTE_DRAFTED|<document_id>|<draft_id>|review_required=<bool>``
        """
        self.logger.info(f"NOTE_DRAFTED|{document_id}|{draft_id}|review_required={requires_review}")

    def log_access(self, user_id: str, document_id: str, action: str):
        """Log document access.

        Record: ``ACCESS|<user_id>|<document_id>|<action>``
        """
        self.logger.info(f"ACCESS|{user_id}|{document_id}|{action}")
The complete system requires integration testing, security hardening, and clinical validation before production deployment. This chapter provides the foundation; production deployment requires additional work on error handling, monitoring, and incident response.
Implement the complete medical document assistant as a working system. Deploy locally, process 20 test documents (synthesized or de-identified real documents), and verify: (1) PHI is correctly identified and removed, (2) extraction produces valid JSON matching the schema, (3) SOAP notes are professionally formatted, (4) audit logs capture all operations.