16. Privacy-Preserving AI
Beyond local deployment, additional privacy-preserving techniques offer stronger guarantees for sensitive healthcare data. These include federated learning, differential privacy, and secure multiparty computation—each providing different trade-offs between privacy protection and model utility.
Local models provide a foundation, but sophisticated attacks can still extract training data or model inputs from outputs. Privacy-preserving techniques address these residual risks.
# privacy_preserving.py
import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass
@dataclass
class PrivacyBudget:
    """Record of a differential-privacy allowance.

    Plain mutable value object: it stores the three numbers below and
    performs no validation or bookkeeping of its own.
    """

    # Privacy-loss parameter (epsilon) of the allowance.
    epsilon: float
    # Failure-probability parameter (delta) of the allowance.
    delta: float
    # Number of queries still permitted; callers update this themselves.
    remaining_queries: int
class DifferentialPrivacy:
    """Add Laplace noise to numeric query results.

    Noise is drawn from a Laplace distribution centred on zero with scale
    ``sensitivity / epsilon``. ``delta`` is stored for reporting but is not
    read by any method of this class.

    NOTE: every value passed through :meth:`add_noise` is noised
    independently with the full ``epsilon``. Releasing several values about
    the same individuals therefore spends the budget once per value; this
    class does not track cumulative spend.
    """

    def __init__(self, epsilon: float = 1.0, delta: float = 1e-5,
                 sensitivity: float = 1.0):
        """Configure the mechanism.

        Args:
            epsilon: Privacy-loss parameter; must be strictly positive.
            delta: Failure probability; must satisfy ``0 <= delta < 1``.
            sensitivity: Maximum change any single record can cause in a
                released value; must be non-negative.

        Raises:
            ValueError: If any argument is outside its valid range (NaN is
                rejected as well, because the comparisons below are False
                for NaN).
        """
        # Fail early: epsilon == 0 would otherwise surface later as a
        # ZeroDivisionError inside add_noise, and a negative epsilon as a
        # NumPy "scale < 0" error far from the misconfiguration.
        if not epsilon > 0:
            raise ValueError(f"epsilon must be positive, got {epsilon!r}")
        if not 0 <= delta < 1:
            raise ValueError(f"delta must be in [0, 1), got {delta!r}")
        if not sensitivity >= 0:
            raise ValueError(
                f"sensitivity must be non-negative, got {sensitivity!r}"
            )
        self.epsilon = epsilon
        self.delta = delta
        self.sensitivity = sensitivity  # Maximum change any record can cause

    def add_noise(self, value: float) -> float:
        """Return ``value`` plus one Laplace(0, sensitivity / epsilon) draw."""
        scale = self.sensitivity / self.epsilon
        noise = np.random.laplace(0, scale)
        return value + noise

    def add_noise_to_array(self, values: List[float]) -> List[float]:
        """Return a new list with independent noise added to each element."""
        return [self.add_noise(v) for v in values]

    def private_count(self, counts: Dict[str, int]) -> Dict[str, float]:
        """Return a new dict with independent noise added to each count.

        The results are floats and may be negative or non-integral; no
        rounding or clamping is applied here.
        """
        return {key: self.add_noise(count) for key, count in counts.items()}
class FederatedLearning:
    """Coordinate federated training rounds across a set of clients.

    Each client object is expected to provide ``receive_model(model)`` and
    ``train_local(epochs=...)``; the latter's return value is treated as a
    dict of per-parameter updates. ``aggregation_server`` is stored but not
    used by any method in this class.
    """

    def __init__(self, aggregation_server, local_clients: List):
        self.aggregation_server = aggregation_server
        self.local_clients = local_clients
        # Stays None until a caller assigns a model; clients are sent
        # whatever value is held here at the start of each round.
        self.global_model = None

    def train_round(self, round_number: int) -> Dict:
        """Execute one federated learning round.

        Args:
            round_number: Label echoed back in the returned summary.

        Returns:
            Dict with ``"round"``, ``"clients_participating"`` and
            ``"aggregation_verified"``.

        Raises:
            ValueError: If there are no clients to select from.
        """
        selected_clients = self._select_clients()

        # Distribute the current global model to every selected client
        # before any of them starts training.
        for client in selected_clients:
            client.receive_model(self.global_model)

        # Local training on each client; only the update leaves the client.
        client_updates = [
            client.train_local(epochs=1) for client in selected_clients
        ]

        aggregated_update = self._secure_aggregate(client_updates)
        self.global_model = self._apply_update(aggregated_update)

        return {
            "round": round_number,
            "clients_participating": len(selected_clients),
            # NOTE(review): hard-coded; nothing in this class actually
            # verifies the aggregation. Kept for caller compatibility.
            "aggregation_verified": True
        }

    def _select_clients(self, fraction: float = 0.1) -> List:
        """Select a random subset of clients, without replacement.

        At least one client is always selected, and never more than exist.

        Raises:
            ValueError: If ``self.local_clients`` is empty.
        """
        n_clients = len(self.local_clients)
        if n_clients == 0:
            raise ValueError("cannot select clients: no local clients registered")
        n_select = min(n_clients, max(1, int(n_clients * fraction)))
        # Sample indices rather than the client objects themselves:
        # np.random.choice would otherwise coerce the clients into an array
        # (which breaks for sequence-like clients) and return an ndarray
        # instead of the annotated list.
        indices = np.random.choice(n_clients, n_select, replace=False)
        return [self.local_clients[int(i)] for i in indices]

    def _secure_aggregate(self, updates: List[Dict]) -> Dict:
        """Average client updates key by key.

        Simplified: this is a plain element-wise mean with no cryptographic
        protection; a real implementation uses a secure-aggregation
        protocol. The key set is taken from the first update.

        Returns:
            Dict mapping each key to the mean over clients, or ``{}`` when
            ``updates`` is empty.
        """
        if not updates:
            return {}
        return {
            key: np.mean([u[key] for u in updates], axis=0)
            for key in updates[0].keys()
        }

    def _apply_update(self, gradient: Dict,
                      learning_rate: float = 1.0) -> 'Model':
        """Return the global model after applying the aggregated update.

        For a dict-based model, each parameter named in ``gradient`` becomes
        ``param - learning_rate * gradient[name]``; parameters not named are
        carried over unchanged, and update keys missing from the model are
        ignored. The stored model is not mutated.

        NOTE(review): the subtraction assumes clients return gradients (as
        the naming suggests). If they return weight deltas instead, the sign
        must be flipped - confirm against the client implementation.

        Any other model type (including ``None``) is returned unchanged,
        because this class does not know how to update it. That keeps
        ``train_round`` from overwriting the model with ``None``.
        """
        model = self.global_model
        if not isinstance(model, dict):
            return model
        updated = dict(model)
        for name, grad in gradient.items():
            if name in updated:
                updated[name] = (
                    np.asarray(updated[name]) - learning_rate * np.asarray(grad)
                )
        return updated
class PrivacyPreservingInference:
    """Inference helpers that route results through a privacy engine.

    ``base_model`` is expected to provide ``generate(prompt, temperature=...)``
    and ``embed(text)``; ``privacy_engine`` supplies ``add_noise``,
    ``private_count``, ``epsilon`` and ``delta``.
    """

    def __init__(self, base_model, privacy_engine: "DifferentialPrivacy"):
        self.model = base_model
        self.dp = privacy_engine

    def private_completion(self, prompt: str,
                           temperature: float = 0.7) -> str:
        """Generate a completion for ``prompt``.

        WARNING: no differential privacy is applied here. The model output
        is returned unmodified; true DP for text generation requires more
        sophisticated mechanisms, so this method is a conceptual
        placeholder only.
        """
        raw_response = self.model.generate(prompt, temperature=temperature)
        return raw_response

    def aggregate_patient_statistics(self,
                                     patient_records: List[Dict]) -> Dict:
        """Return noised condition counts and patient total.

        Each patient contributes at most 1 to any condition's count, so a
        condition listed twice in one record is counted once. A missing or
        ``None`` ``"conditions"`` entry is treated as empty.

        NOTE: every released number (each condition count plus the total)
        is noised independently by the privacy engine. The
        ``"privacy_guarantee"`` string reports the engine's per-value
        parameters, not a composed guarantee for the whole release.

        Returns:
            Dict with ``"condition_prevalence"`` (condition -> noised
            count), ``"total_patients"`` (noised record count) and
            ``"privacy_guarantee"`` (human-readable parameter string).
        """
        condition_counts = {}
        for record in patient_records:
            conditions = record.get("conditions") or []
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # one record can change each count by at most 1.
            for condition in dict.fromkeys(conditions):
                condition_counts[condition] = condition_counts.get(condition, 0) + 1

        private_counts = self.dp.private_count(condition_counts)

        return {
            "condition_prevalence": private_counts,
            "total_patients": self.dp.add_noise(len(patient_records)),
            "privacy_guarantee": f"(ε={self.dp.epsilon}, δ={self.dp.delta})"
        }

    def secure_patient_matching(self, query: Dict,
                                patient_db: List[Dict],
                                k: int = 5,
                                noise_scale: float = 0.1) -> List[Dict]:
        """Search for similar patients using a perturbed query embedding.

        The query's ``"description"`` is embedded, zero-mean Gaussian noise
        with standard deviation ``noise_scale`` is added to the embedding,
        and the noisy vector is handed to ``_search_patients``.

        NOTE: this perturbation is heuristic obfuscation. It is not
        k-anonymity and carries no formal privacy guarantee; the results are
        returned without further protection.

        Args:
            query: Dict containing a ``"description"`` entry.
            patient_db: Candidate patient records.
            k: Number of matches requested.
            noise_scale: Standard deviation of the Gaussian perturbation.

        Raises:
            KeyError: If ``query`` has no ``"description"`` key.
        """
        query_embedding = np.asarray(self.model.embed(query["description"]))
        noise = np.random.normal(0, noise_scale, size=query_embedding.shape)
        noisy_query = query_embedding + noise
        return self._search_patients(noisy_query, patient_db, k)

    def _search_patients(self, embedding: np.ndarray,
                         patient_db: List[Dict],
                         k: int) -> List[Dict]:
        """Search for similar patients.

        Placeholder: no search is implemented yet, so this always returns
        an empty list regardless of its arguments.
        """
        return []
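Federated learning enables model training across multiple institutions without sharing raw data—each institution trains locally and shares only model updates. The security assumption is that aggregated updates don't reveal individual training data. This assumption is more plausible for large client populations, where any one contribution is diluted in the average, but it can fail for small cohorts, where a single institution's update dominates the aggregate and can be partially reconstructed from it.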
Implement a simple federated learning simulation with three synthetic clients. Train a basic model across rounds and verify that individual client data never leaves the client. Measure model convergence.