16. Privacy-Preserving AI

Chapter 16 of 18 · 20 min

Beyond local deployment, additional privacy-preserving techniques offer stronger guarantees for sensitive healthcare data. These include federated learning, differential privacy, and secure multiparty computation—each providing different trade-offs between privacy protection and model utility.

Local deployment provides a foundation, but it is not sufficient on its own: attacks such as membership inference and model inversion can still recover training data or model inputs from a model's outputs. Privacy-preserving techniques address these residual risks.

# privacy_preserving.py
import numpy as np
from typing import List, Dict, Tuple
from dataclasses import dataclass

@dataclass
class PrivacyBudget:
    """Plain record bundling privacy parameters with a query allowance.

    The class only stores its three fields; it performs no validation and
    no accounting itself.
    """

    # Presumably the epsilon of an (epsilon, delta) differential-privacy
    # guarantee -- confirm against the code that constructs this record.
    epsilon: float
    # Presumably the delta (failure probability) of that same guarantee.
    delta: float
    # Presumably the number of queries still allowed under this budget;
    # nothing in this class decrements it.
    remaining_queries: int

class DifferentialPrivacy:
    """Add Laplace noise to numeric values for differential privacy.

    Noise is drawn from ``Laplace(0, sensitivity / epsilon)`` using NumPy's
    global random state (``np.random``), so results are reproducible with
    ``np.random.seed``.

    Notes:
        * ``delta`` is stored for reporting but is not used by the Laplace
          noise generated here.
        * Every call spends the full ``epsilon``; this class does not track
          or compose the budget across calls.
    """

    def __init__(self, epsilon: float = 1.0, delta: float = 1e-5):
        """Store the privacy parameters.

        Args:
            epsilon: Privacy-loss parameter; must be strictly positive.
            delta: Stored as-is; not used when generating noise.

        Raises:
            ValueError: If ``epsilon`` is zero, negative, or NaN.
        """
        # Fail at construction instead of with a ZeroDivisionError (or an
        # opaque numpy "scale < 0" error) on the first noisy query.
        # Written as ``not epsilon > 0`` so that NaN is rejected as well.
        if not epsilon > 0:
            raise ValueError(f"epsilon must be positive, got {epsilon!r}")
        self.epsilon = epsilon
        self.delta = delta
        self.sensitivity = 1.0  # Maximum change any record can cause

    def add_noise(self, value: float) -> float:
        """Return ``value`` plus one Laplace draw of scale sensitivity/epsilon."""
        scale = self.sensitivity / self.epsilon
        return value + np.random.laplace(0, scale)

    def add_noise_to_array(self, values: List[float]) -> List[float]:
        """Return a new list with independent noise added to each element."""
        return [self.add_noise(v) for v in values]

    def private_count(self, counts: Dict[str, int]) -> Dict[str, float]:
        """Return a new dict with independent noise added to each count.

        Each key is perturbed with the full ``epsilon``. If a single record
        can contribute to several keys, the combined release costs more than
        ``epsilon``; that accounting is left to the caller.
        """
        return {key: self.add_noise(count) for key, count in counts.items()}

class FederatedLearning:
    """Coordinate federated training rounds across a pool of clients.

    Each client object must provide ``receive_model(model)`` and
    ``train_local(epochs=...)``; the latter must return a dict of updates.

    This is a simplified illustration: aggregation is a plain mean with no
    cryptographic protection, and ``_apply_update`` is a placeholder.
    """

    def __init__(self, aggregation_server, local_clients: List):
        """Store the server handle and the client pool; no model is set yet."""
        self.aggregation_server = aggregation_server
        self.local_clients = local_clients
        self.global_model = None

    def train_round(self, round_number: int) -> Dict:
        """Execute one federated learning round.

        Args:
            round_number: Label echoed back in the returned summary.

        Returns:
            A dict with ``"round"``, ``"clients_participating"`` and
            ``"aggregation_verified"``. The last value is a constant ``True``;
            no verification is performed in this simplified version.

        Raises:
            ValueError: If there are no clients to select from.
        """
        participants = self._select_clients()

        # Every participant receives the model before any of them trains.
        for client in participants:
            client.receive_model(self.global_model)

        # Local training on each client; only the returned update is kept.
        client_updates = [client.train_local(epochs=1) for client in participants]

        aggregated_update = self._secure_aggregate(client_updates)
        self.global_model = self._apply_update(aggregated_update)

        return {
            "round": round_number,
            "clients_participating": len(participants),
            "aggregation_verified": True
        }

    def _select_clients(self, fraction: float = 0.1) -> List:
        """Select a random subset of clients, always at least one.

        Args:
            fraction: Share of the pool to sample (rounded down).

        Returns:
            A list of the chosen client objects, sampled without replacement.

        Raises:
            ValueError: If the client pool is empty, or if ``fraction``
                asks for more clients than exist.
        """
        population = len(self.local_clients)
        if population == 0:
            raise ValueError("cannot select clients: no local clients registered")
        n_select = max(1, int(population * fraction))
        # Sample positions rather than the objects themselves: handing client
        # objects to numpy coerces them into an array, which returns an
        # ndarray instead of a list and fails for sequence-like clients.
        chosen = np.random.choice(population, n_select, replace=False)
        return [self.local_clients[i] for i in chosen]

    def _secure_aggregate(self, updates: List[Dict]) -> Dict:
        """Average client updates key by key.

        Simplified: a real implementation would use a cryptographic secure
        aggregation protocol so that individual updates stay hidden. Here the
        updates are read directly.

        ``updates`` must be non-empty, and every update must contain the keys
        of the first one.
        """
        return {
            key: np.mean([update[key] for update in updates], axis=0)
            for key in updates[0].keys()
        }

    def _apply_update(self, gradient: Dict) -> 'Model':
        """Return the global model after applying the aggregated update.

        Placeholder: the actual update rule depends on the model type and is
        not implemented here. The current model is returned unchanged so that
        a training round no longer replaces ``global_model`` with ``None``.
        """
        # TODO: apply ``gradient`` to the model once a model type is chosen.
        return self.global_model

class PrivacyPreservingInference:
    """Illustrative privacy-oriented wrappers around a model.

    ``base_model`` must provide ``generate(prompt, temperature=...)`` and
    ``embed(text)``. ``privacy_engine`` must provide ``add_noise``,
    ``private_count``, ``epsilon`` and ``delta``.
    """

    def __init__(self, base_model, privacy_engine: "DifferentialPrivacy"):
        """Store the model and the noise engine used by the methods below."""
        self.model = base_model
        self.dp = privacy_engine

    def private_completion(self, prompt: str,
                           temperature: float = 0.7) -> str:
        """Generate a completion for ``prompt``.

        Conceptual placeholder: the model output is returned unmodified and
        NO differential privacy is applied. True DP for text generation
        requires more sophisticated mechanisms.
        """
        return self.model.generate(prompt, temperature=temperature)

    def aggregate_patient_statistics(self,
                                     patient_records: List[Dict]) -> Dict:
        """Return noisy per-condition patient counts and a noisy record count.

        Args:
            patient_records: Dicts that may carry a ``"conditions"`` iterable.
                A missing or ``None`` value counts as no conditions.

        Returns:
            A dict with ``"condition_prevalence"`` (noisy count per
            condition), ``"total_patients"`` (noisy number of records) and
            ``"privacy_guarantee"`` (the engine's configured parameters).
        """
        condition_counts = {}
        for record in patient_records:
            # Count each patient at most once per condition. A condition
            # repeated inside one record must not inflate prevalence or let
            # one record move a count by more than 1. ``dict.fromkeys``
            # de-duplicates while keeping first-seen order.
            for condition in dict.fromkeys(record.get("conditions") or ()):
                condition_counts[condition] = condition_counts.get(condition, 0) + 1

        private_counts = self.dp.private_count(condition_counts)

        # NOTE: the string below reports the engine's configured parameters.
        # It is not a composed budget for the several noisy values released.
        return {
            "condition_prevalence": private_counts,
            "total_patients": self.dp.add_noise(len(patient_records)),
            "privacy_guarantee": f"(ε={self.dp.epsilon}, δ={self.dp.delta})"
        }

    def secure_patient_matching(self, query: Dict,
                                patient_db: List[Dict],
                                k: int = 5,
                                noise_std: float = 0.1) -> List[Dict]:
        """Search for similar patients using a noise-perturbed query embedding.

        Gaussian noise is added to the embedding of ``query["description"]``
        before searching. This is a heuristic obfuscation only; it gives no
        formal k-anonymity or differential-privacy guarantee.

        Args:
            query: Dict with a ``"description"`` entry to embed.
            patient_db: Records handed to the search routine.
            k: Number of matches requested.
            noise_std: Standard deviation of the Gaussian noise (default 0.1).

        Returns:
            Whatever ``_search_patients`` returns for the noisy embedding.
        """
        query_embedding = self.model.embed(query["description"])

        # Take the noise shape from the embedding itself, so that embeddings
        # with more than one dimension also get element-wise noise.
        noise = np.random.normal(0, noise_std, np.shape(query_embedding))
        noisy_query = query_embedding + noise

        return self._search_patients(noisy_query, patient_db, k)

    def _search_patients(self, embedding: np.ndarray,
                         patient_db: List[Dict],
                         k: int) -> List[Dict]:
        """Search for similar patients.

        Placeholder: no search is implemented; always returns an empty list.
        """
        return []

Federated learning enables model training across multiple institutions without data sharing—each institution trains locally and shares only model updates. The security assumption is that aggregated updates don't reveal individual training data, which holds for large enough client populations but can fail for small cohorts.

EXERCISE

Implement a simple federated learning simulation with three synthetic clients. Train a basic model across rounds and verify that individual client data never leaves the client. Measure model convergence.