Docker Deployment — Voice AI with Local Models (Chapter 20)

Containerization ensures consistent voice AI deployments across development, testing, and production environments with all dependencies isolated.

Dockerfile for Voice Pipeline

# Voice pipeline image: CUDA 12.1 runtime on Ubuntu 22.04, with the Python
# dependencies from requirements.txt installed into a virtual environment.
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Build-time only: suppress interactive apt prompts without leaving the
# setting in the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Model fetched at build time; override with --build-arg PRELOAD_MODEL=<id>.
# NOTE(review): the default 'small' is carried over unchanged. It has to be
# either a valid Hugging Face Hub model id or a path that exists relative to
# /app, otherwise the pre-download step below fails - confirm the intended
# model and set it here.
ARG PRELOAD_MODEL=small

# Install system dependencies.
# - python3-venv provides the venv/ensurepip support used below.
# - portaudio19-dev is the package that carries the PortAudio headers on
#   Ubuntu 22.04.
# - build-essential is listed explicitly so the compiler does not depend on
#   apt "recommended" packages being pulled in.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    ffmpeg \
    libportaudio2 \
    libsndfile1 \
    libsndfile1-dev \
    portaudio19-dev \
    python3-dev \
    python3-pip \
    python3-venv \
    python3.10 \
    && rm -rf /var/lib/apt/lists/*

# Make `python` resolve to the system python3
RUN ln -sf /usr/bin/python3 /usr/bin/python

# Create the virtual environment and put it first on PATH so that `python`
# and `pip` in every later instruction (and at runtime) use it.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Install Python dependencies. Only requirements.txt is copied first so this
# layer stays cached until the dependency list itself changes.
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Copy application code
COPY app/ /app/
WORKDIR /app

# Pre-download the model (reduces cold start). The id is read from the
# PRELOAD_MODEL build argument, which is visible to RUN as an environment
# variable, so it is never interpolated into the Python source.
RUN python -c "import os; from transformers import AutoModel; AutoModel.from_pretrained(os.environ['PRELOAD_MODEL'])"

# Runtime environment: unbuffered stdout/stderr so logs appear immediately,
# and only CUDA device 0 visible to the process.
ENV PYTHONUNBUFFERED=1 \
    CUDA_VISIBLE_DEVICES=0

# Documentation only (does not publish): the API port probed by the health
# check and the metrics port published by docker-compose.yml.
EXPOSE 8000 5004

# NOTE(review): the container runs as root. Switching to a dedicated USER is
# advisable, but needs the model cache and the bind-mounted /app/models and
# /app/audio directories to be writable by that user - confirm before changing.

# Health check: healthcheck.py exits 0 when the service reports healthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python healthcheck.py

# Run the application (exec form, so the process receives signals directly)
CMD ["python", "main.py"]

requirements.txt

# Core ML
torch>=2.0.0
transformers>=4.30.0
accelerate>=0.20.0

# Voice processing
openai-whisper>=20231117
# Coqui TTS is published on PyPI under the distribution name "TTS"
TTS>=0.21.0
silero-vad>=4.0

# Audio I/O
soundfile>=0.12.1
pyaudio>=0.2.14
librosa>=0.10.0

# Web serving
fastapi>=0.100.0
uvicorn[standard]>=0.23.0
websockets>=11.0

# Infrastructure
redis>=4.5.0
prometheus-client>=0.17.0
# HTTP client imported by healthcheck.py
httpx>=0.24.0

docker-compose.yml

version: '3.8'

services:
  # Main application container, built from the Dockerfile in this directory.
  voice-pipeline:
    build: .
    ports:
      - "8000:8000"  # HTTP API (healthcheck.py probes /health on this port)
      - "5004:5004"  # Metrics
    volumes:
      # Bind mounts: both directories live next to this file on the host.
      - ./models:/app/models  # Cache models
      - ./audio:/app/audio
    environment:
      - MODEL_CACHE_DIR=/app/models
      - REDIS_URL=redis://cache:6379
      - GPU_ENABLED=true
    # REDIS_URL points at the cache service, so start it first.
    depends_on:
      - cache
    # Reserve one NVIDIA GPU for this service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "python", "healthcheck.py"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Same grace period as the image's HEALTHCHECK, stated explicitly here.
      start_period: 60s

  # Redis, capped at 256 MB with least-recently-used eviction across all keys.
  cache:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru

  # Prometheus, configured from ./prometheus.yml on the host.
  metrics:
    # Pinned release instead of :latest so rebuilds are reproducible;
    # bump deliberately when upgrading.
    image: prom/prometheus:v2.53.0
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    depends_on:
      - voice-pipeline

# Only redis-data is a named volume; models and audio are bind-mounted above,
# so they need no declaration here.
volumes:
  redis-data:

Multi-Stage Build for Smaller Images

# Stage 1: Build stage - installs the Python dependencies with a compiler
# available. Nothing from this stage reaches the final image except what is
# copied out of it explicitly.
FROM python:3.10-slim AS builder

# gcc compiles packages that ship no prebuilt wheel.
# NOTE(review): portaudio19-dev is added on the assumption that pyaudio from
# requirements.txt is built from source here - drop it if that is not the case.
RUN apt-get update \
    && apt-get install -y gcc portaudio19-dev \
    && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
# --user installs into /root/.local, which is the directory copied below.
RUN pip install --no-cache-dir --user -r requirements.txt

# Stage 2: Production stage - starts again from the slim base, so the
# compiler and headers from the build stage are never present here and there
# is nothing to purge.
FROM python:3.10-slim

# Shared libraries needed at runtime.
# NOTE(review): inferred from requirements.txt (openai-whisper -> ffmpeg,
# pyaudio -> PortAudio, soundfile -> libsndfile); trim to what the app uses.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ffmpeg \
        libportaudio2 \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Bring over the installed packages and expose their console scripts.
COPY --from=builder /root/.local /root/.local
ENV PATH=/root/.local/bin:$PATH

COPY app/ /app/
WORKDIR /app

CMD ["python", "main.py"]

Healthcheck Implementation

# healthcheck.py
import sys
import httpx

def main():
    """Probe the local API and exit with the container health status.

    Sends GET http://localhost:8000/health with a 5 second timeout and
    terminates the process via sys.exit:

    * exit code 0 - the response status is 200 and the JSON body has
      ``"status": "healthy"``.
    * exit code 1 - anything else (request error, non-200 status, body that
      is not JSON or not a mapping, any other status value).

    On failure the reason is written to stderr so it is available in the
    output captured for the health check, rather than only a bare exit code.
    """
    try:
        # Check API health
        response = httpx.get("http://localhost:8000/health", timeout=5)
    except Exception as exc:
        print(f"healthcheck: request failed: {exc!r}", file=sys.stderr)
        sys.exit(1)

    if response.status_code != 200:
        print(
            f"healthcheck: unexpected HTTP status {response.status_code}",
            file=sys.stderr,
        )
        sys.exit(1)

    try:
        status = response.json().get("status")
    except Exception as exc:
        # Body was not JSON, or the JSON value was not a mapping.
        print(f"healthcheck: unreadable response body: {exc!r}", file=sys.stderr)
        sys.exit(1)

    if status != "healthy":
        print(f"healthcheck: service reported status {status!r}", file=sys.stderr)
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()

GPU Docker Configuration

# docker-compose.gpu.yml
# GPU settings for the voice-pipeline service.
services:
  voice-pipeline:
    # Run the container under the NVIDIA runtime.
    runtime: nvidia
    environment:
      # GPU index 0, with the compute and utility driver capabilities.
      NVIDIA_VISIBLE_DEVICES: "0"
      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve a single NVIDIA device for this service.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
                - compute

Deployment Commands

# Build the image from the Dockerfile in the current directory and tag it
# voice-pipeline:latest
docker build -t voice-pipeline:latest .

# Run a one-off container with all host GPUs attached, publishing port 8000;
# --rm removes the container when it exits
docker run --gpus all --rm -p 8000:8000 voice-pipeline:latest

# Start the services defined in docker-compose.yml in the background (-d)
docker-compose -f docker-compose.yml up -d

# Follow (-f) the log output of the voice-pipeline service
docker-compose logs -f voice-pipeline

# Scale horizontally (multiple instances)
# NOTE(review): docker-compose.yml publishes fixed host ports (8000 and 5004)
# for voice-pipeline; additional replicas will likely fail to bind the same
# host ports - confirm, and publish a port range or front the service with a
# proxy before relying on this.
docker-compose up -d --scale voice-pipeline=3