15. Linux AI Workstation Project

Chapter 15 of 15 · 25 min

This chapter puts everything together into a working AI workstation that boots, runs inference, and is accessible and monitored.

System architecture:

Internet → UFW Firewall → SSH Key Auth → tmux session
                                          ↓
                                     Ollama service
                                     (systemd, GPU)
                                          ↓
                          Docker Compose stack
                          ├── llama.cpp server
                          ├── nginx (rate-limited)
                          └── prometheus sidecar
                                          ↓
                              node_exporter (GPU metrics)
                                          ↓
                              Grafana (dashboards)

Step 1: Base system

# Starting point: a minimal Ubuntu 24.04 LTS install.
# Refresh the package index, then add the SSH and NFS server packages.
sudo apt update
sudo apt install \
  openssh-server \
  nfs-kernel-server

# Enable the SSH daemon for every boot and start it right away.
sudo systemctl enable ssh
sudo systemctl start ssh

# NVIDIA driver (535 branch): install, regenerate the initramfs, then reboot.
sudo apt install nvidia-driver-535
sudo update-initramfs -u
sudo reboot

Step 2: CUDA and runtime

# CUDA 12.6 toolkit. NVIDIA added Ubuntu 24.04 packages with CUDA 12.5, so
# cuda-toolkit-12-4 is not available from the ubuntu2404 repository.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update
sudo apt install cuda-toolkit-12-6
# Single quotes keep the variables literal so they expand at login, not now.
# ${VAR:+:${VAR}} avoids a trailing ':' when the variable is unset; an empty
# entry in LD_LIBRARY_PATH means "current directory".
echo 'export PATH=/usr/local/cuda-12.6/bin${PATH:+:${PATH}}' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}' >> ~/.bashrc

# Docker Engine via the convenience script
curl -fsSL https://get.docker.com | sh
# Allow the current user to run docker without sudo (used by the
# `docker compose` commands later). Takes effect after logging out and back in.
sudo usermod -aG docker "$USER"

# NVIDIA Container Toolkit. The old nvidia-container-runtime/<distribution>
# repository is deprecated and has no Ubuntu 24.04 list; the toolkit is
# published from the distribution-independent libnvidia-container repository.
# -f makes curl fail on HTTP errors instead of piping an error page onward.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update
sudo apt install nvidia-container-toolkit
# Register the "nvidia" runtime in /etc/docker/daemon.json, then reload Docker.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker

Step 3: Kernel tuning

# Write the settings to a drop-in under /etc/sysctl.d. `sudo tee` is required
# because a plain `>>` redirect is opened by the unprivileged shell and fails
# with "Permission denied". A dedicated file is also safe to re-run: it is
# overwritten rather than appended to.
sudo tee /etc/sysctl.d/99-ai-workstation.conf > /dev/null << 'EOF'
# Mode 1: the kernel never refuses an allocation.
vm.overcommit_memory = 1
# Only consulted when vm.overcommit_memory = 2; has no effect in mode 1.
vm.overcommit_ratio = 95
# Number of huge pages to reserve at the default huge page size.
vm.nr_hugepages = 128
EOF
# Reload every sysctl configuration file, including the new drop-in.
sudo sysctl --system

Step 4: Ollama as systemd service

curl -fsSL https://ollama.ai/install.sh | sh

# The installer normally creates the ollama account already, and useradd
# errors out if the user exists, so only create it when it is missing.
# The account needs a home directory: Ollama keeps models under ~/.ollama.
if ! id -u ollama > /dev/null 2>&1; then
  sudo useradd --system --user-group --create-home \
    --home-dir /usr/share/ollama --shell /usr/sbin/nologin ollama
fi

# Drop-in directory for later overrides (left empty here).
sudo mkdir -p /etc/systemd/system/ollama.service.d

# Replace the unit file. The quoted 'EOF' keeps the body literal.
sudo tee /etc/systemd/system/ollama.service > /dev/null << 'EOF'
[Unit]
Description=Ollama Service
# After= only orders the units; Wants= is what actually pulls in
# network-online.target so the ordering has something to wait for.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=ollama
Group=ollama
ExecStart=/usr/local/bin/ollama serve
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable ollama
# restart, not start: the installer has usually started the service already,
# and `start` on a running unit would leave the old definition in effect.
sudo systemctl restart ollama

Step 5: Docker Compose AI stack

mkdir -p ~/ai-stack/{models,certs,prometheus}
cd ~/ai-stack

# The llama-server container loads /models/mistral-7b-q4_k_m.gguf, so place
# that file in ~/ai-stack/models/ before starting the stack.

# Generate the Grafana admin password once and keep it out of the compose
# file. Compose reads .env from the project directory automatically.
if [[ ! -f .env ]]; then
  (umask 077 && printf 'GRAFANA_ADMIN_PASSWORD=%s\n' "$(openssl rand -hex 16)" > .env)
fi

cat > docker-compose.yml << 'EOF'
services:
  llama-server:
    # server-cuda is the GPU build; the plain "server" tag is CPU-only, which
    # would make -ngl (GPU layer offload) a no-op.
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    runtime: nvidia
    volumes:
      - ./models:/models:ro
    # The image's entrypoint is already the server binary, so only its
    # arguments go here. --metrics enables the /metrics endpoint that
    # Prometheus scrapes.
    command: >
      -m /models/mistral-7b-q4_k_m.gguf
      -ngl 99
      -c 8192
      --host 0.0.0.0
      --port 8080
      --metrics
    restart: unless-stopped
    healthcheck:
      # /health reports OK only once the model has finished loading.
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      llama-server:
        condition: service_healthy
    restart: unless-stopped

  # Host metrics (CPU, memory, disk). node_exporter has no CUDA collector, and
  # an unknown --collector.* flag makes it exit at startup, so GPU metrics come
  # from dcgm-exporter below. It runs on the Compose network so Prometheus can
  # reach it by service name; the port is published on loopback only for local
  # checks. Network statistics therefore describe the container's namespace.
  node-exporter:
    image: prom/node-exporter
    pid: host
    ports:
      - "127.0.0.1:9100:9100"
    volumes:
      - /:/host:ro,rslave
    command:
      - '--path.rootfs=/host'
    restart: unless-stopped

  # GPU metrics (utilisation, memory, temperature) as DCGM_FI_* series.
  # Pin the tag to a release that matches your driver.
  dcgm-exporter:
    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.0-ubuntu22.04
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    cap_add:
      - SYS_ADMIN
    ports:
      - "127.0.0.1:9400:9400"
    restart: unless-stopped

  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    restart: unless-stopped

  grafana:
    image: grafana/grafana
    ports:
      - "3000:3000"
    environment:
      # Interpolated by Compose from .env; startup aborts if it is missing.
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:?set GRAFANA_ADMIN_PASSWORD in .env}
    volumes:
      - grafana-data:/var/lib/grafana
    restart: unless-stopped

volumes:
  prometheus-data:
  grafana-data:
EOF

# Targets are host:port pairs on the Compose network. "localhost" here would
# be the Prometheus container itself, so every target uses a service name.
cat > prometheus.yml << 'EOF'
global:
  scrape_interval: 15s
scrape_configs:
  - job_name: 'llama-inference'
    static_configs:
      - targets: ['llama-server:8080']
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
  - job_name: 'gpu'
    static_configs:
      - targets: ['dcgm-exporter:9400']
EOF

cat > nginx.conf << 'EOF'
events { worker_connections 1024; }
http {
  limit_req_zone $binary_remote_addr zone=ai_limit:10m rate=10r/s;
  upstream llama_backend { server llama-server:8080; }
  server {
    listen 80;
    location / {
      limit_req zone=ai_limit burst=20 nodelay;
      proxy_pass http://llama_backend;
      proxy_http_version 1.1;
      # Completions can run longer than the 60s default read timeout.
      proxy_read_timeout 300s;
      # Pass streamed tokens through as they arrive instead of buffering them.
      proxy_buffering off;
    }
  }
}
EOF

docker compose up -d

# Quick local checks once the stack is up:
#   curl -s http://localhost:9100/metrics | grep -c '^node_'          # host metrics
#   curl -s http://localhost:9400/metrics | grep DCGM_FI_DEV_GPU_UTIL  # GPU metrics

Step 6: Security hardening

# Management network: allowed to reach the monitoring UIs and never banned by
# fail2ban. This value is a placeholder; replace it with your own range
# before running the commands below.
ADMIN_CIDR=192.168.1.0/24

sudo apt install fail2ban ufw
sudo ufw default deny incoming
# Allow SSH before enabling the firewall so the current session is not cut off.
sudo ufw allow ssh
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
# Grafana and Prometheus: reachable from the management network only.
sudo ufw allow from "$ADMIN_CIDR" to any port 3000 proto tcp
sudo ufw allow from "$ADMIN_CIDR" to any port 9090 proto tcp
sudo ufw enable
# NOTE: ports published by Docker are forwarded by Docker's own iptables rules
# ahead of UFW, so UFW alone does not filter them. To keep a container port
# private, bind it to a specific address in docker-compose.yml
# (for example "127.0.0.1:9090:9090").

# jail.local only needs the overrides; jail.conf is still read for defaults,
# so copying the whole file would just freeze those defaults across upgrades.
# The unquoted EOF lets the shell expand ADMIN_CIDR into the file.
sudo tee /etc/fail2ban/jail.local > /dev/null << EOF
[DEFAULT]
ignoreip = 127.0.0.1/8 ::1 ${ADMIN_CIDR}

[sshd]
enabled = true
EOF
sudo systemctl enable fail2ban
# restart so an already-running fail2ban picks up the new jail.local
sudo systemctl restart fail2ban

Step 7: Verification

# curl flags used below: -f fails on HTTP errors, -sS hides the progress
# meter but still prints real errors.

# GPU accessible
nvidia-smi

# Ollama running
curl -fsS http://localhost:11434/api/tags

# Docker stack healthy (run from ~/ai-stack)
docker compose ps

# Host metrics accessible. node_exporter has no NVIDIA/CUDA collector, so
# grepping its output for "nvidia" never matches; count node_* series instead.
curl -fsS http://localhost:9100/metrics | grep -c '^node_'

# Prometheus targets: every entry should report health "up".
# jq is not installed by the earlier steps.
sudo apt install jq
curl -fsS http://localhost:9090/api/v1/targets \
  | jq '.data.activeTargets[] | {job: .labels.job, url: .scrapeUrl, health: .health}'

# Grafana accessible (expect 200)
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:3000/login

# End-to-end inference test
curl -X POST http://localhost/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"mistral","messages":[{"role":"user","content":"What is 2+2?"}]}'

Failure mode: docker compose up fails with "unknown or invalid runtime name: nvidia". Docker has no nvidia runtime registered because the nvidia-container-toolkit was not configured for Docker. Run sudo nvidia-ctk runtime configure --runtime=docker and sudo systemctl restart docker again.

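Failure mode: nginx returns 502 or 504 on the first request. The llama.cpp server health check passed, but the first HTTP request to the upstream takes extra time. A 502 means nginx could not get a valid response from llama-server (for example, the container is still restarting); a 504 means the response took longer than nginx's read timeout. proxy_connect_timeout already defaults to 60s, so adding proxy_connect_timeout 60s; changes nothing. Instead, make sure the nginx location block sets a longer read timeout, for example proxy_read_timeout 300s;.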

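Failure mode: Prometheus shows a target as DOWN. Static targets are always listed, so look at each target's health rather than for missing entries, and check docker compose logs prometheus. Inside the Prometheus container, localhost does not resolve to the host's localhost; it resolves to the container's own localhost, so a target such as localhost:9100 never reaches the exporter. Use the Compose service name as the target instead (for example llama-server:8080, which Prometheus scrapes at /metrics), or run the Prometheus container with network_mode: host.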

EXERCISE

Build the complete system described in this chapter on a fresh Ubuntu 24.04 installation with a dedicated GPU. Verify every step from nvidia-smi to a successful end-to-end API call through nginx, then observe GPU metrics in Grafana.