---
# ═══════════════════════════════════════════════════════════
# PC 1 — GPU Inference Node (vLLM only)
# ═══════════════════════════════════════════════════════════
# Run on the PC with the GPU. Exposes vLLM on port 8001.
# PC 2 connects to this PC's IP:8001 for AI inference.
#
# Usage:
#   docker compose -f docker-compose.pc1-gpu.yml up -d
# ═══════════════════════════════════════════════════════════

services:
  # Single vLLM server container; the only service defined in this file.
  vllm-speed:
    # NOTE(review): ":latest" is a floating tag, so a later pull may bring in
    # a different vLLM release. Consider pinning a specific version tag.
    image: vllm/vllm-openai:latest
    container_name: mac-vllm-speed
    ports:
      # host:container — must match the --port value passed in `command`.
      - "8001:8001"
    environment:
      # Points the Hugging Face cache at the path backed by the hf-cache
      # volume below, so cached files survive container re-creation.
      - HF_HOME=/root/.cache/huggingface
    volumes:
      - hf-cache:/root/.cache/huggingface
    # Arguments for the image's entrypoint. The folded scalar (>-) joins the
    # lines below with single spaces and strips the trailing newline.
    # Comments cannot be placed inside the scalar (they would become part of
    # the argument string), so review notes on individual flags live here:
    # NOTE(review): --trust-remote-code lets code shipped in the model
    # repository run inside the container; confirm this model needs it.
    # NOTE(review): --enforce-eager presumably trades throughput for lower
    # GPU memory use (no CUDA graphs); verify against the vLLM engine
    # argument docs that this is intended for a service named "speed".
    command: >-
      --model Qwen/Qwen2.5-7B-Instruct-AWQ
      --port 8001
      --gpu-memory-utilization 0.90
      --max-model-len 8192
      --trust-remote-code
      --enforce-eager
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped

volumes:
  # Named volume with default settings; mounted at the HF_HOME path above.
  hf-cache: {}