# ───────────────────────────────────────────────────────────
# PC 1 — GPU Inference Node (vLLM only)
# ───────────────────────────────────────────────────────────
# Run on the PC with the GPU. Exposes vLLM on port 8001.
# PC 2 connects to this PC's IP:8001 for AI inference.
#
# Usage:
#   docker compose -f docker-compose.pc1-gpu.yml up -d
# ───────────────────────────────────────────────────────────
services:
  # vLLM inference server; the only service defined for this GPU node.
  vllm-speed:
    image: vllm/vllm-openai:latest
    container_name: mac-vllm-speed
    restart: unless-stopped
    ports:
      # host:container — quoted so YAML never retypes the mapping.
      # The container side must match the --port flag in `command`.
      - "8001:8001"
    environment:
      # Points the Hugging Face cache at the path backed by the
      # hf-cache volume mounted below.
      - HF_HOME=/root/.cache/huggingface
    volumes:
      # Named volume mounted at the HF_HOME path, so the cache outlives
      # container recreation.
      - hf-cache:/root/.cache/huggingface
    # Folded scalar (`>-`): the lines below are joined with spaces into a
    # single argument string with no trailing newline. Do not place
    # comments inside the scalar; they would become part of the command.
    # Flag semantics are defined by vLLM; see its engine-arguments docs.
    # NOTE(review): --trust-remote-code allows code shipped with the
    # model repository to run in the container; confirm this model
    # actually requires it.
    command: >-
      --model Qwen/Qwen2.5-7B-Instruct-AWQ
      --port 8001
      --gpu-memory-utilization 0.90
      --max-model-len 8192
      --trust-remote-code
      --enforce-eager
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
# Named volume declared with default settings; mounted by the vllm-speed
# service at /root/.cache/huggingface.
volumes:
  hf-cache: {}