Spaces:
Runtime error
Runtime error
File size: 2,667 Bytes
096295a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
#!/bin/bash
# Remote Worker Startup Script
#
# Starts one worker.py process per GPU on this machine, passing each the
# dispatcher URL and its own <local_ip>:<port> address, then stays in the
# foreground reporting how many of those workers are still alive.
# Ctrl+C stops the workers started by this script.
#
# Usage: ./start_remote_worker.sh <dispatcher_ip> <local_ip> <num_gpus>
#   dispatcher_ip  Dispatcher host (default: 192.168.1.50); port 7860 is used.
#   local_ip       Address put into each worker address (default: first
#                  address printed by `hostname -I`).
#   num_gpus       Number of workers to start, one per GPU index (default: 1).
#
# Worker i uses CUDA_VISIBLE_DEVICES=i, port 8001+i and logs to
# worker_gpu_<i>.log in the current directory.
# Status messages are plain ASCII so they render the same on any terminal.

DISPATCHER_IP=${1:-"192.168.1.50"}
LOCAL_IP=${2:-$(hostname -I | awk '{print $1}')}
NUM_GPUS=${3:-1}
DISPATCHER_URL="http://${DISPATCHER_IP}:7860"
readonly BASE_PORT=8001

# PIDs of the workers launched below; used for status reporting and shutdown
# so that unrelated worker.py processes on this host are never touched.
WORKER_PIDS=()

# Print an error message to stderr and abort.
die() {
  printf '[ERROR] %s\n' "$*" >&2
  exit 1
}

# Print the number of launched workers that are still alive.
count_running_workers() {
  local pid alive=0
  for pid in "${WORKER_PIDS[@]}"; do
    if kill -0 "$pid" 2>/dev/null; then
      alive=$((alive + 1))
    fi
  done
  printf '%s\n' "$alive"
}

# Send SIGTERM to every worker launched by this script.
stop_workers() {
  local pid
  for pid in "${WORKER_PIDS[@]}"; do
    # A worker may already have exited; that is not an error here.
    kill "$pid" 2>/dev/null
  done
}

# NUM_GPUS is used in arithmetic context below, where arbitrary text would be
# evaluated as an expression, so insist on a plain positive integer.
if [[ ! "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
  die "num_gpus must be a positive integer (got '$NUM_GPUS')"
fi

# Without an address the workers would be given ':<port>' as their address.
if [[ -z "$LOCAL_IP" ]]; then
  die "could not determine local IP; pass it as the second argument"
fi

echo "Starting Remote GPU Workers"
echo "==============================="
echo "Dispatcher: $DISPATCHER_URL"
echo "Local IP: $LOCAL_IP"
echo "GPUs: $NUM_GPUS"
echo ""

# Check if required files exist
REQUIRED_FILES=("worker.py" "utils.py" "latent_stats.json")
for file in "${REQUIRED_FILES[@]}"; do
  if [[ ! -f "$file" ]]; then
    printf '[ERROR] %s not found\n' "$file" >&2
    echo "Copy required files from main machine:" >&2
    echo " scp user@dispatcher-machine:/path/to/{worker.py,utils.py,latent_stats.json,config_*.yaml} ." >&2
    exit 1
  fi
done

# Test GPU access. A failure is reported but does not abort the launch.
echo "Testing GPU access..."
if ! python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}'); print(f'GPU count: {torch.cuda.device_count()}')"; then
  echo "[WARN] GPU check failed (is torch installed for this python?)" >&2
fi

# Test dispatcher connectivity
echo "Testing dispatcher connectivity..."
if curl -s --connect-timeout 5 "$DISPATCHER_URL" > /dev/null; then
  echo "[OK] Dispatcher reachable"
else
  printf '[ERROR] Cannot reach dispatcher at %s\n' "$DISPATCHER_URL" >&2
  echo "Check network connectivity and dispatcher status" >&2
  exit 1
fi

# Installed before any worker is launched so an interrupt during startup
# still cleans up the workers that are already running.
trap 'echo ""; echo "Stopping workers..."; stop_workers; exit 0' INT TERM

# Start workers
echo "Starting $NUM_GPUS GPU workers..."
for ((i = 0; i < NUM_GPUS; i++)); do
  PORT=$((BASE_PORT + i))
  WORKER_ADDRESS="${LOCAL_IP}:${PORT}"
  echo "Starting worker on GPU $i: $WORKER_ADDRESS"
  CUDA_VISIBLE_DEVICES=$i python worker.py \
    --worker-address "$WORKER_ADDRESS" \
    --dispatcher-url "$DISPATCHER_URL" \
    > "worker_gpu_${i}.log" 2>&1 &
  WORKER_PID=$!
  WORKER_PIDS+=("$WORKER_PID")
  # Small delay between starts; also gives an immediate crash time to show up.
  sleep 2
  if kill -0 "$WORKER_PID" 2>/dev/null; then
    echo "[OK] Worker $i started (PID: $WORKER_PID)"
  else
    echo "[WARN] Worker $i exited right after start; see worker_gpu_${i}.log" >&2
  fi
done

echo ""
echo "All workers started!"
echo "Monitor logs:"
for ((i = 0; i < NUM_GPUS; i++)); do
  echo " GPU $i: tail -f worker_gpu_${i}.log"
done
echo ""
echo "Check worker health:"
for ((i = 0; i < NUM_GPUS; i++)); do
  PORT=$((BASE_PORT + i))
  echo " GPU $i: curl http://${LOCAL_IP}:${PORT}/health"
done
echo ""
echo "To stop workers manually: kill ${WORKER_PIDS[*]}"
echo "Press Ctrl+C to stop the workers and exit."

# Show periodic worker status until interrupted or until every worker is gone.
while true; do
  sleep 10
  RUNNING=$(count_running_workers)
  echo "$(date): $RUNNING workers running"
  if ((RUNNING == 0)); then
    die "all workers have exited; check worker_gpu_*.log"
  fi
done