#!/usr/bin/env bash
#
# Remote Worker Startup Script
#
# Starts one worker.py process per GPU index on this machine and passes each
# of them the dispatcher URL.
#
# Usage: ./start_remote_worker.sh <dispatcher_ip> <local_ip> <num_gpus>
#   dispatcher_ip  Dispatcher host (default: 192.168.1.50); port 7860 is used.
#   local_ip       Address used in each worker's address (default: first
#                  address printed by `hostname -I`).
#   num_gpus       Number of workers to start, one per GPU index (default: 1).

DISPATCHER_IP=${1:-"192.168.1.50"}
LOCAL_IP=${2:-$(hostname -I 2>/dev/null | awk '{print $1}')}
NUM_GPUS=${3:-1}
DISPATCHER_URL="http://${DISPATCHER_IP}:7860"

# NUM_GPUS is used as a loop bound in arithmetic contexts, where a
# non-numeric string would be evaluated as an expression (and a leading zero
# as octal). Accept only a plain non-negative decimal integer.
if [[ ! "$NUM_GPUS" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "❌ Error: num_gpus must be a non-negative integer (got '$NUM_GPUS')" >&2
  echo "Usage: $0 <dispatcher_ip> <local_ip> <num_gpus>" >&2
  exit 2
fi

# Auto-detection yields an empty string when `hostname -I` is unavailable or
# reports no address; worker addresses would then look like ":8001".
if [[ -z "$LOCAL_IP" ]]; then
  echo "⚠️ Warning: could not auto-detect the local IP; pass it as the second argument" >&2
fi

readonly DISPATCHER_IP LOCAL_IP NUM_GPUS DISPATCHER_URL

echo "Starting Remote GPU Workers"
echo "==============================="
echo "Dispatcher: $DISPATCHER_URL"
echo "Local IP: $LOCAL_IP"
echo "🖥️ GPUs: $NUM_GPUS"
echo ""
# Check that every required file is present in the current directory.
REQUIRED_FILES=("worker.py" "utils.py" "latent_stats.json")
missing_files=()
for file in "${REQUIRED_FILES[@]}"; do
  if [[ ! -f "$file" ]]; then
    missing_files+=("$file")
  fi
done

# Report every missing file in one pass (not just the first) so that a single
# copy can fix them all; diagnostics go to stderr.
if ((${#missing_files[@]} > 0)); then
  for file in "${missing_files[@]}"; do
    echo "❌ Error: $file not found" >&2
  done
  echo "💡 Copy required files from main machine:" >&2
  echo " scp user@dispatcher-machine:/path/to/{worker.py,utils.py,latent_stats.json,config_*.yaml} ." >&2
  exit 1
fi
# Test GPU access: print whether torch reports CUDA as available and how many
# CUDA devices it sees. This is informational only; the script continues
# either way, but a probe that could not run at all is now called out instead
# of being silently ignored.
echo "🧪 Testing GPU access..."
if ! python -c "
import torch
print(f'✅ CUDA available: {torch.cuda.is_available()}')
print(f'GPU count: {torch.cuda.device_count()}')
"; then
  echo "⚠️ Warning: GPU check could not run (is 'python' on PATH and torch installed?)" >&2
fi
# Test dispatcher connectivity before launching any worker.
echo "Testing dispatcher connectivity..."

# Without curl the probe below would fail and be misreported as a network
# problem, so distinguish that case explicitly.
if ! command -v curl > /dev/null; then
  echo "❌ Error: curl is required to test dispatcher connectivity" >&2
  exit 1
fi

# --connect-timeout bounds only the connection phase; --max-time additionally
# bounds a dispatcher that accepts the connection but never finishes replying.
if curl -s --connect-timeout 5 --max-time 15 "$DISPATCHER_URL" > /dev/null; then
  echo "✅ Dispatcher reachable"
else
  echo "❌ Cannot reach dispatcher at $DISPATCHER_URL" >&2
  echo "💡 Check network connectivity and dispatcher status" >&2
  exit 1
fi
# Start workers: one background worker.py process per GPU index.

#######################################
# Launch a single worker in the background and confirm it is still alive
# after the start-up delay.
# Globals:   LOCAL_IP, DISPATCHER_URL (read)
#            WORKER_START_DELAY (read; optional, seconds, default 2)
#            PORT, WORKER_ADDRESS, WORKER_PID (written)
# Arguments: $1 - GPU index; exported to the worker as CUDA_VISIBLE_DEVICES
#                 and used to derive the port (8001 + index) and log name
# Outputs:   progress on stdout, failure on stderr; the worker's own output
#            is redirected to worker_gpu_<index>.log
# Returns:   0 if the worker process still exists after the delay, else 1
#######################################
start_worker() {
  local gpu=$1

  PORT=$((8001 + gpu))
  WORKER_ADDRESS="${LOCAL_IP}:${PORT}"
  echo "Starting worker on GPU $gpu: $WORKER_ADDRESS"

  CUDA_VISIBLE_DEVICES=$gpu python worker.py \
    --worker-address "$WORKER_ADDRESS" \
    --dispatcher-url "$DISPATCHER_URL" \
    > "worker_gpu_${gpu}.log" 2>&1 &
  WORKER_PID=$!

  # Small delay between starts. It also gives a worker that fails right away
  # time to exit, so the liveness check below is meaningful.
  sleep "${WORKER_START_DELAY:-2}"

  if kill -0 "$WORKER_PID" 2> /dev/null; then
    echo "✅ Worker $gpu started (PID: $WORKER_PID)"
  else
    echo "❌ Worker $gpu exited during startup; see worker_gpu_${gpu}.log" >&2
    return 1
  fi
}

echo "🔧 Starting $NUM_GPUS GPU workers..."
for ((i = 0; i < NUM_GPUS; i++)); do
  # A failed start has already been reported by start_worker; keep launching
  # the remaining workers rather than aborting the whole run.
  start_worker "$i" || true
done
#######################################
# Print the post-launch summary: the log file of each worker, the health URL
# of each worker, and how to stop the workers.
# Globals:   NUM_GPUS, LOCAL_IP (read)
# Arguments: none
# Outputs:   summary text on stdout
#######################################
print_worker_summary() {
  local gpu port

  echo ""
  echo "All workers started!"
  echo "Monitor logs:"
  for ((gpu = 0; gpu < NUM_GPUS; gpu++)); do
    echo " GPU $gpu: tail -f worker_gpu_${gpu}.log"
  done

  echo ""
  echo "Check worker health:"
  for ((gpu = 0; gpu < NUM_GPUS; gpu++)); do
    # Same port scheme the workers are launched with: 8001 + GPU index.
    port=$((8001 + gpu))
    echo " GPU $gpu: curl http://${LOCAL_IP}:${port}/health"
  done

  echo ""
  echo "⚠️ To stop workers: pkill -f 'python.*worker.py'"
  # The script never reads the keyboard, and its Ctrl+C handler stops the
  # workers and exits, so the prompt states exactly that.
  echo "Press Ctrl+C to stop the workers and exit..."
}

print_worker_summary
# Keep the script in the foreground, reporting worker status periodically,
# until Ctrl+C asks it to shut the workers down.

#######################################
# Send SIGTERM to the still-running background jobs of this shell.
# Only this script's own children are signalled; a pattern-based pkill would
# also hit any unrelated process whose command line happens to match.
# Globals:   none
# Arguments: none
#######################################
stop_workers() {
  local pids
  pids=$(jobs -pr)
  if [[ -n "$pids" ]]; then
    # shellcheck disable=SC2086 # one PID per word; splitting is intended
    kill $pids 2> /dev/null
  fi
}

trap 'echo ""; echo "Stopping workers..."; stop_workers; exit 0' SIGINT

# Show periodic worker status.
while true; do
  sleep 10
  # Count this shell's running background jobs. The previous `ps | grep -c`
  # also counted the grep process itself, and its `|| echo "0"` fallback
  # appended a second "0" when nothing matched.
  # shellcheck disable=SC2207 # PIDs are plain numbers; splitting is intended
  running_pids=($(jobs -pr))
  RUNNING=${#running_pids[@]}
  echo "$(date): $RUNNING workers running"
done