neural-os / start_remote_worker.sh
da03
.
096295a
#!/bin/bash
# Remote Worker Startup Script
# Usage: ./start_remote_worker.sh <dispatcher_ip> <local_ip> <num_gpus>
#
# Positional arguments (all optional):
#   dispatcher_ip  IP of the dispatcher (default: 192.168.1.50); the
#                  dispatcher URL is built as http://<dispatcher_ip>:7860.
#   local_ip       Address used to build each worker address (default: the
#                  first address printed by `hostname -I`).
#   num_gpus       Number of workers to launch, one per GPU index (default: 1).

# Succeeds only when $1 is a positive decimal integer without leading zeros.
# NUM_GPUS is used in arithmetic for-loop conditions later in the script,
# where a non-numeric string would be evaluated as a variable name (typically
# yielding 0 iterations) and a leading zero would be parsed as octal.
is_positive_int() {
  [[ "$1" =~ ^[1-9][0-9]*$ ]]
}

DISPATCHER_IP=${1:-"192.168.1.50"}
LOCAL_IP=${2:-$(hostname -I | awk '{print $1}')}
NUM_GPUS=${3:-1}

# Reject bad GPU counts up front instead of silently launching nothing.
if ! is_positive_int "$NUM_GPUS"; then
  printf 'Error: <num_gpus> must be a positive integer, got "%s"\n' "$NUM_GPUS" >&2
  printf 'Usage: %s <dispatcher_ip> <local_ip> <num_gpus>\n' "$0" >&2
  exit 1
fi

DISPATCHER_URL="http://${DISPATCHER_IP}:7860"

# Startup banner echoing the effective configuration.
echo "πŸš€ Starting Remote GPU Workers"
echo "==============================="
echo "🌐 Dispatcher: $DISPATCHER_URL"
echo "πŸ“ Local IP: $LOCAL_IP"
echo "πŸ–₯️ GPUs: $NUM_GPUS"
echo ""
# Verify that every file the workers need is present in the current
# directory; stop at the first missing one with a hint on how to copy it.
REQUIRED_FILES=("worker.py" "utils.py" "latent_stats.json")
for required in "${REQUIRED_FILES[@]}"; do
  # Present as a regular file: nothing to report, move on to the next one.
  [[ -f "$required" ]] && continue

  printf '%s\n' \
    "❌ Error: $required not found" \
    "πŸ’‘ Copy required files from main machine:" \
    " scp user@dispatcher-machine:/path/to/{worker.py,utils.py,latent_stats.json,config_*.yaml} ."
  exit 1
done
# Print what PyTorch reports about CUDA availability and GPU count.
# The result is informational only: the script continues regardless of
# what the probe prints or whether it succeeds.
echo "πŸ§ͺ Testing GPU access..."
GPU_PROBE="import torch
print(f'βœ… CUDA available: {torch.cuda.is_available()}')
print(f'πŸ“Š GPU count: {torch.cuda.device_count()}')"
python -c "$GPU_PROBE"
# Make sure the dispatcher answers before launching anything; abort if the
# request fails (connection attempts are limited to 5 seconds).
echo "🌐 Testing dispatcher connectivity..."
if ! curl --silent --connect-timeout 5 "$DISPATCHER_URL" >/dev/null; then
  echo "❌ Cannot reach dispatcher at $DISPATCHER_URL"
  echo "πŸ’‘ Check network connectivity and dispatcher status"
  exit 1
fi
echo "βœ… Dispatcher reachable"
# Start workers
# Launches one background worker per GPU index 0..NUM_GPUS-1. Each worker
# gets CUDA_VISIBLE_DEVICES set to its index, is passed LOCAL_IP:(8001 + index)
# as its worker address, and has stdout/stderr redirected to
# worker_gpu_<index>.log in the current directory.
echo "πŸ”§ Starting $NUM_GPUS GPU workers..."
for ((i=0; i<NUM_GPUS; i++)); do
  PORT=$((8001 + i))
  WORKER_ADDRESS="${LOCAL_IP}:${PORT}"
  echo "Starting worker on GPU $i: $WORKER_ADDRESS"
  CUDA_VISIBLE_DEVICES=$i python worker.py \
    --worker-address "$WORKER_ADDRESS" \
    --dispatcher-url "$DISPATCHER_URL" \
    > "worker_gpu_${i}.log" 2>&1 &
  WORKER_PID=$!
  # Small delay between starts; it also gives a broken worker time to exit
  # so the liveness check below can catch an immediate crash.
  sleep 2
  # kill -0 sends no signal; it only tests whether the PID still exists.
  if kill -0 "$WORKER_PID" 2>/dev/null; then
    echo "βœ… Worker $i started (PID: $WORKER_PID)"
  else
    echo "Warning: worker $i (PID: $WORKER_PID) exited during startup; see worker_gpu_${i}.log" >&2
  fi
done
# Post-launch summary: for each GPU index, print the command to follow its
# log and the command to query its health endpoint, then explain how to stop.
echo ""
echo "πŸŽ‰ All workers started!"
echo "πŸ“‹ Monitor logs:"
for ((i=0; i<NUM_GPUS; i++)); do
  echo " GPU $i: tail -f worker_gpu_${i}.log"
done
echo ""
echo "πŸ” Check worker health:"
for ((i=0; i<NUM_GPUS; i++)); do
  # Same port scheme as used when the workers were launched: 8001 + index.
  PORT=$((8001 + i))
  echo " GPU $i: curl http://${LOCAL_IP}:${PORT}/health"
done
echo ""
echo "⚠️ To stop workers: pkill -f 'python.*worker.py'"
# The script never reads a key press; its SIGINT trap stops the workers and
# exits, so the prompt states exactly that.
echo "Press Ctrl+C to stop all workers and exit (status is printed every 10 seconds)..."
# Keep script running to show it's active

# SIGINT handler: signal every process whose command line matches the worker
# pattern (the same pattern the script advertises for manual shutdown), then
# exit successfully.
stop_workers() {
  echo ""
  echo "πŸ›‘ Stopping workers..."
  pkill -f "python.*worker.py"
  exit 0
}
trap stop_workers SIGINT

# Show real-time worker status
# Every 10 seconds, print a timestamped count of processes matching the
# worker pattern. pgrep never reports itself, unlike `ps aux | grep -c`,
# whose count always included the grep process. With -c it prints 0 when
# nothing matches, so no fallback value is needed.
while true; do
  sleep 10
  RUNNING=$(pgrep -fc "python.*worker.py")
  echo "$(date): $RUNNING workers running"
done