File size: 2,667 Bytes
096295a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/bin/bash

# Remote Worker Startup Script
# Usage: ./start_remote_worker.sh <dispatcher_ip> <local_ip> <num_gpus>

# ---------------------------------------------------------------------------
# Configuration from positional arguments (all optional):
#   $1  dispatcher host/IP             (default: 192.168.1.50)
#   $2  IP address used to build each worker's address
#       (default: first address printed by `hostname -I`)
#   $3  number of GPU workers to start (default: 1)
# The dispatcher URL is always built as http://<dispatcher_ip>:7860.
# ---------------------------------------------------------------------------

# Return 0 only when $1 is a base-10 integer >= 1.
# NUM_GPUS is used as a loop bound inside (( )), where a non-numeric string
# would be evaluated as an arithmetic expression instead of being rejected.
is_positive_int() {
    [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 >= 1 ))
}

DISPATCHER_IP=${1:-"192.168.1.50"}
LOCAL_IP=${2:-$(hostname -I | awk '{print $1}')}
NUM_GPUS=${3:-1}
DISPATCHER_URL="http://${DISPATCHER_IP}:7860"

if ! is_positive_int "$NUM_GPUS"; then
    echo "❌ Error: num_gpus must be a positive integer, got '$NUM_GPUS'" >&2
    echo "Usage: $0 <dispatcher_ip> <local_ip> <num_gpus>" >&2
    exit 1
fi
# Force decimal so a value such as "08" is not read as an invalid octal literal.
NUM_GPUS=$((10#$NUM_GPUS))

# Auto-detection yields an empty string when `hostname -I` prints nothing.
# Warn instead of aborting so the original non-fatal behaviour is kept.
if [[ -z "$LOCAL_IP" ]]; then
    echo "Warning: could not determine the local IP; pass it as the second argument" >&2
fi

echo "πŸš€ Starting Remote GPU Workers"
echo "==============================="
echo "🌐 Dispatcher: $DISPATCHER_URL"
echo "πŸ“ Local IP: $LOCAL_IP"
echo "πŸ–₯️  GPUs: $NUM_GPUS"
echo ""

# Preflight: the files listed below must exist as regular files in the
# current working directory. Stop at the first one that is missing and print
# an scp hint for fetching them.
REQUIRED_FILES=(
    "worker.py"
    "utils.py"
    "latent_stats.json"
)
for required in "${REQUIRED_FILES[@]}"; do
    # File is present: nothing to report, move on to the next one.
    [[ -f "$required" ]] && continue

    echo "❌ Error: $required not found"
    echo "πŸ’‘ Copy required files from main machine:"
    echo "   scp user@dispatcher-machine:/path/to/{worker.py,utils.py,latent_stats.json,config_*.yaml} ."
    exit 1
done

# Ask PyTorch whether CUDA is usable and how many devices it can see.
# The probe is informational only: its exit status is not checked, so the
# script carries on even if the Python one-liner fails.
printf '%s\n' "πŸ§ͺ Testing GPU access..."
GPU_PROBE_SNIPPET="import torch; print(f'βœ… CUDA available: {torch.cuda.is_available()}'); print(f'πŸ“Š GPU count: {torch.cuda.device_count()}')"
python -c "$GPU_PROBE_SNIPPET"

# Preflight: make sure the dispatcher answers before any worker is launched.
# curl runs silently, allows 5 seconds for the connection to be established,
# and has its response body discarded; only its exit status is used.
echo "🌐 Testing dispatcher connectivity..."
if ! curl -s --connect-timeout 5 "$DISPATCHER_URL" > /dev/null; then
    echo "❌ Cannot reach dispatcher at $DISPATCHER_URL"
    echo "πŸ’‘ Check network connectivity and dispatcher status"
    exit 1
fi
echo "βœ… Dispatcher reachable"

# Start workers
echo "πŸ”§ Starting $NUM_GPUS GPU workers..."

#######################################
# Launch one worker.py process pinned to a single GPU and confirm it is still
# alive after the startup delay.
# Globals:
#   LOCAL_IP, DISPATCHER_URL  (read)
#   WORKER_START_DELAY        (read; seconds to wait after launch, default 2)
#   WORKER_PID                (written; PID of the process just launched)
# Arguments:
#   $1 - zero-based GPU index; the worker listens on port 8001 + index
# Outputs:
#   progress lines on stdout, a failure line on stderr;
#   the worker's own stdout/stderr go to worker_gpu_<index>.log
# Returns:
#   0 if the process is still running after the delay, 1 otherwise
#######################################
launch_worker() {
    local gpu_index=$1
    local port=$((8001 + gpu_index))
    local address="${LOCAL_IP}:${port}"
    local log_file="worker_gpu_${gpu_index}.log"

    echo "Starting worker on GPU $gpu_index: $address"

    CUDA_VISIBLE_DEVICES=$gpu_index python worker.py \
        --worker-address "$address" \
        --dispatcher-url "$DISPATCHER_URL" \
        > "$log_file" 2>&1 &

    # Left global, as in the original script.
    WORKER_PID=$!

    # This pause spaces out consecutive starts and also gives a worker that
    # crashes on startup time to die, so it can be detected below.
    sleep "${WORKER_START_DELAY:-2}"

    # Signal 0 delivers nothing; it only tests whether the PID still exists.
    if kill -0 "$WORKER_PID" 2>/dev/null; then
        echo "βœ… Worker $gpu_index started (PID: $WORKER_PID)"
        return 0
    fi

    echo "❌ Worker $gpu_index (PID: $WORKER_PID) exited during startup; see $log_file" >&2
    return 1
}

# Keep launching the remaining workers even if one fails, so that healthy
# GPUs still come up; failures are counted and reported once at the end.
FAILED_WORKERS=0
for ((i=0; i<NUM_GPUS; i++)); do
    launch_worker "$i" || FAILED_WORKERS=$((FAILED_WORKERS + 1))
done

if (( FAILED_WORKERS > 0 )); then
    echo "❌ $FAILED_WORKERS of $NUM_GPUS workers failed to start" >&2
fi

#######################################
# Print the post-launch summary: one log-tailing command and one health-check
# URL per worker, followed by instructions for stopping the workers.
# Globals:
#   NUM_GPUS (read) - number of workers to list
#   LOCAL_IP (read) - host part of the health-check URLs
# Outputs:
#   summary text on stdout
#######################################
print_worker_summary() {
    local gpu port

    echo ""
    echo "πŸŽ‰ All workers started!"
    echo "πŸ“‹ Monitor logs:"
    for ((gpu=0; gpu<NUM_GPUS; gpu++)); do
        echo "   GPU $gpu: tail -f worker_gpu_${gpu}.log"
    done
    echo ""
    echo "πŸ” Check worker health:"
    for ((gpu=0; gpu<NUM_GPUS; gpu++)); do
        # Same port scheme the launch loop uses: 8001 + GPU index.
        port=$((8001 + gpu))
        echo "   GPU $gpu: curl http://${LOCAL_IP}:${port}/health"
    done
    echo ""
    echo "⚠️  To stop workers: pkill -f 'python.*worker.py'"
    # The script never reads a key press, and its SIGINT trap stops the
    # workers and exits, so the prompt says exactly that.
    echo "Monitoring workers. Press Ctrl+C to stop the workers and exit..."
}

print_worker_summary

# Keep the script in the foreground; Ctrl+C (SIGINT) stops every process whose
# command line matches the worker pattern and then exits with status 0.
trap 'echo ""; echo "πŸ›‘ Stopping workers..."; pkill -f "python.*worker.py"; exit 0' SIGINT

#######################################
# Print how many live processes have a command line matching the worker
# pattern. pgrep never reports itself, unlike `ps aux | grep`, which always
# counted the grep process as one extra "worker".
# Outputs:
#   the count as a bare integer on stdout ("0" when nothing matches)
#######################################
count_running_workers() {
    local n
    n=$(pgrep -f "python.*worker.py" | wc -l)
    # Arithmetic expansion strips any padding some wc implementations add.
    echo $(( n ))
}

#######################################
# Show real-time worker status: print a timestamped count every 10 seconds
# until no matching worker process is left.
# Outputs:
#   status lines on stdout, a final notice on stderr
# Returns:
#   1 once no worker process remains (the loop has no other exit)
#######################################
monitor_workers() {
    local running
    while true; do
        running=$(count_running_workers)
        echo "$(date): $running workers running"
        if (( running == 0 )); then
            # Nothing left to watch; stop instead of looping forever.
            echo "❌ No workers are running any more; leaving the monitor loop" >&2
            return 1
        fi
        sleep 10
    done
}

monitor_workers