#!/bin/bash

# Multi-GPU Neural OS Startup Script
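#
# Example invocations (option names match the argument parser below; the
# script name is a placeholder for whatever this file is called in the repo):
#   ./start_system.sh                 # auto-detect GPUs, dispatcher on port 7860
#   ./start_system.sh --num-gpus 2    # start exactly 2 GPU workers
#   ./start_system.sh --port 9000     # run the dispatcher on port 9000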

# Function to detect number of GPUs automatically
detect_gpu_count() {
    if command -v nvidia-smi >/dev/null 2>&1; then
        # Use nvidia-smi to count GPUs
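        # `nvidia-smi -L` prints one "GPU <index>: <name> (UUID: ...)" line per
        # device, so counting lines gives the number of visible GPUs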
        local gpu_count=$(nvidia-smi -L 2>/dev/null | wc -l)
        if [ "$gpu_count" -gt 0 ]; then
            echo "$gpu_count"
            return 0
        fi
    fi
    
    # If nvidia-smi fails, try alternative methods
    if [ -d "/proc/driver/nvidia/gpus" ]; then
        local gpu_count=$(ls -d /proc/driver/nvidia/gpus/*/information 2>/dev/null | wc -l)
        if [ "$gpu_count" -gt 0 ]; then
            echo "$gpu_count"
            return 0
        fi
    fi
    
    # Default fallback
    echo "1"
    return 1
}

# Detect GPU count automatically
DETECTED_GPUS=$(detect_gpu_count)
GPU_DETECTION_SUCCESS=$?

# Default values
NUM_GPUS=$DETECTED_GPUS
DISPATCHER_PORT=7860
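
# Port layout assumed throughout this script: the dispatcher listens on
# $DISPATCHER_PORT, and worker i is expected on port $((8001 + i))
# (see the worker-port and health-check echoes below).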

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --num-gpus)
            NUM_GPUS="$2"
            shift 2
            ;;
        --port)
            DISPATCHER_PORT="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [--num-gpus N] [--port PORT]"
            echo "  --num-gpus N    Number of GPU workers to start (default: auto-detected)"
            echo "  --port PORT     Dispatcher port (default: 7860)"
            echo ""
            echo "GPU Detection:"
            echo "  Automatically detects available GPUs using nvidia-smi"
            echo "  Currently detected: $DETECTED_GPUS GPU(s)"
            if [ $GPU_DETECTION_SUCCESS -ne 0 ]; then
                echo "  ⚠️  GPU detection failed - using fallback of 1 GPU"
            fi
            exit 0
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
done

# Function to cleanup background processes
cleanup() {
    echo ""
    echo "🛑 Shutting down system..."
    
    # Kill dispatcher
    if [[ -n $DISPATCHER_PID ]]; then
        echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
        kill $DISPATCHER_PID 2>/dev/null
        wait $DISPATCHER_PID 2>/dev/null
    fi
    
    # Kill workers by finding their processes
    echo "Stopping workers..."
    pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true
    sleep 2
    # Force kill if any are still running
    pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true
    
    echo "✅ System stopped"
    exit 0
}

# Set up signal handlers
trap cleanup SIGINT SIGTERM

echo "🚀 Starting Multi-GPU Neural OS System"
echo "========================================"
echo "🔍 GPU Detection: $DETECTED_GPUS GPU(s) detected"
if [ $GPU_DETECTION_SUCCESS -ne 0 ]; then
    echo "⚠️  GPU detection failed - using fallback count"
elif command -v nvidia-smi >/dev/null 2>&1; then
    echo "💎 Detected GPUs:"
    nvidia-smi -L 2>/dev/null | sed 's/^/   /'
fi
echo "📊 Number of GPUs: $NUM_GPUS"
echo "🌐 Dispatcher port: $DISPATCHER_PORT"
echo "💻 Worker ports: $(seq -s', ' 8001 $((8000 + NUM_GPUS)))"
echo "📈 Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log"
echo ""

# Validate that we're not trying to start more workers than GPUs
if [ "$NUM_GPUS" -gt "$DETECTED_GPUS" ]; then
    echo "⚠️  Warning: Trying to start $NUM_GPUS workers but only $DETECTED_GPUS GPU(s) detected"
    echo "   This may cause GPU sharing or errors. Consider using --num-gpus $DETECTED_GPUS"
    echo ""
fi

# Check if required files exist
if [[ ! -f "dispatcher.py" ]]; then
    echo "❌ Error: dispatcher.py not found"
    exit 1
fi

if [[ ! -f "worker.py" ]]; then
    echo "❌ Error: worker.py not found"
    exit 1
fi

if [[ ! -f "start_workers.py" ]]; then
    echo "❌ Error: start_workers.py not found"
    exit 1
fi

# Start dispatcher
echo "🎯 Starting dispatcher..."
python dispatcher.py --port $DISPATCHER_PORT > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# Check if dispatcher started successfully
if ! kill -0 $DISPATCHER_PID 2>/dev/null; then
    echo "❌ Failed to start dispatcher. Check dispatcher.log for errors."
    exit 1
fi

echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers
echo "🔧 Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus $NUM_GPUS --dispatcher-url "http://localhost:$DISPATCHER_PORT" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?

# Wait for workers to fully load models and register (60 seconds)
echo "⏳ Waiting 60 seconds for workers to load models and register..."
sleep 60
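
# Optional sketch (not enabled): instead of a fixed sleep, the script could poll
# each worker's /health endpoint (the same URLs printed below) until it responds,
# assuming curl is available:
#   for ((i=0; i<NUM_GPUS; i++)); do
#       until curl -sf "http://localhost:$((8001 + i))/health" >/dev/null; do
#           sleep 2
#       done
#   done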

# Check if workers started successfully by checking the exit code and log
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
    echo "❌ Failed to start workers. Check workers.log for errors."
    cleanup
    exit 1
fi

# Check if workers are actually running by looking for their processes (matches the --worker-address format)
# (grep -v grep avoids counting the grep process itself; wc -l always returns a single number)
RUNNING_WORKERS=$(ps aux | grep "python.*worker.py.*--worker-address" | grep -v grep | wc -l)
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
    echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
    cleanup
    exit 1
fi

echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
echo ""
echo "🎉 System is ready!"
echo "================================"
echo "🌍 Web interface: http://localhost:$DISPATCHER_PORT"
echo "📊 Dispatcher health: http://localhost:$DISPATCHER_PORT"
echo "🔧 Worker health checks:"
for ((i=0; i<NUM_GPUS; i++)); do
    echo "   GPU $i: http://localhost:$((8001 + i))/health"
done
echo ""
echo "📋 Log files:"
echo "   📊 Analytics (human-readable): system_analytics_*.log"
echo "   🖥️  GPU metrics (JSON): gpu_metrics_*.jsonl"
echo "   🔗 Connection events (JSON): connection_events_*.jsonl"
echo "   📏 Queue metrics (JSON): queue_metrics_*.jsonl"
echo "   🌍 IP statistics (JSON): ip_stats_*.jsonl"
echo "   🎯 Dispatcher: dispatcher.log"
echo "   🔧 Workers summary: workers.log"
for ((i=0; i<NUM_GPUS; i++)); do
    echo "   🖥️  GPU $i worker: worker_gpu_$i.log"
done
echo ""
echo "💡 Real-time monitoring:"
echo "   Human-readable: tail -f system_analytics_*.log"
echo "   GPU utilization: tail -f gpu_metrics_*.jsonl"
echo "   Connection events: tail -f connection_events_*.jsonl"
echo ""
echo "📈 Data analysis:"
echo "   Summary report: python analyze_analytics.py"
echo "   Last 6 hours: python analyze_analytics.py --since 6"
echo "   GPU analysis only: python analyze_analytics.py --type gpu"
echo "Press Ctrl+C to stop the system"
echo "================================"

# Keep the script running and wait for interrupt
while true; do
    # Check if processes are still running
    if ! kill -0 $DISPATCHER_PID 2>/dev/null; then
        echo "⚠️  Dispatcher process died unexpectedly"
        cleanup
        exit 1
    fi
    
    # Check if workers are still running (grep -v grep avoids counting the grep process itself)
    CURRENT_WORKERS=$(ps aux | grep "python.*worker.py.*--worker-address" | grep -v grep | wc -l)
    if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
        echo "⚠️  Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
        echo "🔄 System will continue operating with reduced capacity"
        echo "💡 Check worker logs for error details"
        # Don't exit - keep system running with remaining workers
    fi
    
    sleep 5
done