File size: 5,270 Bytes
c3d464b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c74f490
 
 
 
 
 
c3d464b
 
 
 
 
 
 
 
 
 
 
 
 
8b76adf
c3d464b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c74f490
 
c3d464b
c74f490
 
c3d464b
c74f490
 
c3d464b
 
 
 
 
c74f490
 
 
 
 
 
 
 
 
c3d464b
 
 
 
 
 
 
 
 
 
 
fa82766
 
 
 
 
 
 
2ff6d31
fa82766
2ff6d31
c3d464b
fa82766
 
 
 
 
 
 
 
 
c3d464b
 
 
 
 
 
 
 
 
 
 
 
c74f490
 
 
 
c3d464b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/bin/bash

# Multi-GPU Neural OS Startup Script

# Default values (overridable from the command line)
NUM_GPUS=2
DISPATCHER_PORT=8000

#######################################
# Parse and validate the command line arguments.
# Globals:   NUM_GPUS (written), DISPATCHER_PORT (written)
# Arguments: the script's positional parameters ("$@")
# Outputs:   usage text on stdout for -h/--help; diagnostics on stderr
# Returns:   exits 0 after printing help; exits 1 on an unknown option,
#            a missing option value, or an invalid value
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --num-gpus|--port)
                # Without this guard `shift 2` fails when the value is missing,
                # leaves $# unchanged, and the loop never terminates.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                if [[ "$1" == "--num-gpus" ]]; then
                    NUM_GPUS="$2"
                else
                    DISPATCHER_PORT="$2"
                fi
                shift 2
                ;;
            -h|--help)
                echo "Usage: $0 [--num-gpus N] [--port PORT]"
                echo "  --num-gpus N    Number of GPU workers to start (default: 2)"
                echo "  --port PORT     Dispatcher port (default: 8000)"
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done

    # Both values are later used in shell arithmetic and loop bounds, so
    # reject anything that is not a plain decimal number (no leading zero,
    # which arithmetic expansion would treat as octal).
    if [[ ! "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
        echo "Invalid --num-gpus value: $NUM_GPUS (expected a positive integer)" >&2
        exit 1
    fi
    if [[ ! "$DISPATCHER_PORT" =~ ^[1-9][0-9]{0,4}$ ]] || (( DISPATCHER_PORT > 65535 )); then
        echo "Invalid --port value: $DISPATCHER_PORT (expected 1-65535)" >&2
        exit 1
    fi
}

# Parse command line arguments
parse_args "$@"

#######################################
# Stop the dispatcher and all GPU workers, then exit the script.
# Globals:   DISPATCHER_PID (read; dispatcher step is skipped when unset/empty)
# Arguments: $1 - exit status for the script (optional, default 0)
# Outputs:   progress messages on stdout
# Returns:   never returns; always exits with the requested status
#######################################
cleanup() {
    local exit_status="${1:-0}"
    # Workers are located by command line because their PIDs are not
    # tracked by this script.
    local worker_pattern="python.*worker.py.*--gpu-id"

    echo ""
    echo "πŸ›‘ Shutting down system..."

    # Kill dispatcher and reap it so it does not linger as a zombie
    if [[ -n "${DISPATCHER_PID:-}" ]]; then
        echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
        kill "$DISPATCHER_PID" 2>/dev/null
        wait "$DISPATCHER_PID" 2>/dev/null
    fi

    # Ask workers to terminate; "no process matched" is not an error here
    echo "Stopping workers..."
    pkill -f "$worker_pattern" 2>/dev/null || true
    # Only wait and force kill when something is actually still running
    if pgrep -f "$worker_pattern" >/dev/null 2>&1; then
        sleep 2
        pkill -9 -f "$worker_pattern" 2>/dev/null || true
    fi

    echo "βœ… System stopped"
    # Propagate the caller's status so failure paths do not report success
    exit "$exit_status"
}

# Set up signal handlers (no argument is passed, so a signal exits with 0)
trap cleanup SIGINT SIGTERM

# Startup banner: summarize the configuration before anything is launched.
# Worker ports are listed as 8001 .. 8000+NUM_GPUS; the analytics file name
# shown is built from the current time at the moment the banner is printed.
printf '%s\n' \
    "πŸš€ Starting Multi-GPU Neural OS System" \
    "========================================" \
    "πŸ“Š Number of GPUs: $NUM_GPUS" \
    "🌐 Dispatcher port: $DISPATCHER_PORT" \
    "πŸ’» Worker ports: $(seq -s', ' 8001 $((8000 + NUM_GPUS)))" \
    "πŸ“ˆ Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log" \
    ""

# Abort early unless every script this launcher relies on is present in the
# current working directory (checked in launch order).
for required_script in dispatcher.py worker.py start_workers.py; do
    if [[ -f "$required_script" ]]; then
        continue
    fi
    echo "❌ Error: $required_script not found"
    exit 1
done

# Start the dispatcher in the background; its stdout and stderr both go to
# dispatcher.log. The port is quoted so it is always passed as one argument.
echo "🎯 Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# kill -0 delivers no signal; it only reports whether the PID still exists
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "❌ Failed to start dispatcher. Check dispatcher.log for errors."
    exit 1
fi

echo "βœ… Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers: start_workers.py runs in the foreground and its output is
# captured in workers.log.
echo "πŸ”§ Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?

# A failed launcher cannot be rescued by waiting, so check its status first
if [[ "$WORKER_START_EXIT_CODE" -ne 0 ]]; then
    echo "❌ Failed to start workers. Check workers.log for errors."
    # cleanup exits the script; the status argument marks this as a failure
    cleanup 1
    exit 1
fi

# Wait a bit for workers to register
sleep 3

# Count live worker processes. pgrep never reports itself, unlike
# `ps aux | grep`, whose grep process matches its own pattern and inflates
# the count by one. `wc -l` always prints exactly one number, even for zero
# matches; the arithmetic expansion strips any padding some wc builds add.
RUNNING_WORKERS=$(( $(pgrep -f "python.*worker.py.*--gpu-id" | wc -l) ))
if [[ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]]; then
    echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
    cleanup 1
    exit 1
fi

echo "βœ… Workers started successfully ($RUNNING_WORKERS workers running)"
# Ready banner: where to reach the dispatcher and each worker.
# Worker i is advertised on port 8001 + i.
printf '%s\n' \
    "" \
    "πŸŽ‰ System is ready!" \
    "================================" \
    "🌍 Web interface: http://localhost:$DISPATCHER_PORT" \
    "πŸ“Š Dispatcher health: http://localhost:$DISPATCHER_PORT" \
    "πŸ”§ Worker health checks:"
for ((gpu = 0; gpu < NUM_GPUS; gpu++)); do
    printf '%s\n' "   GPU $gpu: http://localhost:$((8001 + gpu))/health"
done

# Log file overview (the * patterns are printed literally, not expanded)
printf '%s\n' \
    "" \
    "πŸ“‹ Log files:" \
    "   πŸ“Š Analytics (human-readable): system_analytics_*.log" \
    "   πŸ–₯️  GPU metrics (JSON): gpu_metrics_*.jsonl" \
    "   πŸ”— Connection events (JSON): connection_events_*.jsonl" \
    "   πŸ“ Queue metrics (JSON): queue_metrics_*.jsonl" \
    "   🌍 IP statistics (JSON): ip_stats_*.jsonl" \
    "   🎯 Dispatcher: dispatcher.log" \
    "   πŸ”§ Workers summary: workers.log"
for ((gpu = 0; gpu < NUM_GPUS; gpu++)); do
    printf '%s\n' "   πŸ–₯️  GPU $gpu worker: worker_gpu_$gpu.log"
done

# Handy commands for monitoring and analysis, then the stop hint
printf '%s\n' \
    "" \
    "πŸ’‘ Real-time monitoring:" \
    "   Human-readable: tail -f system_analytics_*.log" \
    "   GPU utilization: tail -f gpu_metrics_*.jsonl" \
    "   Connection events: tail -f connection_events_*.jsonl" \
    "" \
    "πŸ“ˆ Data analysis:" \
    "   Summary report: python analyze_analytics.py" \
    "   Last 6 hours: python analyze_analytics.py --since 6" \
    "   GPU analysis only: python analyze_analytics.py --type gpu" \
    "Press Ctrl+C to stop the system" \
    "================================"

# Keep the script running until it is interrupted or a managed process dies.
# Every 5 seconds: verify the dispatcher PID still exists and that at least
# NUM_GPUS worker processes are alive; otherwise shut everything down.
while true; do
    # kill -0 delivers no signal; it only reports whether the PID still exists
    if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
        echo "⚠️  Dispatcher process died unexpectedly"
        # cleanup exits the script; the status argument marks this as a failure
        cleanup 1
        exit 1
    fi

    # pgrep never reports itself, unlike `ps aux | grep`, whose grep process
    # matches its own pattern and hides one dead worker. `wc -l` always
    # prints exactly one number, even when nothing matches.
    CURRENT_WORKERS=$(( $(pgrep -f "python.*worker.py.*--gpu-id" | wc -l) ))
    if [[ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]]; then
        echo "⚠️  Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
        cleanup 1
        exit 1
    fi

    # Sleep in the background and wait on it: the wait builtin returns as
    # soon as a trapped signal arrives, so SIGTERM is handled immediately
    # instead of after the remaining sleep time.
    sleep 5 &
    wait "$!"
done