File size: 4,663 Bytes
c3d464b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c74f490
 
 
 
 
 
c3d464b
 
 
 
 
 
 
 
 
 
 
 
 
8b76adf
c3d464b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c74f490
 
c3d464b
c74f490
 
c3d464b
c74f490
 
c3d464b
 
 
 
 
c74f490
 
 
 
 
 
 
 
 
c3d464b
 
 
 
 
 
 
 
 
 
 
8b76adf
c3d464b
2ff6d31
 
 
 
c3d464b
8b76adf
c3d464b
 
 
 
 
 
 
 
 
 
 
 
c74f490
 
 
 
c3d464b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/bin/bash

# Multi-GPU Neural OS Startup Script

# Default values (overridable from the command line)
NUM_GPUS=2
DISPATCHER_PORT=8000

#######################################
# Parse the script's command line.
# Globals:   NUM_GPUS (written), DISPATCHER_PORT (written)
# Arguments: the command line words, normally "$@"
# Outputs:   usage text on stdout for -h/--help; diagnostics on stderr
# Returns:   exits 0 after printing help; exits 1 on an unknown option,
#            a missing option value, or an invalid value
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --num-gpus)
                # A trailing "--num-gpus" with no value would make `shift 2`
                # fail without shifting, and the loop would never terminate.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                # NUM_GPUS is used in arithmetic and numeric comparisons
                # further down, so only a plain positive integer is accepted.
                if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
                    echo "Invalid value for $1: '$2' (expected a positive integer)" >&2
                    exit 1
                fi
                NUM_GPUS="$2"
                shift 2
                ;;
            --port)
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value" >&2
                    exit 1
                fi
                # The regex guarantees digits only, so the arithmetic range
                # check below cannot evaluate arbitrary text.
                if [[ ! "$2" =~ ^[1-9][0-9]{0,4}$ ]] || (( $2 > 65535 )); then
                    echo "Invalid value for $1: '$2' (expected a port in 1-65535)" >&2
                    exit 1
                fi
                DISPATCHER_PORT="$2"
                shift 2
                ;;
            -h|--help)
                echo "Usage: $0 [--num-gpus N] [--port PORT]"
                echo "  --num-gpus N    Number of GPU workers to start (default: 2)"
                echo "  --port PORT     Dispatcher port (default: 8000)"
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

#######################################
# Stop the dispatcher and all GPU workers, then terminate the script.
# Globals:   DISPATCHER_PID (read; may be unset or empty when the
#            dispatcher has not been started yet)
# Arguments: $1 - exit status to terminate with (optional, default 0)
# Outputs:   progress messages on stdout
# Returns:   never returns; always exits the current shell
#######################################
cleanup() {
    local exit_status="${1:-0}"

    # Ignore further INT/TERM so that a second Ctrl+C during the grace
    # period cannot re-enter this function halfway through the shutdown.
    trap '' SIGINT SIGTERM

    echo ""
    echo "🛑 Shutting down system..."

    # Kill dispatcher and reap it so it does not linger as a zombie
    if [[ -n "${DISPATCHER_PID:-}" ]]; then
        echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
        kill "$DISPATCHER_PID" 2>/dev/null
        wait "$DISPATCHER_PID" 2>/dev/null
    fi

    # This script records no worker PIDs, so workers are matched by their
    # command line. pkill returns non-zero when nothing matched, which is
    # not an error here.
    echo "Stopping workers..."
    pkill -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
    # Grace period for a clean exit before force-killing any survivors
    sleep 2
    pkill -9 -f "python.*worker.py.*--gpu-id" 2>/dev/null || true

    echo "✅ System stopped"
    exit "$exit_status"
}

# Set up signal handlers: Ctrl+C or a TERM signal runs cleanup
trap cleanup SIGINT SIGTERM

#######################################
# Print the list of worker ports.
# Arguments: $1 - number of workers
# Outputs:   ports 8001 .. 8000+$1 joined by ", " on stdout, without a
#            trailing separator or newline (nothing when $1 is 0)
#######################################
worker_port_list() {
    local count="$1"
    local ports=""
    local i
    # Built in pure bash: BSD/macOS `seq -s` appends a trailing separator,
    # so the seq-based list was not portable.
    for ((i = 1; i <= count; i++)); do
        ports+="${ports:+, }$((8000 + i))"
    done
    printf '%s' "$ports"
}

echo "🚀 Starting Multi-GPU Neural OS System"
echo "========================================"
echo "📊 Number of GPUs: $NUM_GPUS"
echo "🌐 Dispatcher port: $DISPATCHER_PORT"
echo "💻 Worker ports: $(worker_port_list "$NUM_GPUS")"
echo "📈 Analytics logging: system_analytics_$(date +%Y%m%d_%H%M%S).log"
echo ""

# Check that every script this launcher depends on exists in the current
# directory; abort on the first missing one.
for required_file in dispatcher.py worker.py start_workers.py; do
    if [[ ! -f "$required_file" ]]; then
        echo "❌ Error: $required_file not found" >&2
        exit 1
    fi
done

# Start dispatcher in the background; its stdout and stderr go to
# dispatcher.log. DISPATCHER_PID is read later by cleanup and the monitor loop.
echo "🎯 Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start (or to fail fast)
sleep 3

# Check if dispatcher started successfully. `kill -0` delivers no signal;
# it only reports whether the process still exists.
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "❌ Failed to start dispatcher. Check dispatcher.log for errors." >&2
    exit 1
fi

echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"

# Start workers. start_workers.py runs in the foreground; its stdout and
# stderr go to workers.log.
echo "🔧 Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1
WORKER_START_EXIT_CODE=$?

# Wait a bit for workers to register
sleep 3

# Check if the worker launcher itself reported failure
if [[ "$WORKER_START_EXIT_CODE" -ne 0 ]]; then
    echo "❌ Failed to start workers. Check workers.log for errors." >&2
    # cleanup finishes with its own `exit`, which would end this script
    # before the failure status below is reached. Running it in a subshell
    # confines that exit, so the script really terminates with status 1.
    ( cleanup )
    # The dispatcher is a child of this shell, not of the subshell, so it
    # can only be reaped here.
    wait "$DISPATCHER_PID" 2>/dev/null
    exit 1
fi

# Check if workers are actually running by looking for their processes.
# pgrep never reports itself, whereas `ps aux | grep -c` also counted the
# grep process and so over-reported by one. `wc -l` always prints exactly
# one number, and $(( )) strips any padding it adds.
RUNNING_WORKERS=$(( $(pgrep -f "python.*worker.py.*--gpu-id" | wc -l) ))
if [[ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]]; then
    echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors." >&2
    ( cleanup )
    wait "$DISPATCHER_PID" 2>/dev/null
    exit 1
fi

echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
#######################################
# Print the "system is ready" summary: URLs, log file names and usage hints.
# Globals:   NUM_GPUS (read), DISPATCHER_PORT (read)
# Arguments: none
# Outputs:   the summary on stdout; one health URL (ports from 8001 up)
#            and one log file name per GPU index 0 .. NUM_GPUS-1
#######################################
print_ready_banner() {
    local gpu

    echo ""
    echo "🎉 System is ready!"
    echo "================================"
    echo "🌍 Web interface: http://localhost:$DISPATCHER_PORT"
    echo "📊 Dispatcher health: http://localhost:$DISPATCHER_PORT"
    echo "🔧 Worker health checks:"
    for ((gpu = 0; gpu < NUM_GPUS; gpu++)); do
        echo "   GPU $gpu: http://localhost:$((8001 + gpu))/health"
    done
    echo ""
    echo "📋 Log files:"
    echo "   System analytics: system_analytics_*.log (real-time monitoring)"
    echo "   Dispatcher: dispatcher.log"
    echo "   Workers summary: workers.log"
    for ((gpu = 0; gpu < NUM_GPUS; gpu++)); do
        echo "   GPU $gpu worker: worker_gpu_$gpu.log"
    done
    echo ""
    echo "💡 Monitor system in real-time: tail -f system_analytics_*.log"
    echo "Press Ctrl+C to stop the system"
    echo "================================"
}

print_ready_banner

# Keep the script running until interrupted. Every 5 seconds, verify that
# the dispatcher and the expected number of workers are still alive; shut
# everything down and exit with status 1 as soon as one of them is gone.
while true; do
    # Check if the dispatcher is still running (`kill -0` sends no signal)
    if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
        echo "⚠️  Dispatcher process died unexpectedly" >&2
        # cleanup finishes with its own `exit`; the subshell confines it so
        # that the failure status below is the one actually returned.
        ( cleanup )
        exit 1
    fi

    # Check if workers are still running. pgrep never reports itself,
    # whereas `ps aux | grep -c` also counted the grep process, which hid
    # the loss of a single worker. $(( )) strips padding added by `wc -l`.
    CURRENT_WORKERS=$(( $(pgrep -f "python.*worker.py.*--gpu-id" | wc -l) ))
    if [[ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]]; then
        echo "⚠️  Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS" >&2
        ( cleanup )
        # The dispatcher is a child of this shell, not of the subshell, so
        # it can only be reaped here.
        wait "$DISPATCHER_PID" 2>/dev/null
        exit 1
    fi

    sleep 5
done