Spaces:
Runtime error
Runtime error
da03
committed on
Commit
·
c3d464b
1
Parent(s):
fe31dcf
- start_system.sh +147 -0
start_system.sh
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
# Multi-GPU Neural OS Startup Script
|
4 |
+
|
5 |
+
# Default values
NUM_GPUS=2
DISPATCHER_PORT=8000

# Print the command-line help text to stdout.
usage() {
  echo "Usage: $0 [--num-gpus N] [--port PORT]"
  echo " --num-gpus N Number of GPU workers to start (default: 2)"
  echo " --port PORT Dispatcher port (default: 8000)"
}

# Print an error message to stderr and abort with status 1.
# Arguments: $1 - message
usage_error() {
  echo "$1" >&2
  exit 1
}

# Parse command line arguments
#
# Globals:   NUM_GPUS, DISPATCHER_PORT (written)
# Arguments: the script's command line ("$@")
# Exits:     0 after printing help; 1 on an unknown option, a missing
#            option value, or a value that is not a usable number.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --num-gpus)
        # Without this guard `shift 2` fails when the value is missing,
        # nothing is consumed, and the loop spins forever.
        [[ $# -ge 2 ]] || usage_error "Option $1 requires a value"
        NUM_GPUS="$2"
        shift 2
        ;;
      --port)
        [[ $# -ge 2 ]] || usage_error "Option $1 requires a value"
        DISPATCHER_PORT="$2"
        shift 2
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        usage_error "Unknown option: $1"
        ;;
    esac
  done

  # Both values are later used in arithmetic and passed to the Python
  # programs, so reject anything that is not a plain decimal number.
  if [[ ! "$NUM_GPUS" =~ ^[1-9][0-9]*$ ]]; then
    usage_error "Invalid --num-gpus value: $NUM_GPUS (expected a positive integer)"
  fi
  if [[ ! "$DISPATCHER_PORT" =~ ^[1-9][0-9]{0,4}$ ]] || ((DISPATCHER_PORT > 65535)); then
    usage_error "Invalid --port value: $DISPATCHER_PORT (expected 1-65535)"
  fi
}

parse_args "$@"
|
32 |
+
|
33 |
+
# Function to cleanup background processes
#
# Sends the default signal (TERM) to the dispatcher and to the worker launcher
# if their PIDs were recorded, and waits for each of them to exit.
#
# Returns to the caller instead of exiting: the failure paths call
# `cleanup` followed by `exit 1`, and an `exit 0` in here would make that
# `exit 1` unreachable and report success after a crash.
#
# Globals: DISPATCHER_PID, WORKERS_PID (read, then cleared so a second
#          call does not signal a PID that is already gone)
cleanup() {
  echo ""
  # NOTE(review): the leading "π" looks like a mis-decoded emoji; the
  # original glyph cannot be recovered from this copy - restore it.
  echo "π Shutting down system..."

  # Kill dispatcher
  if [[ -n "${DISPATCHER_PID:-}" ]]; then
    echo "Stopping dispatcher (PID: $DISPATCHER_PID)..."
    # Errors are silenced on purpose: the process may already have exited.
    kill "$DISPATCHER_PID" 2>/dev/null
    wait "$DISPATCHER_PID" 2>/dev/null
    DISPATCHER_PID=""
  fi

  # Kill workers
  if [[ -n "${WORKERS_PID:-}" ]]; then
    echo "Stopping workers (PID: $WORKERS_PID)..."
    kill "$WORKERS_PID" 2>/dev/null
    wait "$WORKERS_PID" 2>/dev/null
    WORKERS_PID=""
  fi

  # Glyph restored from a mis-decoded sequence that had split this literal
  # over two lines; presumed to be the check-mark emoji - confirm.
  echo "✅ System stopped"
}

# Set up signal handlers
# An interrupt is a requested shutdown, so it still ends with status 0.
trap 'cleanup; exit 0' SIGINT SIGTERM
|
58 |
+
|
59 |
+
# Print the first N worker ports, starting at 8001, as one ", "-separated line.
# Built in pure bash because `seq -s` is not portable: BSD/macOS seq appends
# the separator after the last number as well.
# Arguments: $1 - number of workers (an empty or missing value counts as 0)
# Outputs:   e.g. "8001, 8002" for N=2; an empty line for N=0
worker_port_list() {
  local count=${1:-0}
  local port
  local list=""
  for ((port = 8001; port <= 8000 + count; port++)); do
    # ${list:+, } yields the separator only once the list is non-empty.
    list+="${list:+, }${port}"
  done
  printf '%s\n' "$list"
}

# Startup banner.
# NOTE(review): each bare leading "π" looks like a mis-decoded emoji whose
# original glyph cannot be recovered from this copy - restore them.
echo "π Starting Multi-GPU Neural OS System"
echo "========================================"
echo "π Number of GPUs: $NUM_GPUS"
echo "π Dispatcher port: $DISPATCHER_PORT"
echo "💻 Worker ports: $(worker_port_list "$NUM_GPUS")"
echo ""
|
65 |
+
|
66 |
+
# Check if required files exist
# The scripts are looked up relative to the current working directory.
# The first missing one aborts the start-up with status 1; the message goes
# to stderr, like the argument parser's error output.
for required_file in dispatcher.py worker.py start_workers.py; do
  if [[ ! -f "$required_file" ]]; then
    echo "❌ Error: $required_file not found" >&2
    exit 1
  fi
done
|
81 |
+
|
82 |
+
# Start dispatcher
# Runs in the background; stdout and stderr are captured in dispatcher.log.
echo "🎯 Starting dispatcher..."
python dispatcher.py --port "$DISPATCHER_PORT" > dispatcher.log 2>&1 &
DISPATCHER_PID=$!

# Wait a bit for dispatcher to start
sleep 3

# Check if dispatcher started successfully
# kill -0 delivers no signal; it only reports whether the PID still exists.
if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
  echo "❌ Failed to start dispatcher. Check dispatcher.log for errors." >&2
  exit 1
fi

echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"
|
97 |
+
|
98 |
+
# Start workers
# start_workers.py runs in the background; stdout and stderr are captured in
# workers.log. WORKERS_PID is the PID of that launcher process.
echo "🔧 Starting $NUM_GPUS GPU workers..."
python start_workers.py --num-gpus "$NUM_GPUS" --no-monitor > workers.log 2>&1 &
WORKERS_PID=$!

# Wait a bit for workers to start
sleep 5

# Check if workers started successfully
# kill -0 delivers no signal; it only reports whether the PID still exists.
if ! kill -0 "$WORKERS_PID" 2>/dev/null; then
  echo "❌ Failed to start workers. Check workers.log for errors." >&2
  # Stop the dispatcher that is already running before giving up.
  cleanup
  exit 1
fi

echo "✅ Workers started (PID: $WORKERS_PID)"
|
114 |
+
# "System is ready" summary: dispatcher URLs, one health URL per worker
# (ports counted up from 8001), and the names of the two log files.
# NOTE(review): the leading "π" / "π§" sequences look like mis-decoded emoji;
# they are reproduced exactly as found here.
cat <<EOF

π System is ready!
================================
π Web interface: http://localhost:$DISPATCHER_PORT
π Dispatcher health: http://localhost:$DISPATCHER_PORT
π§ Worker health checks:
EOF

gpu=0
while ((gpu < NUM_GPUS)); do
  printf ' GPU %s: http://localhost:%s/health\n' "$gpu" "$((8001 + gpu))"
  gpu=$((gpu + 1))
done

cat <<EOF

π Log files:
 Dispatcher: dispatcher.log
 Workers: workers.log

Press Ctrl+C to stop the system
================================
EOF
|
130 |
+
|
131 |
+
# Keep the script running and wait for interrupt
# Every 5 seconds both background PIDs are probed; as soon as one of them
# no longer exists, cleanup is called to stop the other and the loop ends.
while true; do
  # Check if processes are still running
  # kill -0 delivers no signal; it only reports whether the PID still exists.
  if ! kill -0 "$DISPATCHER_PID" 2>/dev/null; then
    echo "⚠️ Dispatcher process died unexpectedly"
    cleanup
    exit 1
  fi

  if ! kill -0 "$WORKERS_PID" 2>/dev/null; then
    echo "⚠️ Workers process died unexpectedly"
    cleanup
    exit 1
  fi

  sleep 5
done
|