Spaces:
Runtime error
Runtime error
da03
committed on
Commit
·
c74f490
1
Parent(s):
c3d464b
- dispatcher.py +7 -1
- start_system.sh +25 -15
dispatcher.py
CHANGED
@@ -438,4 +438,10 @@ async def startup_event():
|
|
438 |
|
439 |
if __name__ == "__main__":
|
440 |
import uvicorn
|
441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
if __name__ == "__main__":
|
440 |
import uvicorn
|
441 |
+
import argparse
|
442 |
+
|
443 |
+
parser = argparse.ArgumentParser(description="Dispatcher for Neural OS")
|
444 |
+
parser.add_argument("--port", type=int, default=8000, help="Port to run the dispatcher on")
|
445 |
+
args = parser.parse_args()
|
446 |
+
|
447 |
+
uvicorn.run(app, host="0.0.0.0", port=args.port)
|
start_system.sh
CHANGED
@@ -42,12 +42,12 @@ cleanup() {
|
|
42 |
wait $DISPATCHER_PID 2>/dev/null
|
43 |
fi
|
44 |
|
45 |
-
# Kill workers
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
|
52 |
echo "✅ System stopped"
|
53 |
exit 0
|
@@ -97,20 +97,28 @@ echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"
|
|
97 |
|
98 |
# Start workers
|
99 |
echo "🔧 Starting $NUM_GPUS GPU workers..."
|
100 |
-
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
101 |
-
|
102 |
|
103 |
-
# Wait a bit for workers to
|
104 |
-
sleep
|
105 |
|
106 |
-
# Check if workers started successfully
|
107 |
-
if
|
108 |
echo "❌ Failed to start workers. Check workers.log for errors."
|
109 |
cleanup
|
110 |
exit 1
|
111 |
fi
|
112 |
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
echo ""
|
115 |
echo "🎉 System is ready!"
|
116 |
echo "================================"
|
@@ -137,8 +145,10 @@ while true; do
|
|
137 |
exit 1
|
138 |
fi
|
139 |
|
140 |
-
|
141 |
-
|
|
|
|
|
142 |
cleanup
|
143 |
exit 1
|
144 |
fi
|
|
|
42 |
wait $DISPATCHER_PID 2>/dev/null
|
43 |
fi
|
44 |
|
45 |
+
# Kill workers by finding their processes
|
46 |
+
echo "Stopping workers..."
|
47 |
+
pkill -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
|
48 |
+
sleep 2
|
49 |
+
# Force kill if any are still running
|
50 |
+
pkill -9 -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
|
51 |
|
52 |
echo "✅ System stopped"
|
53 |
exit 0
|
|
|
97 |
|
98 |
# Start workers
|
99 |
echo "🔧 Starting $NUM_GPUS GPU workers..."
|
100 |
+
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
101 |
+
WORKER_START_EXIT_CODE=$?
|
102 |
|
103 |
+
# Wait a bit for workers to register
|
104 |
+
sleep 3
|
105 |
|
106 |
+
# Check if workers started successfully by checking the exit code and log
|
107 |
+
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
108 |
echo "❌ Failed to start workers. Check workers.log for errors."
|
109 |
cleanup
|
110 |
exit 1
|
111 |
fi
|
112 |
|
113 |
+
# Check if workers are actually running by looking for their processes
|
114 |
+
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
|
115 |
+
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
|
116 |
+
echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
|
117 |
+
cleanup
|
118 |
+
exit 1
|
119 |
+
fi
|
120 |
+
|
121 |
+
echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
|
122 |
echo ""
|
123 |
echo "🎉 System is ready!"
|
124 |
echo "================================"
|
|
|
145 |
exit 1
|
146 |
fi
|
147 |
|
148 |
+
# Check if workers are still running
|
149 |
+
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
|
150 |
+
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
151 |
+
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
152 |
cleanup
|
153 |
exit 1
|
154 |
fi
|