da03 committed on
Commit
c74f490
·
1 Parent(s): c3d464b
Files changed (2) hide show
  1. dispatcher.py +7 -1
  2. start_system.sh +25 -15
dispatcher.py CHANGED
@@ -438,4 +438,10 @@ async def startup_event():
438
 
439
  if __name__ == "__main__":
440
  import uvicorn
441
- uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
 
 
 
 
438
 
439
  if __name__ == "__main__":
440
  import uvicorn
441
+ import argparse
442
+
443
+ parser = argparse.ArgumentParser(description="Dispatcher for Neural OS")
444
+ parser.add_argument("--port", type=int, default=8000, help="Port to run the dispatcher on")
445
+ args = parser.parse_args()
446
+
447
+ uvicorn.run(app, host="0.0.0.0", port=args.port)
start_system.sh CHANGED
@@ -42,12 +42,12 @@ cleanup() {
42
  wait $DISPATCHER_PID 2>/dev/null
43
  fi
44
 
45
- # Kill workers
46
- if [[ -n $WORKERS_PID ]]; then
47
- echo "Stopping workers (PID: $WORKERS_PID)..."
48
- kill $WORKERS_PID 2>/dev/null
49
- wait $WORKERS_PID 2>/dev/null
50
- fi
51
 
52
  echo "✅ System stopped"
53
  exit 0
@@ -97,20 +97,28 @@ echo "✅ Dispatcher started (PID: $DISPATCHER_PID)"
97
 
98
  # Start workers
99
  echo "🔧 Starting $NUM_GPUS GPU workers..."
100
- python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1 &
101
- WORKERS_PID=$!
102
 
103
- # Wait a bit for workers to start
104
- sleep 5
105
 
106
- # Check if workers started successfully
107
- if ! kill -0 $WORKERS_PID 2>/dev/null; then
108
  echo "❌ Failed to start workers. Check workers.log for errors."
109
  cleanup
110
  exit 1
111
  fi
112
 
113
- echo "βœ… Workers started (PID: $WORKERS_PID)"
 
 
 
 
 
 
 
 
114
  echo ""
115
  echo "🎉 System is ready!"
116
  echo "================================"
@@ -137,8 +145,10 @@ while true; do
137
  exit 1
138
  fi
139
 
140
- if ! kill -0 $WORKERS_PID 2>/dev/null; then
141
- echo "⚠️ Workers process died unexpectedly"
 
 
142
  cleanup
143
  exit 1
144
  fi
 
42
  wait $DISPATCHER_PID 2>/dev/null
43
  fi
44
 
45
+ # Kill workers by finding their processes
46
+ echo "Stopping workers..."
47
+ pkill -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
48
+ sleep 2
49
+ # Force kill if any are still running
50
+ pkill -9 -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
51
 
52
  echo "✅ System stopped"
53
  exit 0
 
97
 
98
  # Start workers
99
  echo "🔧 Starting $NUM_GPUS GPU workers..."
100
+ python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
101
+ WORKER_START_EXIT_CODE=$?
102
 
103
+ # Wait a bit for workers to register
104
+ sleep 3
105
 
106
+ # Check if workers started successfully by checking the exit code and log
107
+ if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
108
  echo "❌ Failed to start workers. Check workers.log for errors."
109
  cleanup
110
  exit 1
111
  fi
112
 
113
+ # Check if workers are actually running by looking for their processes
114
+ RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
115
+ if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
116
+ echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
117
+ cleanup
118
+ exit 1
119
+ fi
120
+
121
+ echo "✅ Workers started successfully ($RUNNING_WORKERS workers running)"
122
  echo ""
123
  echo "🎉 System is ready!"
124
  echo "================================"
 
145
  exit 1
146
  fi
147
 
148
+ # Check if workers are still running
149
+ CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
150
+ if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
151
+ echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
152
  cleanup
153
  exit 1
154
  fi