da03 committed on
Commit
c686855
·
1 Parent(s): 1173f9e
Files changed (1) hide show
  1. start_system.sh +8 -7
start_system.sh CHANGED
@@ -44,10 +44,10 @@ cleanup() {
44
 
45
  # Kill workers by finding their processes
46
  echo "Stopping workers..."
47
- pkill -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
48
  sleep 2
49
  # Force kill if any are still running
50
- pkill -9 -f "python.*worker.py.*--gpu-id" 2>/dev/null || true
51
 
52
  echo "✅ System stopped"
53
  exit 0
@@ -101,8 +101,9 @@ echo "🔧 Starting $NUM_GPUS GPU workers..."
101
  python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
102
  WORKER_START_EXIT_CODE=$?
103
 
104
- # Wait a bit for workers to register
105
- sleep 3
 
106
 
107
  # Check if workers started successfully by checking the exit code and log
108
  if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
@@ -111,8 +112,8 @@ if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
111
  exit 1
112
  fi
113
 
114
- # Check if workers are actually running by looking for their processes
115
- RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
116
  if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
117
  echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
118
  cleanup
@@ -164,7 +165,7 @@ while true; do
164
  fi
165
 
166
  # Check if workers are still running
167
- CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--gpu-id" || echo "0")
168
  if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
169
  echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
170
  cleanup
 
44
 
45
  # Kill workers by finding their processes
46
  echo "Stopping workers..."
47
+ pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true
48
  sleep 2
49
  # Force kill if any are still running
50
+ pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true
51
 
52
  echo "✅ System stopped"
53
  exit 0
 
101
  python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
102
  WORKER_START_EXIT_CODE=$?
103
 
104
+ # Wait for workers to fully load models and register (60 seconds)
105
+ echo "⏳ Waiting 60 seconds for workers to load models and register..."
106
+ sleep 60
107
 
108
  # Check if workers started successfully by checking the exit code and log
109
  if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
 
112
  exit 1
113
  fi
114
 
115
+ # Check if workers are actually running by looking for their processes (updated for new --worker-address format)
116
+ RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
117
  if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
118
  echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
119
  cleanup
 
165
  fi
166
 
167
  # Check if workers are still running
168
+ CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
169
  if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
170
  echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
171
  cleanup