Spaces:
Runtime error
Runtime error
da03
committed on
Commit
·
c686855
1
Parent(s):
1173f9e
- start_system.sh +8 -7
start_system.sh
CHANGED
@@ -44,10 +44,10 @@ cleanup() {
|
|
44 |
|
45 |
# Kill workers by finding their processes
|
46 |
echo "Stopping workers..."
|
47 |
-
pkill -f "python.*worker.py.*--
|
48 |
sleep 2
|
49 |
# Force kill if any are still running
|
50 |
-
pkill -9 -f "python.*worker.py.*--
|
51 |
|
52 |
echo "✅ System stopped"
|
53 |
exit 0
|
@@ -101,8 +101,9 @@ echo "🔧 Starting $NUM_GPUS GPU workers..."
|
|
101 |
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
102 |
WORKER_START_EXIT_CODE=$?
|
103 |
|
104 |
-
# Wait
|
105 |
-
|
|
|
106 |
|
107 |
# Check if workers started successfully by checking the exit code and log
|
108 |
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
@@ -111,8 +112,8 @@ if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
|
111 |
exit 1
|
112 |
fi
|
113 |
|
114 |
-
# Check if workers are actually running by looking for their processes
|
115 |
-
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--
|
116 |
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
|
117 |
echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
|
118 |
cleanup
|
@@ -164,7 +165,7 @@ while true; do
|
|
164 |
fi
|
165 |
|
166 |
# Check if workers are still running
|
167 |
-
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--
|
168 |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
169 |
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
170 |
cleanup
|
|
|
44 |
|
45 |
# Kill workers by finding their processes
|
46 |
echo "Stopping workers..."
|
47 |
+
pkill -f "python.*worker.py.*--worker-address" 2>/dev/null || true
|
48 |
sleep 2
|
49 |
# Force kill if any are still running
|
50 |
+
pkill -9 -f "python.*worker.py.*--worker-address" 2>/dev/null || true
|
51 |
|
52 |
echo "✅ System stopped"
|
53 |
exit 0
|
|
|
101 |
python start_workers.py --num-gpus $NUM_GPUS --no-monitor > workers.log 2>&1
|
102 |
WORKER_START_EXIT_CODE=$?
|
103 |
|
104 |
+
# Wait for workers to fully load models and register (60 seconds)
|
105 |
+
echo "⏳ Waiting 60 seconds for workers to load models and register..."
|
106 |
+
sleep 60
|
107 |
|
108 |
# Check if workers started successfully by checking the exit code and log
|
109 |
if [ $WORKER_START_EXIT_CODE -ne 0 ]; then
|
|
|
112 |
exit 1
|
113 |
fi
|
114 |
|
115 |
+
# Check if workers are actually running by looking for their processes (updated for new --worker-address format)
|
116 |
+
RUNNING_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
|
117 |
if [ "$RUNNING_WORKERS" -lt "$NUM_GPUS" ]; then
|
118 |
echo "❌ Not all workers are running. Expected $NUM_GPUS, found $RUNNING_WORKERS. Check workers.log for errors."
|
119 |
cleanup
|
|
|
165 |
fi
|
166 |
|
167 |
# Check if workers are still running
|
168 |
+
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
|
169 |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
170 |
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
171 |
cleanup
|