Spaces:
Runtime error
Runtime error
da03
commited on
Commit
·
d4eaeb4
1
Parent(s):
4bedca7
- dispatcher.py +61 -3
- start_system.sh +3 -2
- static/index.html +11 -0
dispatcher.py
CHANGED
@@ -864,10 +864,59 @@ class SessionManager:
|
|
864 |
except Exception as e:
|
865 |
logger.error(f"Error in system state validation: {e}")
|
866 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
867 |
async def _forward_to_worker(self, worker: WorkerInfo, session_id: str, data: dict):
|
868 |
"""Forward input to worker asynchronously"""
|
869 |
try:
|
870 |
-
async with aiohttp.ClientSession() as client_session:
|
871 |
async with client_session.post(
|
872 |
f"{worker.endpoint}/process_input",
|
873 |
json={
|
@@ -876,10 +925,15 @@ class SessionManager:
|
|
876 |
}
|
877 |
) as response:
|
878 |
if response.status != 200:
|
879 |
-
logger.error(f"Worker returned status {response.status}")
|
880 |
-
|
|
|
|
|
|
|
881 |
except Exception as e:
|
882 |
logger.error(f"Error forwarding to worker {worker.worker_id}: {e}")
|
|
|
|
|
883 |
|
884 |
# Global session manager
|
885 |
session_manager = SessionManager()
|
@@ -1110,6 +1164,10 @@ async def periodic_worker_health_check():
|
|
1110 |
|
1111 |
for worker_id, worker_address in disconnected_workers:
|
1112 |
analytics.log_worker_disconnected(worker_id, worker_address)
|
|
|
|
|
|
|
|
|
1113 |
del session_manager.workers[worker_id]
|
1114 |
logger.warning(f"Removed disconnected worker {worker_id} ({worker_address})")
|
1115 |
|
|
|
864 |
except Exception as e:
|
865 |
logger.error(f"Error in system state validation: {e}")
|
866 |
|
867 |
+
async def _handle_worker_failure(self, failed_worker_id: str):
|
868 |
+
"""Handle sessions when a worker fails - end sessions and put users back in queue"""
|
869 |
+
logger.warning(f"Handling failure of worker {failed_worker_id}")
|
870 |
+
|
871 |
+
# Find all sessions assigned to this worker
|
872 |
+
failed_sessions = []
|
873 |
+
for session_id, worker_id in list(self.active_sessions.items()):
|
874 |
+
if worker_id == failed_worker_id:
|
875 |
+
failed_sessions.append(session_id)
|
876 |
+
|
877 |
+
logger.warning(f"Found {len(failed_sessions)} sessions on failed worker {failed_worker_id}")
|
878 |
+
|
879 |
+
for session_id in failed_sessions:
|
880 |
+
session = self.sessions.get(session_id)
|
881 |
+
if session:
|
882 |
+
logger.info(f"Recovering session {session_id} from failed worker")
|
883 |
+
|
884 |
+
# Notify user about the worker failure
|
885 |
+
try:
|
886 |
+
await session.websocket.send_json({
|
887 |
+
"type": "worker_failure",
|
888 |
+
"message": "GPU worker failed. Reconnecting you to a healthy worker..."
|
889 |
+
})
|
890 |
+
except Exception as e:
|
891 |
+
logger.error(f"Failed to notify session {session_id} about worker failure: {e}")
|
892 |
+
|
893 |
+
# Remove from active sessions
|
894 |
+
if session_id in self.active_sessions:
|
895 |
+
del self.active_sessions[session_id]
|
896 |
+
|
897 |
+
# Reset session state and put back in queue
|
898 |
+
session.status = SessionStatus.QUEUED
|
899 |
+
session.worker_id = None
|
900 |
+
session.queue_start_time = time.time()
|
901 |
+
session.max_session_time = None # Reset time limits
|
902 |
+
session.session_limit_start_time = None
|
903 |
+
session.session_warning_sent = False
|
904 |
+
session.idle_warning_sent = False
|
905 |
+
|
906 |
+
# Add back to front of queue (they were already active)
|
907 |
+
if session_id not in self.session_queue:
|
908 |
+
self.session_queue.insert(0, session_id)
|
909 |
+
logger.info(f"Added session {session_id} to front of queue for recovery")
|
910 |
+
|
911 |
+
# Process queue to reassign recovered sessions to healthy workers
|
912 |
+
if failed_sessions:
|
913 |
+
logger.info(f"Processing queue to reassign {len(failed_sessions)} recovered sessions")
|
914 |
+
await self.process_queue()
|
915 |
+
|
916 |
async def _forward_to_worker(self, worker: WorkerInfo, session_id: str, data: dict):
|
917 |
"""Forward input to worker asynchronously"""
|
918 |
try:
|
919 |
+
async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=10)) as client_session:
|
920 |
async with client_session.post(
|
921 |
f"{worker.endpoint}/process_input",
|
922 |
json={
|
|
|
925 |
}
|
926 |
) as response:
|
927 |
if response.status != 200:
|
928 |
+
logger.error(f"Worker {worker.worker_id} returned status {response.status}")
|
929 |
+
except asyncio.TimeoutError:
|
930 |
+
logger.error(f"Worker {worker.worker_id} timeout - may be unresponsive")
|
931 |
+
# Mark worker as potentially dead for faster detection
|
932 |
+
worker.last_ping = 0 # This will cause it to be removed on next health check
|
933 |
except Exception as e:
|
934 |
logger.error(f"Error forwarding to worker {worker.worker_id}: {e}")
|
935 |
+
# Mark worker as potentially dead for faster detection
|
936 |
+
worker.last_ping = 0
|
937 |
|
938 |
# Global session manager
|
939 |
session_manager = SessionManager()
|
|
|
1164 |
|
1165 |
for worker_id, worker_address in disconnected_workers:
|
1166 |
analytics.log_worker_disconnected(worker_id, worker_address)
|
1167 |
+
|
1168 |
+
# Handle any active sessions on this dead worker
|
1169 |
+
await session_manager._handle_worker_failure(worker_id)
|
1170 |
+
|
1171 |
del session_manager.workers[worker_id]
|
1172 |
logger.warning(f"Removed disconnected worker {worker_id} ({worker_address})")
|
1173 |
|
start_system.sh
CHANGED
@@ -168,8 +168,9 @@ while true; do
|
|
168 |
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
|
169 |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
170 |
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
171 |
-
|
172 |
-
|
|
|
173 |
fi
|
174 |
|
175 |
sleep 5
|
|
|
168 |
CURRENT_WORKERS=$(ps aux | grep -c "python.*worker.py.*--worker-address" || echo "0")
|
169 |
if [ "$CURRENT_WORKERS" -lt "$NUM_GPUS" ]; then
|
170 |
echo "⚠️ Some workers died unexpectedly. Expected $NUM_GPUS, found $CURRENT_WORKERS"
|
171 |
+
echo "🔄 System will continue operating with reduced capacity"
|
172 |
+
echo "💡 Check worker logs for error details"
|
173 |
+
# Don't exit - keep system running with remaining workers
|
174 |
fi
|
175 |
|
176 |
sleep 5
|
static/index.html
CHANGED
@@ -333,6 +333,17 @@
|
|
333 |
console.log(`Queue limit applied, ${data.time_remaining} seconds remaining`);
|
334 |
setTimeoutMessage(`⏰ Other users waiting. Time remaining: <span id="timeoutCountdown">${Math.ceil(data.time_remaining)}</span> seconds.`);
|
335 |
startTimeoutCountdown(Math.ceil(data.time_remaining), true); // true = hide stay connected button
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
336 |
}
|
337 |
};
|
338 |
}
|
|
|
333 |
console.log(`Queue limit applied, ${data.time_remaining} seconds remaining`);
|
334 |
setTimeoutMessage(`⏰ Other users waiting. Time remaining: <span id="timeoutCountdown">${Math.ceil(data.time_remaining)}</span> seconds.`);
|
335 |
startTimeoutCountdown(Math.ceil(data.time_remaining), true); // true = hide stay connected button
|
336 |
+
} else if (data.type === "worker_failure") {
|
337 |
+
console.log("Worker failure detected, reconnecting...");
|
338 |
+
showConnectionStatus("🔄 GPU worker failed. Reconnecting to healthy worker...");
|
339 |
+
// Clear the canvas to show we're reconnecting
|
340 |
+
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
341 |
+
ctx.fillStyle = '#f0f0f0';
|
342 |
+
ctx.fillRect(0, 0, canvas.width, canvas.height);
|
343 |
+
ctx.fillStyle = '#666';
|
344 |
+
ctx.font = '20px Arial';
|
345 |
+
ctx.textAlign = 'center';
|
346 |
+
ctx.fillText('🔄 Reconnecting to healthy GPU...', canvas.width/2, canvas.height/2);
|
347 |
}
|
348 |
};
|
349 |
}
|