Spaces:
Runtime error
Runtime error
da03
commited on
Commit
Β·
1173f9e
1
Parent(s):
92199b3
- dispatcher.py +30 -7
- worker.py +58 -9
dispatcher.py
CHANGED
|
@@ -866,12 +866,22 @@ async def get():
|
|
| 866 |
@app.post("/register_worker")
|
| 867 |
async def register_worker(worker_info: dict):
|
| 868 |
"""Endpoint for workers to register themselves"""
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
@app.post("/worker_ping")
|
| 877 |
async def worker_ping(worker_info: dict):
|
|
@@ -1084,6 +1094,8 @@ async def periodic_worker_health_check():
|
|
| 1084 |
|
| 1085 |
@app.on_event("startup")
|
| 1086 |
async def startup_event():
|
|
|
|
|
|
|
| 1087 |
# Start background tasks
|
| 1088 |
asyncio.create_task(periodic_queue_update())
|
| 1089 |
asyncio.create_task(periodic_system_validation())
|
|
@@ -1093,6 +1105,8 @@ async def startup_event():
|
|
| 1093 |
# Log initial system status
|
| 1094 |
analytics._write_log("π System initialized and ready to accept connections")
|
| 1095 |
analytics._write_log(" Waiting for GPU workers to register...")
|
|
|
|
|
|
|
| 1096 |
|
| 1097 |
@app.on_event("shutdown")
|
| 1098 |
async def shutdown_event():
|
|
@@ -1110,4 +1124,13 @@ if __name__ == "__main__":
|
|
| 1110 |
parser.add_argument("--port", type=int, default=8000, help="Port to run the dispatcher on")
|
| 1111 |
args = parser.parse_args()
|
| 1112 |
|
| 1113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 866 |
@app.post("/register_worker")
|
| 867 |
async def register_worker(worker_info: dict):
|
| 868 |
"""Endpoint for workers to register themselves"""
|
| 869 |
+
logger.info(f"π₯ Received worker registration request")
|
| 870 |
+
logger.info(f"π Worker info: {worker_info}")
|
| 871 |
+
|
| 872 |
+
try:
|
| 873 |
+
await session_manager.register_worker(
|
| 874 |
+
worker_info["worker_id"],
|
| 875 |
+
worker_info["worker_address"],
|
| 876 |
+
worker_info["endpoint"]
|
| 877 |
+
)
|
| 878 |
+
logger.info(f"β
Successfully processed worker registration")
|
| 879 |
+
return {"status": "registered"}
|
| 880 |
+
except Exception as e:
|
| 881 |
+
logger.error(f"β Failed to register worker: {e}")
|
| 882 |
+
import traceback
|
| 883 |
+
logger.error(f"π Full traceback: {traceback.format_exc()}")
|
| 884 |
+
raise
|
| 885 |
|
| 886 |
@app.post("/worker_ping")
|
| 887 |
async def worker_ping(worker_info: dict):
|
|
|
|
| 1094 |
|
| 1095 |
@app.on_event("startup")
|
| 1096 |
async def startup_event():
|
| 1097 |
+
logger.info("π Dispatcher startup event triggered")
|
| 1098 |
+
|
| 1099 |
# Start background tasks
|
| 1100 |
asyncio.create_task(periodic_queue_update())
|
| 1101 |
asyncio.create_task(periodic_system_validation())
|
|
|
|
| 1105 |
# Log initial system status
|
| 1106 |
analytics._write_log("π System initialized and ready to accept connections")
|
| 1107 |
analytics._write_log(" Waiting for GPU workers to register...")
|
| 1108 |
+
|
| 1109 |
+
logger.info("β
Dispatcher startup complete - ready to accept worker registrations")
|
| 1110 |
|
| 1111 |
@app.on_event("shutdown")
|
| 1112 |
async def shutdown_event():
|
|
|
|
| 1124 |
parser.add_argument("--port", type=int, default=8000, help="Port to run the dispatcher on")
|
| 1125 |
args = parser.parse_args()
|
| 1126 |
|
| 1127 |
+
logger.info(f"π Starting dispatcher on 0.0.0.0:{args.port}")
|
| 1128 |
+
logger.info(f"π Dispatcher will be available at http://localhost:{args.port}")
|
| 1129 |
+
|
| 1130 |
+
try:
|
| 1131 |
+
uvicorn.run(app, host="0.0.0.0", port=args.port)
|
| 1132 |
+
except Exception as e:
|
| 1133 |
+
logger.error(f"β Failed to start dispatcher: {e}")
|
| 1134 |
+
import traceback
|
| 1135 |
+
logger.error(f"π Full traceback: {traceback.format_exc()}")
|
| 1136 |
+
raise
|
worker.py
CHANGED
|
@@ -154,16 +154,47 @@ class GPUWorker:
|
|
| 154 |
|
| 155 |
async def register_with_dispatcher(self):
|
| 156 |
"""Register this worker with the dispatcher"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
try:
|
| 158 |
async with aiohttp.ClientSession() as session:
|
| 159 |
-
|
| 160 |
-
"
|
| 161 |
-
"worker_address": self.worker_address,
|
| 162 |
-
"endpoint": f"http://{self.worker_address}"
|
| 163 |
-
})
|
| 164 |
-
logger.info(f"Successfully registered worker {self.worker_id} ({self.worker_address}) with dispatcher")
|
| 165 |
except Exception as e:
|
| 166 |
-
logger.error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
async def ping_dispatcher(self):
|
| 169 |
"""Periodically ping the dispatcher to maintain connection"""
|
|
@@ -718,14 +749,21 @@ async def health_check():
|
|
| 718 |
|
| 719 |
async def startup_worker(worker_address: str, dispatcher_url: str):
|
| 720 |
"""Initialize the worker"""
|
|
|
|
|
|
|
| 721 |
global worker
|
| 722 |
worker = GPUWorker(worker_address, dispatcher_url)
|
|
|
|
| 723 |
|
| 724 |
# Register with dispatcher
|
|
|
|
| 725 |
await worker.register_with_dispatcher()
|
|
|
|
| 726 |
|
| 727 |
# Start ping task
|
|
|
|
| 728 |
asyncio.create_task(worker.ping_dispatcher())
|
|
|
|
| 729 |
|
| 730 |
if __name__ == "__main__":
|
| 731 |
import uvicorn
|
|
@@ -751,7 +789,18 @@ if __name__ == "__main__":
|
|
| 751 |
|
| 752 |
@app.on_event("startup")
|
| 753 |
async def startup_event():
|
|
|
|
| 754 |
await startup_worker(args.worker_address, args.dispatcher_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 755 |
|
| 756 |
-
|
| 757 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
async def register_with_dispatcher(self):
|
| 156 |
"""Register this worker with the dispatcher"""
|
| 157 |
+
logger.info(f"π Attempting to register with dispatcher at {self.dispatcher_url}")
|
| 158 |
+
logger.info(f"π Worker details: ID={self.worker_id}, Address={self.worker_address}")
|
| 159 |
+
|
| 160 |
+
# Test basic connectivity first
|
| 161 |
+
logger.info(f"π§ͺ Testing basic connectivity to dispatcher...")
|
| 162 |
try:
|
| 163 |
async with aiohttp.ClientSession() as session:
|
| 164 |
+
async with session.get(f"{self.dispatcher_url}/") as response:
|
| 165 |
+
logger.info(f"π Connectivity test successful - dispatcher responded with status {response.status}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
except Exception as e:
|
| 167 |
+
logger.error(f"β Connectivity test FAILED: {e}")
|
| 168 |
+
logger.error(f"π This means the dispatcher is not reachable at {self.dispatcher_url}")
|
| 169 |
+
raise
|
| 170 |
+
|
| 171 |
+
try:
|
| 172 |
+
registration_data = {
|
| 173 |
+
"worker_id": self.worker_id,
|
| 174 |
+
"worker_address": self.worker_address,
|
| 175 |
+
"endpoint": f"http://{self.worker_address}"
|
| 176 |
+
}
|
| 177 |
+
logger.info(f"π€ Sending registration data: {registration_data}")
|
| 178 |
+
|
| 179 |
+
async with aiohttp.ClientSession() as session:
|
| 180 |
+
logger.info(f"π Making POST request to {self.dispatcher_url}/register_worker")
|
| 181 |
+
|
| 182 |
+
async with session.post(f"{self.dispatcher_url}/register_worker", json=registration_data) as response:
|
| 183 |
+
logger.info(f"π₯ Dispatcher response status: {response.status}")
|
| 184 |
+
response_text = await response.text()
|
| 185 |
+
logger.info(f"π₯ Dispatcher response body: {response_text}")
|
| 186 |
+
|
| 187 |
+
if response.status == 200:
|
| 188 |
+
logger.info(f"β
Successfully registered worker {self.worker_id} ({self.worker_address}) with dispatcher")
|
| 189 |
+
else:
|
| 190 |
+
logger.error(f"β Dispatcher returned error status {response.status}: {response_text}")
|
| 191 |
+
|
| 192 |
+
except Exception as e:
|
| 193 |
+
logger.error(f"β Failed to register with dispatcher: {e}")
|
| 194 |
+
logger.error(f"π Exception type: {type(e)}")
|
| 195 |
+
logger.error(f"π Dispatcher URL: {self.dispatcher_url}")
|
| 196 |
+
import traceback
|
| 197 |
+
logger.error(f"π Full traceback: {traceback.format_exc()}")
|
| 198 |
|
| 199 |
async def ping_dispatcher(self):
|
| 200 |
"""Periodically ping the dispatcher to maintain connection"""
|
|
|
|
| 749 |
|
| 750 |
async def startup_worker(worker_address: str, dispatcher_url: str):
|
| 751 |
"""Initialize the worker"""
|
| 752 |
+
logger.info(f"π§ Initializing worker with address {worker_address}")
|
| 753 |
+
|
| 754 |
global worker
|
| 755 |
worker = GPUWorker(worker_address, dispatcher_url)
|
| 756 |
+
logger.info(f"ποΈ Worker object created: {worker.worker_id}")
|
| 757 |
|
| 758 |
# Register with dispatcher
|
| 759 |
+
logger.info(f"π About to register with dispatcher")
|
| 760 |
await worker.register_with_dispatcher()
|
| 761 |
+
logger.info(f"π Registration attempt completed")
|
| 762 |
|
| 763 |
# Start ping task
|
| 764 |
+
logger.info(f"π Starting ping task")
|
| 765 |
asyncio.create_task(worker.ping_dispatcher())
|
| 766 |
+
logger.info(f"β
Worker initialization completed")
|
| 767 |
|
| 768 |
if __name__ == "__main__":
|
| 769 |
import uvicorn
|
|
|
|
| 789 |
|
| 790 |
@app.on_event("startup")
|
| 791 |
async def startup_event():
|
| 792 |
+
logger.info(f"π Worker startup event triggered for {args.worker_address}")
|
| 793 |
await startup_worker(args.worker_address, args.dispatcher_url)
|
| 794 |
+
logger.info(f"β
Worker startup complete for {args.worker_address}")
|
| 795 |
+
|
| 796 |
+
logger.info(f"π Starting worker {args.worker_address} on 0.0.0.0:{port}")
|
| 797 |
+
logger.info(f"π Worker will be available at http://{args.worker_address}")
|
| 798 |
+
logger.info(f"π‘ Will register with dispatcher at {args.dispatcher_url}")
|
| 799 |
|
| 800 |
+
try:
|
| 801 |
+
uvicorn.run(app, host="0.0.0.0", port=port)
|
| 802 |
+
except Exception as e:
|
| 803 |
+
logger.error(f"β Failed to start worker: {e}")
|
| 804 |
+
import traceback
|
| 805 |
+
logger.error(f"π Full traceback: {traceback.format_exc()}")
|
| 806 |
+
raise
|