Spaces:
Runtime error
Runtime error
da03
commited on
Commit
·
2ff6d31
1
Parent(s):
c74f490
- start_system.sh +4 -1
- start_workers.py +38 -12
start_system.sh
CHANGED
@@ -131,7 +131,10 @@ done
|
|
131 |
echo ""
|
132 |
echo "📋 Log files:"
|
133 |
echo " Dispatcher: dispatcher.log"
|
134 |
-
echo " Workers: workers.log"
|
|
|
|
|
|
|
135 |
echo ""
|
136 |
echo "Press Ctrl+C to stop the system"
|
137 |
echo "================================"
|
|
|
131 |
echo ""
|
132 |
echo "📋 Log files:"
|
133 |
echo " Dispatcher: dispatcher.log"
|
134 |
+
echo " Workers summary: workers.log"
|
135 |
+
for ((i=0; i<NUM_GPUS; i++)); do
|
136 |
+
echo " GPU $i worker: worker_gpu_$i.log"
|
137 |
+
done
|
138 |
echo ""
|
139 |
echo "Press Ctrl+C to stop the system"
|
140 |
echo "================================"
|
start_workers.py
CHANGED
@@ -34,16 +34,21 @@ class WorkerManager:
|
|
34 |
"--dispatcher-url", self.dispatcher_url
|
35 |
]
|
36 |
|
|
|
|
|
|
|
|
|
|
|
37 |
process = subprocess.Popen(
|
38 |
cmd,
|
39 |
-
stdout=
|
40 |
stderr=subprocess.STDOUT,
|
41 |
universal_newlines=True,
|
42 |
bufsize=1
|
43 |
)
|
44 |
|
45 |
self.processes.append(process)
|
46 |
-
print(f"✓ Started worker {gpu_id} (PID: {process.pid})")
|
47 |
|
48 |
# Small delay between starts
|
49 |
time.sleep(1)
|
@@ -55,6 +60,9 @@ class WorkerManager:
|
|
55 |
|
56 |
print(f"\n✓ All {self.num_gpus} workers started successfully!")
|
57 |
print("Workers are running on ports:", [8001 + i for i in range(self.num_gpus)])
|
|
|
|
|
|
|
58 |
return True
|
59 |
|
60 |
def monitor_workers(self):
|
@@ -62,6 +70,11 @@ class WorkerManager:
|
|
62 |
print("\nMonitoring workers (Ctrl+C to stop)...")
|
63 |
print("-" * 50)
|
64 |
|
|
|
|
|
|
|
|
|
|
|
65 |
try:
|
66 |
while True:
|
67 |
# Check if any process has died
|
@@ -69,16 +82,22 @@ class WorkerManager:
|
|
69 |
if process.poll() is not None:
|
70 |
print(f"⚠️ Worker {i} (PID: {process.pid}) has died!")
|
71 |
# Optionally restart it
|
72 |
-
|
73 |
-
#
|
74 |
-
for i
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
time.sleep(0.1)
|
84 |
|
@@ -104,6 +123,13 @@ class WorkerManager:
|
|
104 |
process.wait()
|
105 |
except Exception as e:
|
106 |
print(f"Error stopping worker {i}: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
print("✓ All workers stopped")
|
109 |
|
|
|
34 |
"--dispatcher-url", self.dispatcher_url
|
35 |
]
|
36 |
|
37 |
+
# Create log file for this worker
|
38 |
+
log_file = f"worker_gpu_{gpu_id}.log"
|
39 |
+
with open(log_file, 'w') as f:
|
40 |
+
f.write(f"Starting worker for GPU {gpu_id}\n")
|
41 |
+
|
42 |
process = subprocess.Popen(
|
43 |
cmd,
|
44 |
+
stdout=open(log_file, 'a'),
|
45 |
stderr=subprocess.STDOUT,
|
46 |
universal_newlines=True,
|
47 |
bufsize=1
|
48 |
)
|
49 |
|
50 |
self.processes.append(process)
|
51 |
+
print(f"✓ Started worker {gpu_id} (PID: {process.pid}) - Log: {log_file}")
|
52 |
|
53 |
# Small delay between starts
|
54 |
time.sleep(1)
|
|
|
60 |
|
61 |
print(f"\n✓ All {self.num_gpus} workers started successfully!")
|
62 |
print("Workers are running on ports:", [8001 + i for i in range(self.num_gpus)])
|
63 |
+
print("Worker log files:")
|
64 |
+
for i in range(self.num_gpus):
|
65 |
+
print(f" GPU {i}: worker_gpu_{i}.log")
|
66 |
return True
|
67 |
|
68 |
def monitor_workers(self):
|
|
|
70 |
print("\nMonitoring workers (Ctrl+C to stop)...")
|
71 |
print("-" * 50)
|
72 |
|
73 |
+
# Keep track of file positions for each log file
|
74 |
+
log_positions = {}
|
75 |
+
for i in range(self.num_gpus):
|
76 |
+
log_positions[i] = 0
|
77 |
+
|
78 |
try:
|
79 |
while True:
|
80 |
# Check if any process has died
|
|
|
82 |
if process.poll() is not None:
|
83 |
print(f"⚠️ Worker {i} (PID: {process.pid}) has died!")
|
84 |
# Optionally restart it
|
85 |
+
|
86 |
+
# Read new lines from log files
|
87 |
+
for i in range(self.num_gpus):
|
88 |
+
log_file = f"worker_gpu_{i}.log"
|
89 |
+
try:
|
90 |
+
if os.path.exists(log_file):
|
91 |
+
with open(log_file, 'r') as f:
|
92 |
+
f.seek(log_positions[i])
|
93 |
+
new_lines = f.readlines()
|
94 |
+
log_positions[i] = f.tell()
|
95 |
+
|
96 |
+
for line in new_lines:
|
97 |
+
print(f"[GPU {i}] {line.strip()}")
|
98 |
+
except Exception as e:
|
99 |
+
# File might be locked or not exist yet
|
100 |
+
pass
|
101 |
|
102 |
time.sleep(0.1)
|
103 |
|
|
|
123 |
process.wait()
|
124 |
except Exception as e:
|
125 |
print(f"Error stopping worker {i}: {e}")
|
126 |
+
|
127 |
+
# Close stdout file handle if it's still open
|
128 |
+
try:
|
129 |
+
if hasattr(process, 'stdout') and process.stdout:
|
130 |
+
process.stdout.close()
|
131 |
+
except:
|
132 |
+
pass
|
133 |
|
134 |
print("✓ All workers stopped")
|
135 |
|