Spaces:
Runtime error
Runtime error
File size: 2,503 Bytes
fa82766 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
#!/usr/bin/env python3
"""
Script to tail all worker log files simultaneously.
Usage: python tail_workers.py [--num-gpus N]
"""
import argparse
import os
import sys
import time
from typing import Dict
from typing import Optional
def tail_all_workers(
    num_gpus: int,
    *,
    log_dir: str = "",
    poll_interval: float = 0.1,
    max_polls: Optional[int] = None,
) -> None:
    """Tail all worker log files simultaneously.

    Repeatedly polls ``worker_gpu_<i>.log`` for every ``i`` in
    ``range(num_gpus)`` and prints each newly appended line, prefixed with
    the current wall-clock time and the worker index. The loop ends when
    ``max_polls`` passes have been made or when the user presses Ctrl-C.

    Args:
        num_gpus: Number of worker log files to watch.
        log_dir: Directory containing the log files. The empty string (the
            default) resolves the file names against the current working
            directory and keeps the bare file name in printed messages.
        poll_interval: Seconds to sleep after a pass that produced no
            output. Passes that did print something are followed
            immediately by the next pass to stay responsive.
        max_polls: Stop after this many passes over all files. ``None``
            (the default) polls until interrupted.
    """
    print(f"Tailing logs for {num_gpus} GPU workers...")
    print("=" * 60)

    log_files: Dict[int, str] = {
        i: os.path.join(log_dir, f"worker_gpu_{i}.log") for i in range(num_gpus)
    }
    # Read offset per worker, as returned by f.tell(); always a valid seek target.
    positions: Dict[int, int] = dict.fromkeys(log_files, 0)
    # File size seen on the previous pass, used only to detect truncation.
    seen_sizes: Dict[int, int] = dict.fromkeys(log_files, 0)
    # Last error text per worker, so a persistent failure is reported once
    # instead of on every pass.
    last_errors: Dict[int, str] = {}
    # Workers whose "waiting" notice was already printed. Kept separate from
    # the offsets: storing a sentinel such as -1 in the offset table makes
    # the later f.seek() fail once the file finally appears.
    announced_missing = set()

    polls_done = 0
    try:
        while max_polls is None or polls_done < max_polls:
            polls_done += 1
            has_new_output = False

            for i, log_file in log_files.items():
                try:
                    # Open directly instead of checking os.path.exists first:
                    # the file can vanish between the check and the open.
                    # errors="replace" keeps a stray undecodable byte from
                    # blocking the reader at the same offset forever.
                    with open(log_file, "r", errors="replace") as f:
                        size = os.fstat(f.fileno()).st_size
                        if size < seen_sizes[i]:
                            # The file shrank, so it was truncated or
                            # rewritten; start again from the beginning.
                            positions[i] = 0
                        seen_sizes[i] = size
                        f.seek(positions[i])
                        new_lines = f.readlines()
                        positions[i] = f.tell()
                except FileNotFoundError:
                    # Not created yet, or removed: whatever appears next is
                    # a new file and must be read from the start.
                    positions[i] = 0
                    seen_sizes[i] = 0
                    if i not in announced_missing:
                        print(f"[INFO] Waiting for {log_file} to be created...")
                        announced_missing.add(i)
                    continue
                except (OSError, ValueError) as e:
                    message = f"[ERROR] Error reading {log_file}: {e}"
                    if last_errors.get(i) != message:
                        print(message)
                        last_errors[i] = message
                    continue

                # A successful read re-arms error reporting for this worker.
                last_errors.pop(i, None)
                if new_lines:
                    has_new_output = True
                    for line in new_lines:
                        timestamp = time.strftime("%H:%M:%S")
                        print(f"[{timestamp}] [GPU {i}] {line.rstrip()}")

            # Only sleep if there was no new output to keep it responsive
            if not has_new_output:
                time.sleep(poll_interval)
    except KeyboardInterrupt:
        print("\nStopping log monitoring...")
def main():
    """Parse the command line and start tailing the worker logs.

    Reads ``--num-gpus`` (integer, default 2). A value below 1 prints an
    error message and exits with status 1; otherwise the value is handed to
    ``tail_all_workers``.
    """
    cli = argparse.ArgumentParser(description="Tail all worker log files")
    cli.add_argument(
        "--num-gpus",
        type=int,
        default=2,
        help="Number of GPU workers to monitor (default: 2)",
    )
    worker_count = cli.parse_args().num_gpus

    if worker_count >= 1:
        tail_all_workers(worker_count)
        return

    # Reject a zero or negative worker count before any polling starts.
    print("Error: Number of GPUs must be at least 1")
    sys.exit(1)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()