#!/usr/bin/env python3
"""
Script to tail all worker log files simultaneously.

Usage: python tail_workers.py [--num-gpus N]
"""

import argparse
import os
import sys
import time
from typing import Dict, List, Set, Tuple


def read_new_lines(log_file: str, position: int) -> Tuple[List[str], int]:
    """Return the complete lines appended to *log_file* since *position*.

    The file is read in binary mode so that *position* is a plain byte
    offset. Only data up to the last newline is consumed; a trailing line
    that the writer has not finished yet is left for the next call, so a
    log line is never reported in two fragments.

    Args:
        log_file: Path of the log file to read.
        position: Byte offset returned by the previous call (0 initially).

    Returns:
        A ``(lines, new_position)`` tuple. ``lines`` holds the new lines
        without their line terminators; ``new_position`` is the byte offset
        to pass to the next call.

    Raises:
        FileNotFoundError: If *log_file* does not exist.
        OSError: If the file cannot be opened or read.
    """
    with open(log_file, "rb") as f:
        # A file that is now shorter than our offset was truncated or
        # replaced; start over from the beginning instead of stalling.
        if os.fstat(f.fileno()).st_size < position:
            position = 0
        f.seek(position)
        chunk = f.read()

    complete_len = chunk.rfind(b"\n") + 1
    if complete_len == 0:
        # No finished line yet; keep the offset so the fragment is re-read.
        return [], position

    # errors="replace" keeps one undecodable byte from wedging the tail.
    text = chunk[:complete_len].decode("utf-8", errors="replace")
    # Treat "\r\n" and bare "\r" as line breaks, as text-mode reading does.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # The text ends with "\n", so the final split element is always empty.
    return text.split("\n")[:-1], position + complete_len


def tail_all_workers(num_gpus: int, *, poll_interval: float = 0.1) -> None:
    """Tail all worker log files simultaneously.

    Polls ``worker_gpu_<i>.log`` in the current directory for each ``i`` in
    ``range(num_gpus)`` and prints every new line prefixed with a timestamp
    and the GPU index. Runs until interrupted with Ctrl-C.

    Args:
        num_gpus: Number of worker log files to monitor.
        poll_interval: Seconds to sleep after a poll that found no output.
    """
    print(f"Tailing logs for {num_gpus} GPU workers...")
    print("=" * 60)

    # Byte offset already consumed from each worker's log.
    log_positions: Dict[int, int] = {i: 0 for i in range(num_gpus)}
    # Workers whose "waiting" notice was already shown. Tracked separately
    # from the offset so the offset always stays a valid seek target.
    waiting_announced: Set[int] = set()
    # Last error text shown per worker, so a persistent failure is reported
    # once rather than on every poll.
    last_error: Dict[int, str] = {}

    try:
        while True:
            has_new_output = False

            for i in range(num_gpus):
                log_file = f"worker_gpu_{i}.log"

                try:
                    new_lines, log_positions[i] = read_new_lines(
                        log_file, log_positions[i]
                    )
                except FileNotFoundError:
                    # Only announce files that have never been read.
                    if log_positions[i] == 0 and i not in waiting_announced:
                        print(f"[INFO] Waiting for {log_file} to be created...")
                        waiting_announced.add(i)
                    continue
                except OSError as e:
                    message = str(e)
                    if last_error.get(i) != message:
                        print(f"[ERROR] Error reading {log_file}: {e}")
                        last_error[i] = message
                    continue

                last_error.pop(i, None)

                if new_lines:
                    has_new_output = True
                    timestamp = time.strftime("%H:%M:%S")
                    for line in new_lines:
                        print(f"[{timestamp}] [GPU {i}] {line.rstrip()}")

            # Flush once per poll so output appears promptly when piped.
            sys.stdout.flush()

            # Only sleep if there was no new output to keep it responsive
            if not has_new_output:
                time.sleep(poll_interval)

    except KeyboardInterrupt:
        print("\nStopping log monitoring...")


def main():
    """Parse command-line arguments and start tailing the worker logs."""
    parser = argparse.ArgumentParser(description="Tail all worker log files")
    parser.add_argument("--num-gpus", type=int, default=2,
                        help="Number of GPU workers to monitor (default: 2)")

    args = parser.parse_args()

    if args.num_gpus < 1:
        print("Error: Number of GPUs must be at least 1")
        sys.exit(1)

    tail_all_workers(args.num_gpus)


if __name__ == "__main__":
    main()