@echo off
REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM ======================================================================

REM Set the number of GPUs to use (defaults to 4 if no argument is given)
set NUM_GPUS=%1
if "%NUM_GPUS%"=="" set NUM_GPUS=4

echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...

REM Check that Python is available
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)

REM Check that PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch is not properly installed. Please install it with:
    echo     pip install "torch>=2.0.0"
    exit /b 1
)

REM Check that torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)

echo Environment checks passed. Starting distributed training...
echo.

REM Launch the distributed training
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json

REM Capture the exit code immediately so later commands cannot clobber it
set EXIT_CODE=%ERRORLEVEL%

REM Report the training result
if %EXIT_CODE% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %EXIT_CODE%
)

echo.
echo Training logs are available in the ./results directory.
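
REM Note: NUM_GPUS above falls back to a hard-coded default of 4. To default
REM to every GPU PyTorch can see instead, the fallback could be replaced with
REM a detection step along these lines (a sketch: torch.cuda.device_count()
REM is the standard PyTorch API for this, but the step assumes Python and
REM PyTorch are usable, so it belongs after the environment checks):
REM     for /f %%G in ('python -c "import torch; print(torch.cuda.device_count())"') do set NUM_GPUS=%%G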
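
REM Propagate the training exit code so that callers (for example CI
REM pipelines or wrapper scripts) can detect a failed run from this
REM script's exit status; EXIT_CODE was captured right after the launch.
exit /b %EXIT_CODE%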