REM NOTE: removed non-script header text ("Spaces: / Sleeping") that was
REM scraped from the hosting page UI and is not part of this batch file.
@echo off
REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM
REM Usage: launch_training.bat [NUM_GPUS]
REM   NUM_GPUS - number of GPUs to use (defaults to 4 when omitted)
REM
REM Exits with the training process exit code (0 on success).
REM ======================================================================

REM Set the number of GPUs to use (first positional argument, default 4)
set NUM_GPUS=%1
if "%NUM_GPUS%"=="" set NUM_GPUS=4

echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...

REM Check if Python is available
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)

REM Check if PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch not properly installed. Please install with:
    REM The caret escapes '>' so cmd prints it instead of redirecting the
    REM echo output into a file literally named "=2.0.0".
    echo pip install torch^>=2.0.0
    exit /b 1
)

REM Check if torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)

echo Environment checks passed. Starting distributed training...
echo.

REM Launch the distributed training
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json

REM Capture the exit code immediately so subsequent commands cannot clobber
REM it, and so the value expands correctly inside the parenthesized blocks
REM below (cmd expands %VAR% in a block at parse time of the whole if/else).
set TRAIN_EXIT_CODE=%ERRORLEVEL%

REM Report the outcome
if %TRAIN_EXIT_CODE% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %TRAIN_EXIT_CODE%
)

echo.
echo Training logs are available in the ./results directory.

REM Propagate the training exit code to the caller. Previously the script
REM always exited 0 because the final echo succeeded, so CI/callers could
REM not detect a failed training run.
exit /b %TRAIN_EXIT_CODE%