phi4training / run_distributed.bat
@echo off
REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM ======================================================================
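REM Usage: run_distributed.bat [NUM_GPUS]   (example: run_distributed.bat 2)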
REM Set the number of GPUs to use (first command-line argument; defaults to 4)
set NUM_GPUS=%1
if "%NUM_GPUS%"=="" set NUM_GPUS=4
echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...
REM Check if Python is available
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)
REM Check if PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch not properly installed. Please install with:
    REM Quotes keep cmd from treating ">" as output redirection
    echo     pip install "torch>=2.0.0"
    exit /b 1
)
REM Check if torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)
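REM Optional sanity check (a sketch, assuming a CUDA-enabled PyTorch build that
REM prints a plain number): warn if the requested GPU count exceeds what
REM torch.cuda reports. Note that NCCL is not available on native Windows, so
REM the training script is assumed to select a Windows-supported backend (gloo).
for /f %%G in ('python -c "import torch; print(torch.cuda.device_count())"') do set AVAILABLE_GPUS=%%G
if %NUM_GPUS% GTR %AVAILABLE_GPUS% (
    echo WARNING: Requested %NUM_GPUS% GPUs but torch.cuda reports only %AVAILABLE_GPUS%.
)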
echo Environment checks passed. Starting distributed training...
echo.
REM Launch the distributed training
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json
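REM Note: on PyTorch ^>= 1.10 the same launch can use the torchrun entry point:
REM     torchrun --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json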
REM Check exit status (capture it immediately, before later commands overwrite %ERRORLEVEL%)
set EXIT_CODE=%ERRORLEVEL%
if %EXIT_CODE% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %EXIT_CODE%
)
echo.
echo Training logs are available in the ./results directory.
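REM Propagate the training exit code (captured above) to the caller,
REM so wrapping scripts and CI jobs can detect failure
exit /b %EXIT_CODE%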