@echo off
setlocal
REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM ======================================================================
REM Set the number of GPUs to use (first argument; defaults to 4 if not given)
set NUM_GPUS=%1
if "%NUM_GPUS%"=="" set NUM_GPUS=4
echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...
REM Check if Python is available
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)
REM Check if PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch is not properly installed. Please install it with:
    REM The caret escapes the ^> so the version specifier prints literally
    REM instead of being treated as an output redirect
    echo pip install torch^>=2.0.0
    exit /b 1
)
REM Check if torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)
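REM Optional sanity check (an addition, not part of the original script):
REM warn if fewer CUDA devices are visible than the requested GPU count.
python -c "import sys, torch; sys.exit(0 if torch.cuda.device_count() >= int(sys.argv[1]) else 1)" %NUM_GPUS% >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo WARNING: Fewer than %NUM_GPUS% CUDA devices detected; training may fail.
)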
echo Environment checks passed. Starting distributed training...
echo.
REM Launch the distributed training
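REM torch.distributed.run is the module form of the torchrun launcher:
REM --nproc_per_node spawns one worker process per GPU, and --master_port sets
REM the rendezvous port (29500 is PyTorch's default). Note that on Windows,
REM torch.distributed falls back to the gloo backend, since NCCL is not
REM supported there.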
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json
REM Capture the exit status immediately so later commands cannot overwrite it,
REM and so it expands reliably inside the parenthesized block below
set TRAIN_EXIT=%ERRORLEVEL%
if %TRAIN_EXIT% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %TRAIN_EXIT%
)
echo.
echo Training logs are available in the ./results directory.
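REM Propagate the training exit code to the caller (an addition, assuming
REM TRAIN_EXIT was captured right after the launch above)
exit /b %TRAIN_EXIT%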