@echo off
REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM ======================================================================

REM Set the number of GPUs to use (defaults to 4 if no argument is given)
set NUM_GPUS=%1
if "%NUM_GPUS%"=="" set NUM_GPUS=4

echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...

REM Check that Python is available
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)

REM Check that PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch is not properly installed. Please install it with:
    echo     pip install "torch>=2.0.0"
    exit /b 1
)

REM Check that torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)

echo Environment checks passed. Starting distributed training...
echo.

REM Launch the distributed training
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json

REM Capture the exit code immediately so later commands cannot clobber it
set EXIT_CODE=%ERRORLEVEL%

REM Report the training result
if %EXIT_CODE% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %EXIT_CODE%
)

echo.
echo Training logs are available in the ./results directory.
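
REM Note: NUM_GPUS above falls back to a hard-coded default of 4. To default
REM to every GPU PyTorch can see instead, the fallback could be replaced with
REM a detection step along these lines (a sketch: torch.cuda.device_count()
REM is the standard PyTorch API for this, but the step assumes Python and
REM PyTorch are usable, so it belongs after the environment checks):
REM     for /f %%G in ('python -c "import torch; print(torch.cuda.device_count())"') do set NUM_GPUS=%%G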
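
REM Propagate the training exit code so that callers (for example CI
REM pipelines or wrapper scripts) can detect a failed run from this
REM script's exit status; EXIT_CODE was captured right after the launch.
exit /b %EXIT_CODE%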