File size: 1,814 Bytes
678c431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
@echo off
REM Keep variable assignments local to this script so the caller's
REM environment is not polluted with NUM_GPUS.
setlocal

REM ======================================================================
REM Distributed training launch script for Phi-4 training with torchrun
REM This script launches multi-GPU training on Windows systems
REM Usage: launch.bat [NUM_GPUS]   (defaults to 4 GPUs)
REM ======================================================================

REM Number of GPUs to use: first script argument, defaulting to 4.
REM %~1 strips surrounding quotes so `launch.bat "8"` also works; the
REM quoted set form protects against stray trailing spaces.
set "NUM_GPUS=%~1"
if "%NUM_GPUS%"=="" set "NUM_GPUS=4"

echo.
echo ===== Phi-4 Distributed Training =====
echo.
echo Preparing to launch training with %NUM_GPUS% GPUs...



REM ----------------------------------------------------------------------
REM Environment checks: fail fast with a clear message if Python, PyTorch,
REM or torch.distributed is missing. Each failure exits with code 1.
REM ----------------------------------------------------------------------

REM Check if Python is available on PATH
where python >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
    exit /b 1
)

REM Check if PyTorch is installed by attempting to import it
python -c "import torch" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: PyTorch not properly installed. Please install with:
    REM The caret escapes '>' so cmd prints it literally; the original
    REM unescaped form was parsed as a redirection and silently created a
    REM file named "=2.0.0" instead of printing the version constraint.
    echo pip install torch^>=2.0.0
    exit /b 1
)

REM Check if torch.distributed is available
python -c "import torch.distributed" >nul 2>&1
if %ERRORLEVEL% NEQ 0 (
    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
    exit /b 1
)

echo Environment checks passed. Starting distributed training...
echo.

REM Launch the distributed training: one worker process per GPU, all on
REM this machine, rendezvousing on localhost:29500.
python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json

REM Capture the exit code immediately: the echo commands below would
REM overwrite ERRORLEVEL, and %ERRORLEVEL% inside a parenthesized block
REM is expanded when the whole if/else is parsed, not when the line runs.
set "TRAIN_EXIT=%ERRORLEVEL%"

REM Report success or failure based on the captured training exit code.
if %TRAIN_EXIT% EQU 0 (
    echo.
    echo ===== SUCCESS =====
    echo Distributed training completed successfully!
) else (
    echo.
    echo ===== ERROR =====
    echo Distributed training failed with exit code %TRAIN_EXIT%
)

echo.
echo Training logs are available in the ./results directory.

REM Propagate the training exit code to the caller so CI pipelines and
REM wrapper scripts can detect failure (the original always exited 0).
exit /b %TRAIN_EXIT%