"""Build launch commands for AutoTrain training jobs."""
import os
import shlex
import torch
from autotrain import logger
from autotrain.trainers.clm.params import LLMTrainingParams
from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams
from autotrain.trainers.generic.params import GenericParams
from autotrain.trainers.image_classification.params import ImageClassificationParams
from autotrain.trainers.image_regression.params import ImageRegressionParams
from autotrain.trainers.object_detection.params import ObjectDetectionParams
from autotrain.trainers.sent_transformers.params import SentenceTransformersParams
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
from autotrain.trainers.text_regression.params import TextRegressionParams
from autotrain.trainers.token_classification.params import TokenClassificationParams
from autotrain.trainers.vlm.params import VLMTrainingParams
# Base `accelerate launch` prefixes for the degenerate hardware cases.
# CPU-only training: force accelerate onto the CPU.
CPU_COMMAND = ["accelerate", "launch", "--cpu"]

# Single-GPU training: one machine, one process.
SINGLE_GPU_COMMAND = ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"]
def get_accelerate_command(num_gpus, gradient_accumulation_steps=1, distributed_backend=None):
    """
    Build the `accelerate launch` command prefix for a training job based on the
    number of GPUs and the requested distributed backend.

    Args:
        num_gpus (int): The number of GPUs available for training. If 0, training
            is forced on CPU.
        gradient_accumulation_steps (int, optional): Gradient accumulation steps;
            only used by the DeepSpeed command. Defaults to 1.
        distributed_backend (str, optional): The distributed backend to use. Can be
            "ddp" (Distributed Data Parallel), "deepspeed", or None (treated as
            "ddp"). Defaults to None.

    Returns:
        list: The command to be executed as a list of strings. A fresh list is
        returned on every call, so callers may safely append to it.

    Raises:
        ValueError: If an unsupported distributed backend is specified.
    """
    if num_gpus == 0:
        logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
        # Return a copy: callers append extra arguments to the result, and
        # appending to the shared module-level constant would corrupt it for
        # every subsequent call.
        return list(CPU_COMMAND)
    if num_gpus == 1:
        return list(SINGLE_GPU_COMMAND)
    if distributed_backend in ("ddp", None):
        return [
            "accelerate",
            "launch",
            "--multi_gpu",
            "--num_machines",
            "1",
            "--num_processes",
            str(num_gpus),
        ]
    if distributed_backend == "deepspeed":
        # DeepSpeed ZeRO stage 3 without optimizer/param offloading.
        return [
            "accelerate",
            "launch",
            "--use_deepspeed",
            "--zero_stage",
            "3",
            "--offload_optimizer_device",
            "none",
            "--offload_param_device",
            "none",
            "--zero3_save_16bit_model",
            "true",
            "--zero3_init_flag",
            "true",
            "--deepspeed_multinode_launcher",
            "standard",
            "--gradient_accumulation_steps",
            str(gradient_accumulation_steps),
        ]
    raise ValueError("Unsupported distributed backend")
def _mixed_precision_args(mixed_precision):
    """Translate a params ``mixed_precision`` value into accelerate CLI arguments."""
    if mixed_precision == "fp16":
        value = "fp16"
    elif mixed_precision == "bf16":
        value = "bf16"
    else:
        value = "no"
    return ["--mixed_precision", value]


def _base_accelerate_cmd(num_gpus):
    """Build the plain `accelerate launch` prefix: CPU, single GPU, or DDP multi-GPU."""
    if num_gpus == 0:
        return ["accelerate", "launch", "--cpu"]
    if num_gpus == 1:
        return ["accelerate", "launch", "--num_machines", "1", "--num_processes", "1"]
    return ["accelerate", "launch", "--multi_gpu", "--num_machines", "1", "--num_processes", str(num_gpus)]


def _deepspeed_capable_cmd(num_gpus, params):
    """
    Build the launch prefix for trainers (seq2seq, VLM) that switch to DeepSpeed
    ZeRO-3 when more than 2 GPUs are available — unless quantized (int8/int4) PEFT
    training with bf16 is requested, which keeps plain DDP.
    """
    if num_gpus == 0:
        logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
    if num_gpus <= 2:
        return _base_accelerate_cmd(num_gpus)
    if params.quantization in ("int8", "int4") and params.peft and params.mixed_precision == "bf16":
        return _base_accelerate_cmd(num_gpus)
    # DeepSpeed ZeRO stage 3 without optimizer/param offloading.
    return [
        "accelerate",
        "launch",
        "--use_deepspeed",
        "--zero_stage",
        "3",
        "--offload_optimizer_device",
        "none",
        "--offload_param_device",
        "none",
        "--zero3_save_16bit_model",
        "true",
        "--zero3_init_flag",
        "true",
        "--deepspeed_multinode_launcher",
        "standard",
        "--gradient_accumulation_steps",
        str(params.gradient_accumulation),
    ]


def launch_command(params):
    """
    Build the training command line for the given training parameters.

    Args:
        params (object): An instance of one of the training parameter classes:
            LLMTrainingParams, GenericParams, TabularParams,
            TextClassificationParams, TextRegressionParams,
            SentenceTransformersParams, ExtractiveQuestionAnsweringParams,
            TokenClassificationParams, ImageClassificationParams,
            ObjectDetectionParams, ImageRegressionParams, Seq2SeqParams,
            VLMTrainingParams.

    Returns:
        list: A list of command line arguments to be executed for training.

    Raises:
        ValueError: If the provided params type is unsupported.
    """
    # Sanitize the project name: keep only the first shell token.
    params.project_name = shlex.split(params.project_name)[0]

    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
    elif torch.backends.mps.is_available():
        # Apple MPS counts as a single accelerator.
        num_gpus = 1
    else:
        num_gpus = 0

    config_path = os.path.join(params.project_name, "training_params.json")

    if isinstance(params, LLMTrainingParams):
        # Copy defensively: get_accelerate_command may return a shared
        # module-level constant, which we must not mutate in place.
        cmd = list(get_accelerate_command(num_gpus, params.gradient_accumulation, params.distributed_backend))
        if num_gpus > 0:
            cmd += _mixed_precision_args(params.mixed_precision)
        cmd += ["-m", "autotrain.trainers.clm", "--training_config", config_path]
    elif isinstance(params, GenericParams):
        # NOTE: the generic trainer takes `--config`, not `--training_config`.
        cmd = ["python", "-m", "autotrain.trainers.generic", "--config", config_path]
    elif isinstance(params, TabularParams):
        cmd = ["python", "-m", "autotrain.trainers.tabular", "--training_config", config_path]
    elif isinstance(
        params,
        (
            TextClassificationParams,
            TextRegressionParams,
            SentenceTransformersParams,
            ExtractiveQuestionAnsweringParams,
            TokenClassificationParams,
            ImageClassificationParams,
            ObjectDetectionParams,
            ImageRegressionParams,
        ),
    ):
        # All of these trainers share the same accelerate prefix and
        # mixed-precision handling; only the trainer module differs.
        cmd = _base_accelerate_cmd(num_gpus)
        if num_gpus > 0:
            cmd += _mixed_precision_args(params.mixed_precision)
        if isinstance(params, TextRegressionParams):
            module = "autotrain.trainers.text_regression"
        elif isinstance(params, SentenceTransformersParams):
            module = "autotrain.trainers.sent_transformers"
        elif isinstance(params, ExtractiveQuestionAnsweringParams):
            module = "autotrain.trainers.extractive_question_answering"
        elif isinstance(params, TextClassificationParams):
            module = "autotrain.trainers.text_classification"
        elif isinstance(params, TokenClassificationParams):
            module = "autotrain.trainers.token_classification"
        elif isinstance(params, ObjectDetectionParams):
            module = "autotrain.trainers.object_detection"
        elif isinstance(params, ImageRegressionParams):
            module = "autotrain.trainers.image_regression"
        else:
            module = "autotrain.trainers.image_classification"
        cmd += ["-m", module, "--training_config", config_path]
    elif isinstance(params, (Seq2SeqParams, VLMTrainingParams)):
        cmd = _deepspeed_capable_cmd(num_gpus, params)
        if num_gpus > 0:
            cmd += _mixed_precision_args(params.mixed_precision)
        module = "autotrain.trainers.seq2seq" if isinstance(params, Seq2SeqParams) else "autotrain.trainers.vlm"
        cmd += ["-m", module, "--training_config", config_path]
    else:
        raise ValueError("Unsupported params type")

    logger.info(cmd)
    logger.info(params)
    return cmd