Spaces:

hardiktiwari
/

tensora-autotrain

Sleeping

File size: 16,561 Bytes

33d4721

import os
import shlex

import torch

from autotrain import logger
from autotrain.trainers.clm.params import LLMTrainingParams
from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams
from autotrain.trainers.generic.params import GenericParams
from autotrain.trainers.image_classification.params import ImageClassificationParams
from autotrain.trainers.image_regression.params import ImageRegressionParams
from autotrain.trainers.object_detection.params import ObjectDetectionParams
from autotrain.trainers.sent_transformers.params import SentenceTransformersParams
from autotrain.trainers.seq2seq.params import Seq2SeqParams
from autotrain.trainers.tabular.params import TabularParams
from autotrain.trainers.text_classification.params import TextClassificationParams
from autotrain.trainers.text_regression.params import TextRegressionParams
from autotrain.trainers.token_classification.params import TokenClassificationParams
from autotrain.trainers.vlm.params import VLMTrainingParams


CPU_COMMAND = [
    "accelerate",
    "launch",
    "--cpu",
]

SINGLE_GPU_COMMAND = [
    "accelerate",
    "launch",
    "--num_machines",
    "1",
    "--num_processes",
    "1",
]


def get_accelerate_command(num_gpus, gradient_accumulation_steps=1, distributed_backend=None):
    """
    Generates the appropriate command to launch a training job using the `accelerate` library based on the number of GPUs
    and the specified distributed backend.

    Args:
        num_gpus (int): The number of GPUs available for training. If 0, training will be forced on CPU.
        gradient_accumulation_steps (int, optional): The number of gradient accumulation steps. Defaults to 1.
        distributed_backend (str, optional): The distributed backend to use. Can be "ddp" (Distributed Data Parallel),
                                             "deepspeed", or None. Defaults to None.

    Returns:
        list or str: The command to be executed as a list of strings. If no GPU is found, returns a CPU command string.
                     If a single GPU is found, returns a single GPU command string. Otherwise, returns a list of
                     command arguments for multi-GPU or DeepSpeed training.

    Raises:
        ValueError: If an unsupported distributed backend is specified.
    """
    if num_gpus == 0:
        logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
        return CPU_COMMAND

    if num_gpus == 1:
        return SINGLE_GPU_COMMAND

    if distributed_backend in ("ddp", None):
        return [
            "accelerate",
            "launch",
            "--multi_gpu",
            "--num_machines",
            "1",
            "--num_processes",
            str(num_gpus),
        ]
    elif distributed_backend == "deepspeed":
        return [
            "accelerate",
            "launch",
            "--use_deepspeed",
            "--zero_stage",
            "3",
            "--offload_optimizer_device",
            "none",
            "--offload_param_device",
            "none",
            "--zero3_save_16bit_model",
            "true",
            "--zero3_init_flag",
            "true",
            "--deepspeed_multinode_launcher",
            "standard",
            "--gradient_accumulation_steps",
            str(gradient_accumulation_steps),
        ]
    else:
        raise ValueError("Unsupported distributed backend")


def launch_command(params):
    """
    Launches the appropriate training command based on the type of training parameters provided.

    Args:
        params (object): An instance of one of the training parameter classes. This can be one of the following:
            - LLMTrainingParams
            - GenericParams
            - TabularParams
            - TextClassificationParams
            - TextRegressionParams
            - SentenceTransformersParams
            - ExtractiveQuestionAnsweringParams
            - TokenClassificationParams
            - ImageClassificationParams
            - ObjectDetectionParams
            - ImageRegressionParams
            - Seq2SeqParams
            - VLMTrainingParams

    Returns:
        list: A list of command line arguments to be executed for training.

    Raises:
        ValueError: If the provided params type is unsupported.
    """

    params.project_name = shlex.split(params.project_name)[0]
    cuda_available = torch.cuda.is_available()
    mps_available = torch.backends.mps.is_available()
    if cuda_available:
        num_gpus = torch.cuda.device_count()
    elif mps_available:
        num_gpus = 1
    else:
        num_gpus = 0
    if isinstance(params, LLMTrainingParams):
        cmd = get_accelerate_command(num_gpus, params.gradient_accumulation, params.distributed_backend)
        if num_gpus > 0:
            cmd.append("--mixed_precision")
            if params.mixed_precision == "fp16":
                cmd.append("fp16")
            elif params.mixed_precision == "bf16":
                cmd.append("bf16")
            else:
                cmd.append("no")

        cmd.extend(
            [
                "-m",
                "autotrain.trainers.clm",
                "--training_config",
                os.path.join(params.project_name, "training_params.json"),
            ]
        )

    elif isinstance(params, GenericParams):
        cmd = [
            "python",
            "-m",
            "autotrain.trainers.generic",
            "--config",
            os.path.join(params.project_name, "training_params.json"),
        ]
    elif isinstance(params, TabularParams):
        cmd = [
            "python",
            "-m",
            "autotrain.trainers.tabular",
            "--training_config",
            os.path.join(params.project_name, "training_params.json"),
        ]
    elif (
        isinstance(params, TextClassificationParams)
        or isinstance(params, TextRegressionParams)
        or isinstance(params, SentenceTransformersParams)
        or isinstance(params, ExtractiveQuestionAnsweringParams)
    ):
        if num_gpus == 0:
            cmd = [
                "accelerate",
                "launch",
                "--cpu",
            ]
        elif num_gpus == 1:
            cmd = [
                "accelerate",
                "launch",
                "--num_machines",
                "1",
                "--num_processes",
                "1",
            ]
        else:
            cmd = [
                "accelerate",
                "launch",
                "--multi_gpu",
                "--num_machines",
                "1",
                "--num_processes",
                str(num_gpus),
            ]

        if num_gpus > 0:
            cmd.append("--mixed_precision")
            if params.mixed_precision == "fp16":
                cmd.append("fp16")
            elif params.mixed_precision == "bf16":
                cmd.append("bf16")
            else:
                cmd.append("no")

        if isinstance(params, TextRegressionParams):
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.text_regression",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
        elif isinstance(params, SentenceTransformersParams):
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.sent_transformers",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
        elif isinstance(params, ExtractiveQuestionAnsweringParams):
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.extractive_question_answering",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
        else:
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.text_classification",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
    elif isinstance(params, TokenClassificationParams):
        if num_gpus == 0:
            cmd = [
                "accelerate",
                "launch",
                "--cpu",
            ]
        elif num_gpus == 1:
            cmd = [
                "accelerate",
                "launch",
                "--num_machines",
                "1",
                "--num_processes",
                "1",
            ]
        else:
            cmd = [
                "accelerate",
                "launch",
                "--multi_gpu",
                "--num_machines",
                "1",
                "--num_processes",
                str(num_gpus),
            ]

        if num_gpus > 0:
            cmd.append("--mixed_precision")
            if params.mixed_precision == "fp16":
                cmd.append("fp16")
            elif params.mixed_precision == "bf16":
                cmd.append("bf16")
            else:
                cmd.append("no")

        cmd.extend(
            [
                "-m",
                "autotrain.trainers.token_classification",
                "--training_config",
                os.path.join(params.project_name, "training_params.json"),
            ]
        )
    elif (
        isinstance(params, ImageClassificationParams)
        or isinstance(params, ObjectDetectionParams)
        or isinstance(params, ImageRegressionParams)
    ):
        if num_gpus == 0:
            cmd = [
                "accelerate",
                "launch",
                "--cpu",
            ]
        elif num_gpus == 1:
            cmd = [
                "accelerate",
                "launch",
                "--num_machines",
                "1",
                "--num_processes",
                "1",
            ]
        else:
            cmd = [
                "accelerate",
                "launch",
                "--multi_gpu",
                "--num_machines",
                "1",
                "--num_processes",
                str(num_gpus),
            ]

        if num_gpus > 0:
            cmd.append("--mixed_precision")
            if params.mixed_precision == "fp16":
                cmd.append("fp16")
            elif params.mixed_precision == "bf16":
                cmd.append("bf16")
            else:
                cmd.append("no")

        if isinstance(params, ObjectDetectionParams):
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.object_detection",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
        elif isinstance(params, ImageRegressionParams):
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.image_regression",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
        else:
            cmd.extend(
                [
                    "-m",
                    "autotrain.trainers.image_classification",
                    "--training_config",
                    os.path.join(params.project_name, "training_params.json"),
                ]
            )
    elif isinstance(params, Seq2SeqParams):
        if num_gpus == 0:
            logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
            cmd = [
                "accelerate",
                "launch",
                "--cpu",
            ]
        elif num_gpus == 1:
            cmd = [
                "accelerate",
                "launch",
                "--num_machines",
                "1",
                "--num_processes",
                "1",
            ]
        elif num_gpus == 2:
            cmd = [
                "accelerate",
                "launch",
                "--multi_gpu",
                "--num_machines",
                "1",
                "--num_processes",
                "2",
            ]
        else:
            if params.quantization in ("int8", "int4") and params.peft and params.mixed_precision == "bf16":
                cmd = [
                    "accelerate",
                    "launch",
                    "--multi_gpu",
                    "--num_machines",
                    "1",
                    "--num_processes",
                    str(num_gpus),
                ]
            else:
                cmd = [
                    "accelerate",
                    "launch",
                    "--use_deepspeed",
                    "--zero_stage",
                    "3",
                    "--offload_optimizer_device",
                    "none",
                    "--offload_param_device",
                    "none",
                    "--zero3_save_16bit_model",
                    "true",
                    "--zero3_init_flag",
                    "true",
                    "--deepspeed_multinode_launcher",
                    "standard",
                    "--gradient_accumulation_steps",
                    str(params.gradient_accumulation),
                ]
        if num_gpus > 0:
            cmd.append("--mixed_precision")
            if params.mixed_precision == "fp16":
                cmd.append("fp16")
            elif params.mixed_precision == "bf16":
                cmd.append("bf16")
            else:
                cmd.append("no")

        cmd.extend(
            [
                "-m",
                "autotrain.trainers.seq2seq",
                "--training_config",
                os.path.join(params.project_name, "training_params.json"),
            ]
        )

    elif isinstance(params, VLMTrainingParams):
        if num_gpus == 0:
            logger.warning("No GPU found. Forcing training on CPU. This will be super slow!")
            cmd = [
                "accelerate",
                "launch",
                "--cpu",
            ]
        elif num_gpus == 1:
            cmd = [
                "accelerate",
                "launch",
                "--num_machines",
                "1",
                "--num_processes",
                "1",
            ]
        elif num_gpus == 2:
            cmd = [
                "accelerate",
                "launch",
                "--multi_gpu",
                "--num_machines",
                "1",
                "--num_processes",
                "2",
            ]
        else:
            if params.quantization in ("int8", "int4") and params.peft and params.mixed_precision == "bf16":
                cmd = [
                    "accelerate",
                    "launch",
                    "--multi_gpu",
                    "--num_machines",
                    "1",
                    "--num_processes",
                    str(num_gpus),
                ]
            else:
                cmd = [
                    "accelerate",
                    "launch",
                    "--use_deepspeed",
                    "--zero_stage",
                    "3",
                    "--offload_optimizer_device",
                    "none",
                    "--offload_param_device",
                    "none",
                    "--zero3_save_16bit_model",
                    "true",
                    "--zero3_init_flag",
                    "true",
                    "--deepspeed_multinode_launcher",
                    "standard",
                    "--gradient_accumulation_steps",
                    str(params.gradient_accumulation),
                ]

        if num_gpus > 0:
            cmd.append("--mixed_precision")
            if params.mixed_precision == "fp16":
                cmd.append("fp16")
            elif params.mixed_precision == "bf16":
                cmd.append("bf16")
            else:
                cmd.append("no")

        cmd.extend(
            [
                "-m",
                "autotrain.trainers.vlm",
                "--training_config",
                os.path.join(params.project_name, "training_params.json"),
            ]
        )

    else:
        raise ValueError("Unsupported params type")

    logger.info(cmd)
    logger.info(params)
    return cmd