File size: 9,062 Bytes
33d4721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from typing import List, Optional, Union

from pydantic import Field

from autotrain.trainers.common import AutoTrainParams


class LLMTrainingParams(AutoTrainParams):
    """
    Configuration container for LLM training runs in the autotrain library.

    Fields are grouped by concern; every field carries its default in the
    `Field(default=...)` declaration below.

    Model / project:
        model (str): Model name to be used for training (default "gpt2").
        project_name (str): Name of the project and output directory (default "project-name").

    Data:
        data_path (str): Path to the dataset (default "data").
        train_split (str): Configuration for the training data split (default "train").
        valid_split (Optional[str]): Configuration for the validation data split (default None).
        add_eos_token (bool): Whether to add an EOS token at the end of sequences (default True).
        block_size (Union[int, List[int]]): Block size(s) for training; a single int or a list
            of ints (default -1).
        model_max_length (int): Maximum length of the model input (default 2048).
        padding (Optional[str]): Side on which to pad sequences, "left" or "right" (default "right").

    Trainer:
        trainer (str): Type of trainer to use (default "default").
        use_flash_attention_2 (bool): Whether to use flash attention version 2 (default False).
        log (str): Logging method for experiment tracking (default "none").
        disable_gradient_checkpointing (bool): Whether to disable gradient checkpointing (default False).
        logging_steps (int): Number of steps between logging events (default -1).
        eval_strategy (str): Strategy for evaluation, e.g. 'epoch' (default "epoch").
        save_total_limit (int): Maximum number of checkpoints to keep (default 1).
        auto_find_batch_size (bool): Whether to automatically find the optimal batch size (default False).
        mixed_precision (Optional[str]): Mixed-precision mode: 'fp16', 'bf16', or None (default None).
        lr (float): Learning rate for training (default 3e-5).
        epochs (int): Number of training epochs (default 1).
        batch_size (int): Batch size for training (default 2).
        warmup_ratio (float): Proportion of training used for learning-rate warmup (default 0.1).
        gradient_accumulation (int): Steps to accumulate gradients before updating (default 4).
        optimizer (str): Optimizer to use for training (default "adamw_torch").
        scheduler (str): Learning rate scheduler to use (default "linear").
        weight_decay (float): Weight decay to apply to the optimizer (default 0.0).
        max_grad_norm (float): Maximum norm for gradient clipping (default 1.0).
        seed (int): Random seed for reproducibility (default 42).
        chat_template (Optional[str]): Chat template: None, zephyr, chatml, or tokenizer (default None).

    PEFT / quantization:
        quantization (Optional[str]): Quantization method: 'int4', 'int8', or None (default "int4").
        target_modules (Optional[str]): Target modules for quantization or fine-tuning (default "all-linear").
        merge_adapter (bool): Whether to merge the adapter layers (default False).
        peft (bool): Whether to use Parameter-Efficient Fine-Tuning (default False).
        lora_r (int): Rank of the LoRA matrices (default 16).
        lora_alpha (int): Alpha parameter for LoRA (default 32).
        lora_dropout (float): Dropout rate for LoRA (default 0.05).

    DPO / ORPO:
        model_ref (Optional[str]): Reference model for the DPO trainer (default None).
        dpo_beta (float): Beta parameter for the DPO trainer (default 0.1).
        max_prompt_length (int): Maximum length of the prompt (default 128).
        max_completion_length (Optional[int]): Maximum length of the completion (default None).

    Column mappings:
        prompt_text_column (Optional[str]): Column name for the prompt text (default None).
        text_column (str): Column name for the text data (default "text").
        rejected_text_column (Optional[str]): Column name for the rejected text data (default None).

    Hub:
        push_to_hub (bool): Whether to push the model to the Hugging Face Hub (default False).
        username (Optional[str]): Hugging Face username for authentication (default None).
        token (Optional[str]): Hugging Face token for authentication (default None).

    Misc:
        unsloth (bool): Whether to use the unsloth library (default False).
        distributed_backend (Optional[str]): Backend to use for distributed training (default None).
    """

    # --- model / project ---
    model: str = Field(default="gpt2", title="Model name to be used for training")
    project_name: str = Field(default="project-name", title="Name of the project and output directory")

    # --- data ---
    data_path: str = Field(default="data", title="Path to the dataset")
    train_split: str = Field(default="train", title="Configuration for the training data split")
    valid_split: Optional[str] = Field(default=None, title="Configuration for the validation data split")
    add_eos_token: bool = Field(default=True, title="Whether to add an EOS token at the end of sequences")
    block_size: Union[int, List[int]] = Field(
        default=-1,
        title="Size of the blocks for training, can be a single integer or a list of integers",
    )
    model_max_length: int = Field(default=2048, title="Maximum length of the model input")
    padding: Optional[str] = Field(default="right", title="Side on which to pad sequences (left or right)")

    # --- trainer ---
    trainer: str = Field(default="default", title="Type of trainer to use")
    use_flash_attention_2: bool = Field(default=False, title="Whether to use flash attention version 2")
    log: str = Field(default="none", title="Logging method for experiment tracking")
    disable_gradient_checkpointing: bool = Field(default=False, title="Whether to disable gradient checkpointing")
    logging_steps: int = Field(default=-1, title="Number of steps between logging events")
    eval_strategy: str = Field(default="epoch", title="Strategy for evaluation (e.g., 'epoch')")
    save_total_limit: int = Field(default=1, title="Maximum number of checkpoints to keep")
    auto_find_batch_size: bool = Field(default=False, title="Whether to automatically find the optimal batch size")
    mixed_precision: Optional[str] = Field(
        default=None,
        title="Type of mixed precision to use (e.g., 'fp16', 'bf16', or None)",
    )
    lr: float = Field(default=3e-5, title="Learning rate for training")
    epochs: int = Field(default=1, title="Number of training epochs")
    batch_size: int = Field(default=2, title="Batch size for training")
    warmup_ratio: float = Field(default=0.1, title="Proportion of training to perform learning rate warmup")
    gradient_accumulation: int = Field(default=4, title="Number of steps to accumulate gradients before updating")
    optimizer: str = Field(default="adamw_torch", title="Optimizer to use for training")
    scheduler: str = Field(default="linear", title="Learning rate scheduler to use")
    weight_decay: float = Field(default=0.0, title="Weight decay to apply to the optimizer")
    max_grad_norm: float = Field(default=1.0, title="Maximum norm for gradient clipping")
    seed: int = Field(default=42, title="Random seed for reproducibility")
    chat_template: Optional[str] = Field(
        default=None,
        title="Template for chat-based models, options include: None, zephyr, chatml, or tokenizer",
    )

    # --- peft / quantization ---
    quantization: Optional[str] = Field(
        default="int4", title="Quantization method to use (e.g., 'int4', 'int8', or None)"
    )
    target_modules: Optional[str] = Field(default="all-linear", title="Target modules for quantization or fine-tuning")
    merge_adapter: bool = Field(default=False, title="Whether to merge the adapter layers")
    peft: bool = Field(default=False, title="Whether to use Parameter-Efficient Fine-Tuning (PEFT)")
    lora_r: int = Field(default=16, title="Rank of the LoRA matrices")
    lora_alpha: int = Field(default=32, title="Alpha parameter for LoRA")
    lora_dropout: float = Field(default=0.05, title="Dropout rate for LoRA")

    # --- dpo ---
    model_ref: Optional[str] = Field(default=None, title="Reference model for DPO trainer")
    dpo_beta: float = Field(default=0.1, title="Beta parameter for DPO trainer")

    # --- orpo + dpo ---
    max_prompt_length: int = Field(default=128, title="Maximum length of the prompt")
    max_completion_length: Optional[int] = Field(default=None, title="Maximum length of the completion")

    # --- column mappings ---
    prompt_text_column: Optional[str] = Field(default=None, title="Column name for the prompt text")
    text_column: str = Field(default="text", title="Column name for the text data")
    rejected_text_column: Optional[str] = Field(default=None, title="Column name for the rejected text data")

    # --- hub ---
    push_to_hub: bool = Field(default=False, title="Whether to push the model to the Hugging Face Hub")
    username: Optional[str] = Field(default=None, title="Hugging Face username for authentication")
    token: Optional[str] = Field(default=None, title="Hugging Face token for authentication")

    # --- misc ---
    unsloth: bool = Field(default=False, title="Whether to use the unsloth library")
    distributed_backend: Optional[str] = Field(default=None, title="Backend to use for distributed training")