Refactor train.py to load its full configuration from config.yaml, covering model loading, dataset handling, and trainer setup. This centralizes model, PEFT, dataset, and training parameters, improving maintainability and flexibility.
defaults:
  - _self_

# Model configuration
model:
  name: "unsloth/SmolLM2-135M-Instruct-bnb-4bit"
  max_seq_length: 2048 # Auto supports RoPE scaling internally
  dtype: null # null for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
  load_in_4bit: true # Use 4-bit quantization to reduce memory usage

# PEFT configuration
peft:
  r: 64 # LoRA rank
  lora_alpha: 128 # LoRA scaling factor
  lora_dropout: 0.05
  bias: "none"
  use_gradient_checkpointing: "unsloth"
  random_state: 3407
  use_rslora: true # Rank-stabilized LoRA
  loftq_config: null
  target_modules: # Attention and MLP projection layers to adapt
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

# Dataset configuration
dataset:
  validation_split: 0.1 # 10% of data held out for validation
  seed: 3407 # Random seed for dataset splitting

# Training configuration
training:
  args: # Keys mirror transformers.TrainingArguments
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 2
    gradient_accumulation_steps: 16 # Effective batch size = 2 * 16 = 32 per device
    warmup_steps: 100
    max_steps: 120
    learning_rate: 5e-5
    logging_steps: 1
    save_strategy: "steps"
    save_steps: 30
    eval_strategy: "steps"
    eval_steps: 30
    save_total_limit: 2
    optim: "adamw_8bit"
    weight_decay: 0.01
    lr_scheduler_type: "cosine_with_restarts"
    seed: 3407
    output_dir: "outputs"
    gradient_checkpointing: true
    load_best_model_at_end: true
    metric_for_best_model: "eval_loss"
    greater_is_better: false
  sft: # SFTTrainer-specific options
    dataset_num_proc: 2
    packing: false
  data_collator:
    mlm: false # Causal LM objective, not masked LM
    pad_to_multiple_of: 8

# Output configuration
output:
  dir: "final_model" # Where the final model/tokenizer are saved

# Training control
train: false # Set to true to actually run training
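
For context on the refactor described in the commit message, a minimal sketch of how train.py might consume this file follows. It assumes Hydra resolves the defaults list and that Unsloth's FastLanguageModel and TRL's SFTTrainer handle model and trainer setup. The dataset source and text-field name are placeholders (neither appears in the config), and the keyword set passed to SFTTrainer assumes a TRL version that still accepts tokenizer, dataset_text_field, and max_seq_length directly, plus a recent transformers release for the eval_strategy argument. This is illustrative only, not the repository's actual train.py.

# Illustrative sketch only -- not the repository's actual train.py.
from unsloth import FastLanguageModel  # Unsloth recommends importing it before transformers/trl

import hydra
from datasets import load_dataset
from omegaconf import DictConfig, OmegaConf
from transformers import DataCollatorForLanguageModeling, TrainingArguments
from trl import SFTTrainer


@hydra.main(config_path=".", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    # Model: load the 4-bit base model through Unsloth.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=cfg.model.name,
        max_seq_length=cfg.model.max_seq_length,
        dtype=cfg.model.dtype,
        load_in_4bit=cfg.model.load_in_4bit,
    )

    # PEFT: attach LoRA adapters using the peft section verbatim.
    model = FastLanguageModel.get_peft_model(
        model, **OmegaConf.to_container(cfg.peft, resolve=True)
    )

    # Dataset: placeholder source, then split off a validation set.
    dataset = load_dataset("user/dataset", split="train")  # placeholder; not specified in config.yaml
    splits = dataset.train_test_split(
        test_size=cfg.dataset.validation_split, seed=cfg.dataset.seed
    )

    # Trainer: TrainingArguments and collator built directly from the config.
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
        dataset_text_field="text",  # assumed field name
        max_seq_length=cfg.model.max_seq_length,
        dataset_num_proc=cfg.training.sft.dataset_num_proc,
        packing=cfg.training.sft.packing,
        args=TrainingArguments(
            **OmegaConf.to_container(cfg.training.args, resolve=True)
        ),
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=cfg.training.data_collator.mlm,
            pad_to_multiple_of=cfg.training.data_collator.pad_to_multiple_of,
        ),
    )

    # Training control: the top-level `train` flag gates the actual run.
    if cfg.train:
        trainer.train()
        model.save_pretrained(cfg.output.dir)
        tokenizer.save_pretrained(cfg.output.dir)


if __name__ == "__main__":
    main()

Keeping the keys under training.args aligned with transformers.TrainingArguments is what allows the single dict unpack when building the trainer.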