defaults:
  - _self_
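# Hydra-style defaults list; `_self_` keeps this file's own values in the
# composition order (assumption: the config is loaded via Hydra/OmegaConf).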

# Model configuration
model:
  name: "unsloth/SmolLM2-135M-Instruct-bnb-4bit"
  max_seq_length: 2048  # RoPE scaling is handled automatically for longer contexts
  dtype: null  # null for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+ GPUs
  load_in_4bit: true  # Use 4bit quantization to reduce memory usage
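  # Illustrative sketch only (assumes the training script loads the model with
  # Unsloth; `cfg` is a placeholder for this parsed config):
  #   from unsloth import FastLanguageModel
  #   model, tokenizer = FastLanguageModel.from_pretrained(
  #       model_name=cfg.model.name,
  #       max_seq_length=cfg.model.max_seq_length,
  #       dtype=cfg.model.dtype,          # null -> auto-detect
  #       load_in_4bit=cfg.model.load_in_4bit,
  #   )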

# PEFT configuration
peft:
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: "none"
  use_gradient_checkpointing: "unsloth"
  random_state: 3407
  use_rslora: true
  loftq_config: null
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

# Dataset configuration
dataset:
  validation_split: 0.1  # 10% of data for validation
  seed: 3407  # Random seed for dataset splitting
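  # Illustrative sketch, assuming a Hugging Face `datasets` split
  # (`dataset` and `cfg` are placeholders):
  #   split = dataset.train_test_split(
  #       test_size=cfg.dataset.validation_split, seed=cfg.dataset.seed
  #   )
  #   train_ds, eval_ds = split["train"], split["test"]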

# Training configuration
training:
  args:
    per_device_train_batch_size: 2
    per_device_eval_batch_size: 2
    gradient_accumulation_steps: 16
    warmup_steps: 100
    max_steps: 120
    learning_rate: 5e-5
    logging_steps: 1
    save_strategy: "steps"
    save_steps: 30
    eval_strategy: "steps"
    eval_steps: 30
    save_total_limit: 2
    optim: "adamw_8bit"
    weight_decay: 0.01
    lr_scheduler_type: "cosine_with_restarts"
    seed: 3407
    output_dir: "outputs"
    gradient_checkpointing: true
    load_best_model_at_end: true
    metric_for_best_model: "eval_loss"
    greater_is_better: false
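    # Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    # = 2 * 16 = 32 sequences per optimizer step, so max_steps: 120 covers roughly
    # 120 * 32 = 3,840 training sequences. Note that warmup_steps: 100 spans most of
    # the 120-step run. These args presumably map field-for-field onto
    # transformers.TrainingArguments (or trl's SFTConfig).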

  sft:
    dataset_num_proc: 2
    packing: false
    data_collator:
      mlm: false
      pad_to_multiple_of: 8
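    # Illustrative sketch, assuming the standard Hugging Face collator
    # (`tokenizer` is a placeholder):
    #   from transformers import DataCollatorForLanguageModeling
    #   collator = DataCollatorForLanguageModeling(
    #       tokenizer=tokenizer, mlm=False, pad_to_multiple_of=8
    #   )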

# Output configuration
output:
  dir: "final_model"

# Training control
train: false
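# Presumably read by the entry-point script to decide whether the fine-tuning run
# is launched; set to true to actually train (assumption, not stated in this file).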