mtasic85 committed
Commit 2bbb90f · 1 Parent(s): c634109

global_batch_size: 512; micro_batch_size: 2; class_path: grokadamw.GrokAdamW

Files changed (1)
  1. scripts/pretrain_core_model.yaml +15 -15
scripts/pretrain_core_model.yaml CHANGED
@@ -61,7 +61,7 @@ train:
   global_batch_size: 512
 
   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 1
+  micro_batch_size: 2
 
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
   lr_warmup_steps: 500
@@ -132,23 +132,23 @@ eval:
 # rho: 0.05
 # weight_decay: 0.1
 
-# optimizer:
-#   class_path: grokadamw.GrokAdamW
-#   init_args:
-#     # (type: float, default: 0.001)
-#     lr: 3e-4
-#     # (type: float, default: 0.01)
-#     weight_decay: 1e-2
-#     # (type: tuple, default: (0.9,0.999))
-#     betas:
-#       - 0.9
-#       - 0.999
-
 optimizer:
-  class_path: muon.Muon
+  class_path: grokadamw.GrokAdamW
   init_args:
+    # (type: float, default: 0.001)
     lr: 3e-4
-    weight_decay: 0.01
+    # (type: float, default: 0.01)
+    weight_decay: 1e-2
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
+
+# optimizer:
+#   class_path: muon.Muon
+#   init_args:
+#     lr: 3e-4
+#     weight_decay: 0.01
 
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
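
The commit raises micro_batch_size from 1 to 2 while keeping global_batch_size at 512, so each optimizer step needs fewer accumulated micro-batches per data-parallel rank. A minimal sketch of the usual relationship between these values, assuming a litgpt-style trainer and an 8-GPU node (both assumptions on my part, not stated in this config):

```python
# Hypothetical helper, not from this repo: litgpt-style trainers typically derive
# gradient-accumulation steps from global/micro batch size and the device count.
def gradient_accumulation_iters(global_batch_size: int, micro_batch_size: int, devices: int) -> int:
    batch_size_per_device = global_batch_size // devices
    assert batch_size_per_device % micro_batch_size == 0, "global batch must split evenly"
    return batch_size_per_device // micro_batch_size

# With this commit's values on an assumed 8-GPU node:
# 512 // 8 = 64 samples per rank, 64 // 2 = 32 accumulated micro-batches per step
# (the previous micro_batch_size of 1 needed 64).
print(gradient_accumulation_iters(global_batch_size=512, micro_batch_size=2, devices=8))  # -> 32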
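The optimizer swap itself is purely a config change: in jsonargparse-style YAML like this, the class named by class_path is imported at runtime and constructed with init_args as keyword arguments, while any arguments not listed (here everything beyond lr, weight_decay, and betas) keep their package defaults. The previous muon.Muon block is kept commented out for easy switching back. A rough illustration of that resolution mechanism; the build_optimizer helper and the Linear stand-in model are illustrative, not this repo's code:

```python
import importlib

import torch


def build_optimizer(params, class_path: str, init_args: dict) -> torch.optim.Optimizer:
    # Split "grokadamw.GrokAdamW" into module and class name, import, and instantiate.
    module_name, class_name = class_path.rsplit(".", 1)
    optimizer_cls = getattr(importlib.import_module(module_name), class_name)
    return optimizer_cls(params, **init_args)


model = torch.nn.Linear(16, 16)  # stand-in for the real core model
optimizer = build_optimizer(
    model.parameters(),
    class_path="grokadamw.GrokAdamW",  # requires the grokadamw package to be installed
    init_args={"lr": 3e-4, "weight_decay": 1e-2, "betas": (0.9, 0.95)},
)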