global_batch_size: 512; micro_batch_size: 2; class_path: grokadamw.GrokAdamW
Browse files
- scripts/pretrain_core_model.yaml +15 -15
scripts/pretrain_core_model.yaml
CHANGED
@@ -61,7 +61,7 @@ train:
|
|
61 |
global_batch_size: 512
|
62 |
|
63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
64 |
-
micro_batch_size:
|
65 |
|
66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
67 |
lr_warmup_steps: 500
|
@@ -132,23 +132,23 @@ eval:
|
|
132 |
# rho: 0.05
|
133 |
# weight_decay: 0.1
|
134 |
|
135 |
-
# optimizer:
|
136 |
-
# class_path: grokadamw.GrokAdamW
|
137 |
-
# init_args:
|
138 |
-
# # (type: float, default: 0.001)
|
139 |
-
# lr: 3e-4
|
140 |
-
# # (type: float, default: 0.01)
|
141 |
-
# weight_decay: 1e-2
|
142 |
-
# # (type: tuple, default: (0.9,0.999))
|
143 |
-
# betas:
|
144 |
-
# - 0.9
|
145 |
-
# - 0.999
|
146 |
-
|
147 |
optimizer:
|
148 |
-
class_path:
|
149 |
init_args:
|
|
|
150 |
lr: 3e-4
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
154 |
devices: auto
|
|
|
61 |
global_batch_size: 512
|
62 |
|
63 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
64 |
+
micro_batch_size: 2
|
65 |
|
66 |
# Number of iterations with learning rate warmup active (type: int, default: 2000)
|
67 |
lr_warmup_steps: 500
|
|
|
132 |
# rho: 0.05
|
133 |
# weight_decay: 0.1
|
134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
optimizer:
|
136 |
+
class_path: grokadamw.GrokAdamW
|
137 |
init_args:
|
138 |
+
# (type: float, default: 0.001)
|
139 |
lr: 3e-4
|
140 |
+
# (type: float, default: 0.01)
|
141 |
+
weight_decay: 1e-2
|
142 |
+
# (type: tuple, default: (0.9,0.999))
|
143 |
+
betas:
|
144 |
+
- 0.9
|
145 |
+
- 0.95
|
146 |
+
|
147 |
+
# optimizer:
|
148 |
+
# class_path: muon.Muon
|
149 |
+
# init_args:
|
150 |
+
# lr: 3e-4
|
151 |
+
# weight_decay: 0.01
|
152 |
|
153 |
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
|
154 |
devices: auto
|