mtasic85 committed
Commit 2bbb90f · 1 Parent(s): c634109

global_batch_size: 512; micro_batch_size: 2; class_path: grokadamw.GrokAdamW

Files changed (1)
  1. scripts/pretrain_core_model.yaml +15 -15
scripts/pretrain_core_model.yaml CHANGED
@@ -61,7 +61,7 @@ train:
   global_batch_size: 512
 
   # Number of samples per data-parallel rank (type: int, default: 4)
-  micro_batch_size: 1
+  micro_batch_size: 2
 
   # Number of iterations with learning rate warmup active (type: int, default: 2000)
   lr_warmup_steps: 500
@@ -132,23 +132,23 @@ eval:
 # rho: 0.05
 # weight_decay: 0.1
 
-# optimizer:
-#   class_path: grokadamw.GrokAdamW
-#   init_args:
-#     # (type: float, default: 0.001)
-#     lr: 3e-4
-#     # (type: float, default: 0.01)
-#     weight_decay: 1e-2
-#     # (type: tuple, default: (0.9,0.999))
-#     betas:
-#       - 0.9
-#       - 0.999
-
 optimizer:
-  class_path: muon.Muon
+  class_path: grokadamw.GrokAdamW
   init_args:
+    # (type: float, default: 0.001)
     lr: 3e-4
-    weight_decay: 0.01
+    # (type: float, default: 0.01)
+    weight_decay: 1e-2
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
+
+# optimizer:
+#   class_path: muon.Muon
+#   init_args:
+#     lr: 3e-4
+#     weight_decay: 0.01
 
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
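
The commit raises micro_batch_size from 1 to 2 while keeping global_batch_size at 512, so each optimizer step needs fewer accumulated micro-batches per data-parallel rank. A minimal sketch of the usual relationship between these values, assuming a litgpt-style trainer and an 8-GPU node (both assumptions on my part, not stated in this config):

```python
# Hypothetical helper, not from this repo: litgpt-style trainers typically derive
# gradient-accumulation steps from global/micro batch size and the device count.
def gradient_accumulation_iters(global_batch_size: int, micro_batch_size: int, devices: int) -> int:
    batch_size_per_device = global_batch_size // devices
    assert batch_size_per_device % micro_batch_size == 0, "global batch must split evenly"
    return batch_size_per_device // micro_batch_size

# With this commit's values on an assumed 8-GPU node:
# 512 // 8 = 64 samples per rank, 64 // 2 = 32 accumulated micro-batches per step
# (the previous micro_batch_size of 1 needed 64).
print(gradient_accumulation_iters(global_batch_size=512, micro_batch_size=2, devices=8))  # -> 32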
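The optimizer swap itself is purely a config change: in jsonargparse-style YAML like this, the class named by class_path is imported at runtime and constructed with init_args as keyword arguments, while any arguments not listed (here everything beyond lr, weight_decay, and betas) keep their package defaults. The previous muon.Muon block is kept commented out for easy switching back. A rough illustration of that resolution mechanism; the build_optimizer helper and the Linear stand-in model are illustrative, not this repo's code:

```python
import importlib

import torch


def build_optimizer(params, class_path: str, init_args: dict) -> torch.optim.Optimizer:
    # Split "grokadamw.GrokAdamW" into module and class name, import, and instantiate.
    module_name, class_name = class_path.rsplit(".", 1)
    optimizer_cls = getattr(importlib.import_module(module_name), class_name)
    return optimizer_cls(params, **init_args)


model = torch.nn.Linear(16, 16)  # stand-in for the real core model
optimizer = build_optimizer(
    model.parameters(),
    class_path="grokadamw.GrokAdamW",  # requires the grokadamw package to be installed
    init_args={"lr": 3e-4, "weight_decay": 1e-2, "betas": (0.9, 0.95)},
)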