Training in progress, step 2500, checkpoint
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1985749813398ab678fa16f868bbbedc4997e1118b6ce01f98888b06f4bc92c0
 size 4140280
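The model weights live in model.safetensors; both sides of the diff report the same 4140280-byte size, so only the tensor values changed, not the shapes. A minimal sketch for inspecting the file locally, assuming the safetensors library and a hypothetical local path:

```python
from safetensors.torch import load_file  # pip install safetensors torch

# Hypothetical local path to the checkpoint pulled from this repo.
state_dict = load_file("last-checkpoint/model.safetensors")
for name, tensor in state_dict.items():
    print(name, tuple(tensor.shape), str(tensor.dtype))
```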
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8ac70b08e8cc492bc90f894d12a59aabefeae26eb6f1e2f8d8d611266df82c50
 size 4291766
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:97474a95cf2d0e6166f036d8937e33ebebb2adb23cf1177f88edc10dc549c905
 size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7b204b20cdc92a140e2e21e015bdaa04af008c00e0bde30e59edf0f23817a338
 size 1064
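All four binary files above are stored as Git LFS pointers, so each diff touches only the pointer's three lines: the spec version, the sha256 oid, and the byte size. A minimal sketch for checking a downloaded blob against its pointer; the pointer format is exactly what the diffs show, while the paths are hypothetical:

```python
import hashlib
from pathlib import Path

def parse_lfs_pointer(text: str) -> dict:
    """Parse the three-line Git LFS pointer format shown above."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "sha256": fields["oid"].split(":", 1)[1],  # "oid sha256:<hex digest>"
        "size": int(fields["size"]),
    }

def verify_blob(pointer_path: str, blob_path: str) -> bool:
    """True if the blob's byte size and sha256 digest match the pointer."""
    ptr = parse_lfs_pointer(Path(pointer_path).read_text())
    blob = Path(blob_path).read_bytes()
    return (
        len(blob) == ptr["size"]
        and hashlib.sha256(blob).hexdigest() == ptr["sha256"]
    )

# Example (hypothetical local paths):
# verify_blob("optimizer.pt.pointer", "last-checkpoint/optimizer.pt")
```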
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.7398638650488311,
   "eval_steps": 200,
-  "global_step":
+  "global_step": 2500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1495,6 +1495,372 @@
       "eval_samples_per_second": 136.288,
       "eval_steps_per_second": 34.117,
       "step": 2000
+    },
+    {
+      "epoch": 0.5948505474992601,
+      "grad_norm": 0.69921875,
+      "learning_rate": 2.0280410844810428e-05,
+      "loss": 8.0312,
+      "step": 2010
+    },
+    {
+      "epoch": 0.5978100029594554,
+      "grad_norm": 0.62890625,
+      "learning_rate": 1.9488821249060297e-05,
+      "loss": 8.0663,
+      "step": 2020
+    },
+    {
+      "epoch": 0.6007694584196508,
+      "grad_norm": 0.7109375,
+      "learning_rate": 1.871131877836879e-05,
+      "loss": 8.0353,
+      "step": 2030
+    },
+    {
+      "epoch": 0.6037289138798461,
+      "grad_norm": 0.80859375,
+      "learning_rate": 1.7948039473155554e-05,
+      "loss": 7.9639,
+      "step": 2040
+    },
+    {
+      "epoch": 0.6066883693400414,
+      "grad_norm": 1.4765625,
+      "learning_rate": 1.7199116885197995e-05,
+      "loss": 8.0724,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6096478248002367,
+      "grad_norm": 0.6953125,
+      "learning_rate": 1.646468205426377e-05,
+      "loss": 8.0524,
+      "step": 2060
+    },
+    {
+      "epoch": 0.6126072802604321,
+      "grad_norm": 0.59375,
+      "learning_rate": 1.5744863485182537e-05,
+      "loss": 8.0682,
+      "step": 2070
+    },
+    {
+      "epoch": 0.6155667357206274,
+      "grad_norm": 0.71484375,
+      "learning_rate": 1.5039787125361326e-05,
+      "loss": 8.0506,
+      "step": 2080
+    },
+    {
+      "epoch": 0.6185261911808227,
+      "grad_norm": 1.015625,
+      "learning_rate": 1.4349576342747462e-05,
+      "loss": 8.0654,
+      "step": 2090
+    },
+    {
+      "epoch": 0.621485646641018,
+      "grad_norm": 1.6875,
+      "learning_rate": 1.3674351904242611e-05,
+      "loss": 7.922,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6244451021012134,
+      "grad_norm": 0.625,
+      "learning_rate": 1.3014231954572287e-05,
+      "loss": 8.0273,
+      "step": 2110
+    },
+    {
+      "epoch": 0.6274045575614087,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.2369331995613665e-05,
+      "loss": 8.0164,
+      "step": 2120
+    },
+    {
+      "epoch": 0.630364013021604,
+      "grad_norm": 0.76953125,
+      "learning_rate": 1.173976486618631e-05,
+      "loss": 8.0,
+      "step": 2130
+    },
+    {
+      "epoch": 0.6333234684817993,
+      "grad_norm": 0.95703125,
+      "learning_rate": 1.1125640722308628e-05,
+      "loss": 8.1147,
+      "step": 2140
+    },
+    {
+      "epoch": 0.6362829239419947,
+      "grad_norm": 1.578125,
+      "learning_rate": 1.0527067017923654e-05,
+      "loss": 7.8947,
+      "step": 2150
+    },
+    {
+      "epoch": 0.63924237940219,
+      "grad_norm": 0.58984375,
+      "learning_rate": 9.944148486097793e-06,
+      "loss": 8.0644,
+      "step": 2160
+    },
+    {
+      "epoch": 0.6422018348623854,
+      "grad_norm": 0.59765625,
+      "learning_rate": 9.376987120695545e-06,
+      "loss": 8.0381,
+      "step": 2170
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 0.75,
+      "learning_rate": 8.825682158533554e-06,
+      "loss": 7.944,
+      "step": 2180
+    },
+    {
+      "epoch": 0.648120745782776,
+      "grad_norm": 0.94140625,
+      "learning_rate": 8.290330062017016e-06,
+      "loss": 8.0418,
+      "step": 2190
+    },
+    {
+      "epoch": 0.6510802012429713,
+      "grad_norm": 1.3359375,
+      "learning_rate": 7.771024502261526e-06,
+      "loss": 8.0265,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6510802012429713,
+      "eval_loss": 8.027557373046875,
+      "eval_runtime": 11.1335,
+      "eval_samples_per_second": 134.908,
+      "eval_steps_per_second": 33.772,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6540396567031667,
+      "grad_norm": 0.5703125,
+      "learning_rate": 7.267856342703461e-06,
+      "loss": 8.093,
+      "step": 2210
+    },
+    {
+      "epoch": 0.6569991121633619,
+      "grad_norm": 0.69140625,
+      "learning_rate": 6.780913623201346e-06,
+      "loss": 7.9956,
+      "step": 2220
+    },
+    {
+      "epoch": 0.6599585676235573,
+      "grad_norm": 0.8125,
+      "learning_rate": 6.310281544631546e-06,
+      "loss": 8.0602,
+      "step": 2230
+    },
+    {
+      "epoch": 0.6629180230837526,
+      "grad_norm": 0.83203125,
+      "learning_rate": 5.856042453980526e-06,
+      "loss": 8.0176,
+      "step": 2240
+    },
+    {
+      "epoch": 0.665877478543948,
+      "grad_norm": 1.78125,
+      "learning_rate": 5.418275829936537e-06,
+      "loss": 7.899,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6688369340041432,
+      "grad_norm": 0.6015625,
+      "learning_rate": 4.997058268983135e-06,
+      "loss": 8.1306,
+      "step": 2260
+    },
+    {
+      "epoch": 0.6717963894643386,
+      "grad_norm": 0.6875,
+      "learning_rate": 4.592463471997022e-06,
+      "loss": 8.0341,
+      "step": 2270
+    },
+    {
+      "epoch": 0.6747558449245339,
+      "grad_norm": 0.62109375,
+      "learning_rate": 4.204562231352516e-06,
+      "loss": 8.0087,
+      "step": 2280
+    },
+    {
+      "epoch": 0.6777153003847292,
+      "grad_norm": 0.765625,
+      "learning_rate": 3.83342241853496e-06,
+      "loss": 8.0114,
+      "step": 2290
+    },
+    {
+      "epoch": 0.6806747558449245,
+      "grad_norm": 1.625,
+      "learning_rate": 3.4791089722651436e-06,
+      "loss": 7.88,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6836342113051198,
+      "grad_norm": 0.72265625,
+      "learning_rate": 3.1416838871368924e-06,
+      "loss": 7.9878,
+      "step": 2310
+    },
+    {
+      "epoch": 0.6865936667653152,
+      "grad_norm": 0.6328125,
+      "learning_rate": 2.821206202769899e-06,
+      "loss": 8.0762,
+      "step": 2320
+    },
+    {
+      "epoch": 0.6895531222255105,
+      "grad_norm": 1.046875,
+      "learning_rate": 2.5177319934794e-06,
+      "loss": 7.9838,
+      "step": 2330
+    },
+    {
+      "epoch": 0.6925125776857058,
+      "grad_norm": 0.87109375,
+      "learning_rate": 2.2313143584648423e-06,
+      "loss": 8.0461,
+      "step": 2340
+    },
+    {
+      "epoch": 0.6954720331459011,
+      "grad_norm": 1.515625,
+      "learning_rate": 1.9620034125190644e-06,
+      "loss": 8.1426,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6984314886060965,
+      "grad_norm": 0.55859375,
+      "learning_rate": 1.7098462772596302e-06,
+      "loss": 7.9893,
+      "step": 2360
+    },
+    {
+      "epoch": 0.7013909440662918,
+      "grad_norm": 0.66015625,
+      "learning_rate": 1.4748870728839347e-06,
+      "loss": 8.0056,
+      "step": 2370
+    },
+    {
+      "epoch": 0.7043503995264871,
+      "grad_norm": 0.58984375,
+      "learning_rate": 1.2571669104494256e-06,
+      "loss": 8.0057,
+      "step": 2380
+    },
+    {
+      "epoch": 0.7073098549866824,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.0567238846803996e-06,
+      "loss": 7.9673,
+      "step": 2390
+    },
+    {
+      "epoch": 0.7102693104468778,
+      "grad_norm": 1.03125,
+      "learning_rate": 8.735930673024806e-07,
+      "loss": 7.8113,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7102693104468778,
+      "eval_loss": 8.03307819366455,
+      "eval_runtime": 11.143,
+      "eval_samples_per_second": 134.794,
+      "eval_steps_per_second": 33.743,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7132287659070731,
+      "grad_norm": 0.6171875,
+      "learning_rate": 7.078065009060941e-07,
+      "loss": 7.9873,
+      "step": 2410
+    },
+    {
+      "epoch": 0.7161882213672685,
+      "grad_norm": 0.62109375,
+      "learning_rate": 5.593931933399854e-07,
+      "loss": 8.0612,
+      "step": 2420
+    },
+    {
+      "epoch": 0.7191476768274637,
+      "grad_norm": 0.69140625,
+      "learning_rate": 4.2837911263562404e-07,
+      "loss": 8.0285,
+      "step": 2430
+    },
+    {
+      "epoch": 0.7221071322876591,
+      "grad_norm": 0.921875,
+      "learning_rate": 3.1478718246357173e-07,
+      "loss": 8.0167,
+      "step": 2440
+    },
+    {
+      "epoch": 0.7250665877478544,
+      "grad_norm": 1.8203125,
+      "learning_rate": 2.1863727812254653e-07,
+      "loss": 8.1032,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7280260432080498,
+      "grad_norm": 0.5390625,
+      "learning_rate": 1.3994622306173765e-07,
+      "loss": 8.0447,
+      "step": 2460
+    },
+    {
+      "epoch": 0.730985498668245,
+      "grad_norm": 0.609375,
+      "learning_rate": 7.872778593728258e-08,
+      "loss": 8.0293,
+      "step": 2470
+    },
+    {
+      "epoch": 0.7339449541284404,
+      "grad_norm": 0.78125,
+      "learning_rate": 3.499267820307184e-08,
+      "loss": 8.0606,
+      "step": 2480
+    },
+    {
+      "epoch": 0.7369044095886357,
+      "grad_norm": 0.8515625,
+      "learning_rate": 8.748552236603757e-09,
+      "loss": 8.0531,
+      "step": 2490
+    },
+    {
+      "epoch": 0.7398638650488311,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.0,
+      "loss": 7.9245,
+      "step": 2500
     }
   ],
   "logging_steps": 10,
@@ -1509,12 +1875,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop":
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 128870887981056.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null
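The updated trainer_state.json marks the end of the run: global_step is 2500 at epoch ≈ 0.7399 (implying roughly 2500 / 0.7399 ≈ 3379 optimizer steps per full epoch), the learning rate has decayed all the way to 0.0, and should_training_stop is now true. A minimal sketch for summarizing the state file; it assumes only the JSON layout visible in the diff, plus a hypothetical local path:

```python
import json
from pathlib import Path

# Hypothetical local path to the committed checkpoint directory.
state = json.loads(Path("last-checkpoint/trainer_state.json").read_text())

print("global_step:", state["global_step"])
print("epoch:", state["epoch"])
print("implied steps/epoch:", round(state["global_step"] / state["epoch"]))

# log_history holds the per-10-step training logs and per-200-step evals
# added in this commit; train entries carry "loss", eval entries "eval_loss".
last_train = [e for e in state["log_history"] if "loss" in e][-1]
last_eval = [e for e in state["log_history"] if "eval_loss" in e][-1]
print("final train loss:", last_train["loss"], "at lr", last_train["learning_rate"])
print("last eval loss:", last_eval["eval_loss"], "at step", last_eval["step"])
```

If training ever needed to continue past this point, Hugging Face's `Trainer.train(resume_from_checkpoint=...)` can restart from a checkpoint directory like this one, though the exact setup depends on the original training script.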