selection_checkpoint/0_trainer_state.json
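This is the `trainer_state.json` that the Hugging Face `transformers` `Trainer` saves alongside a checkpoint. Its `log_history` array records the training loss, gradient norm, and learning rate every two optimizer steps up to `global_step` 2500 (epoch 1.0); the learning rate warms up linearly to 2e-05 over the first 250 steps and then appears to follow a cosine decay. A minimal sketch for inspecting the logged loss curve from the complete downloaded file (the local filename and path are assumptions, not part of this repository):

import json

# Load the trainer state (path is an assumption; point it at the downloaded file).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries in log_history carry epoch, grad_norm, learning_rate, loss, and step;
# evaluation entries (if any) are skipped by filtering on the "loss" key.
points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]

steps, losses = zip(*points)
print(f"logged points: {len(points)}")
print(f"first loss at step {steps[0]}: {losses[0]:.4f}")
print(f"last loss at step {steps[-1]}: {losses[-1]:.4f}")

The raw file contents follow.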
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008,
"grad_norm": 4.92004919052124,
"learning_rate": 1.6e-07,
"loss": 7.328,
"step": 2
},
{
"epoch": 0.0016,
"grad_norm": 4.8694047927856445,
"learning_rate": 3.2e-07,
"loss": 7.5093,
"step": 4
},
{
"epoch": 0.0024,
"grad_norm": 5.063427448272705,
"learning_rate": 4.800000000000001e-07,
"loss": 7.2167,
"step": 6
},
{
"epoch": 0.0032,
"grad_norm": 5.227718830108643,
"learning_rate": 6.4e-07,
"loss": 7.6708,
"step": 8
},
{
"epoch": 0.004,
"grad_norm": 5.120015621185303,
"learning_rate": 8.000000000000001e-07,
"loss": 7.493,
"step": 10
},
{
"epoch": 0.0048,
"grad_norm": 4.556617736816406,
"learning_rate": 9.600000000000001e-07,
"loss": 7.114,
"step": 12
},
{
"epoch": 0.0056,
"grad_norm": 4.134003162384033,
"learning_rate": 1.12e-06,
"loss": 7.0582,
"step": 14
},
{
"epoch": 0.0064,
"grad_norm": 3.023643970489502,
"learning_rate": 1.28e-06,
"loss": 6.2808,
"step": 16
},
{
"epoch": 0.0072,
"grad_norm": 2.4297778606414795,
"learning_rate": 1.44e-06,
"loss": 6.353,
"step": 18
},
{
"epoch": 0.008,
"grad_norm": 2.867222785949707,
"learning_rate": 1.6000000000000001e-06,
"loss": 6.0667,
"step": 20
},
{
"epoch": 0.0088,
"grad_norm": 1.387591004371643,
"learning_rate": 1.76e-06,
"loss": 5.6459,
"step": 22
},
{
"epoch": 0.0096,
"grad_norm": 2.594311475753784,
"learning_rate": 1.9200000000000003e-06,
"loss": 5.8506,
"step": 24
},
{
"epoch": 0.0104,
"grad_norm": 1.8094066381454468,
"learning_rate": 2.08e-06,
"loss": 5.6573,
"step": 26
},
{
"epoch": 0.0112,
"grad_norm": 0.9034324288368225,
"learning_rate": 2.24e-06,
"loss": 5.231,
"step": 28
},
{
"epoch": 0.012,
"grad_norm": 1.4179000854492188,
"learning_rate": 2.4000000000000003e-06,
"loss": 5.3384,
"step": 30
},
{
"epoch": 0.0128,
"grad_norm": 1.3942221403121948,
"learning_rate": 2.56e-06,
"loss": 5.385,
"step": 32
},
{
"epoch": 0.0136,
"grad_norm": 1.5524358749389648,
"learning_rate": 2.7200000000000002e-06,
"loss": 4.8145,
"step": 34
},
{
"epoch": 0.0144,
"grad_norm": 1.9171744585037231,
"learning_rate": 2.88e-06,
"loss": 5.5795,
"step": 36
},
{
"epoch": 0.0152,
"grad_norm": 2.5012264251708984,
"learning_rate": 3.04e-06,
"loss": 5.1458,
"step": 38
},
{
"epoch": 0.016,
"grad_norm": 1.894866943359375,
"learning_rate": 3.2000000000000003e-06,
"loss": 4.7375,
"step": 40
},
{
"epoch": 0.0168,
"grad_norm": 1.083099365234375,
"learning_rate": 3.3600000000000004e-06,
"loss": 4.659,
"step": 42
},
{
"epoch": 0.0176,
"grad_norm": 1.7351473569869995,
"learning_rate": 3.52e-06,
"loss": 4.3309,
"step": 44
},
{
"epoch": 0.0184,
"grad_norm": 1.2516050338745117,
"learning_rate": 3.6800000000000003e-06,
"loss": 4.2946,
"step": 46
},
{
"epoch": 0.0192,
"grad_norm": 1.946075439453125,
"learning_rate": 3.8400000000000005e-06,
"loss": 4.2101,
"step": 48
},
{
"epoch": 0.02,
"grad_norm": 2.922369956970215,
"learning_rate": 4.000000000000001e-06,
"loss": 4.5236,
"step": 50
},
{
"epoch": 0.0208,
"grad_norm": 2.0061962604522705,
"learning_rate": 4.16e-06,
"loss": 3.6378,
"step": 52
},
{
"epoch": 0.0216,
"grad_norm": 2.2109129428863525,
"learning_rate": 4.32e-06,
"loss": 3.1168,
"step": 54
},
{
"epoch": 0.0224,
"grad_norm": 2.374847173690796,
"learning_rate": 4.48e-06,
"loss": 2.9689,
"step": 56
},
{
"epoch": 0.0232,
"grad_norm": 2.2352538108825684,
"learning_rate": 4.6400000000000005e-06,
"loss": 2.2496,
"step": 58
},
{
"epoch": 0.024,
"grad_norm": 1.657749891281128,
"learning_rate": 4.800000000000001e-06,
"loss": 2.9017,
"step": 60
},
{
"epoch": 0.0248,
"grad_norm": 2.9958770275115967,
"learning_rate": 4.960000000000001e-06,
"loss": 2.094,
"step": 62
},
{
"epoch": 0.0256,
"grad_norm": 1.8515348434448242,
"learning_rate": 5.12e-06,
"loss": 1.9278,
"step": 64
},
{
"epoch": 0.0264,
"grad_norm": 2.1560873985290527,
"learning_rate": 5.28e-06,
"loss": 1.7917,
"step": 66
},
{
"epoch": 0.0272,
"grad_norm": 3.1304805278778076,
"learning_rate": 5.4400000000000004e-06,
"loss": 1.9759,
"step": 68
},
{
"epoch": 0.028,
"grad_norm": 3.0489299297332764,
"learning_rate": 5.600000000000001e-06,
"loss": 1.8824,
"step": 70
},
{
"epoch": 0.0288,
"grad_norm": 3.545546054840088,
"learning_rate": 5.76e-06,
"loss": 2.0749,
"step": 72
},
{
"epoch": 0.0296,
"grad_norm": 2.089024305343628,
"learning_rate": 5.92e-06,
"loss": 1.0014,
"step": 74
},
{
"epoch": 0.0304,
"grad_norm": 1.475943922996521,
"learning_rate": 6.08e-06,
"loss": 1.6105,
"step": 76
},
{
"epoch": 0.0312,
"grad_norm": 1.4888969659805298,
"learning_rate": 6.24e-06,
"loss": 1.2915,
"step": 78
},
{
"epoch": 0.032,
"grad_norm": 1.1156069040298462,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.6639,
"step": 80
},
{
"epoch": 0.0328,
"grad_norm": 0.5972667932510376,
"learning_rate": 6.560000000000001e-06,
"loss": 0.4398,
"step": 82
},
{
"epoch": 0.0336,
"grad_norm": 1.5191650390625,
"learning_rate": 6.720000000000001e-06,
"loss": 1.0727,
"step": 84
},
{
"epoch": 0.0344,
"grad_norm": 4.311778545379639,
"learning_rate": 6.88e-06,
"loss": 1.6288,
"step": 86
},
{
"epoch": 0.0352,
"grad_norm": 2.106018543243408,
"learning_rate": 7.04e-06,
"loss": 0.5208,
"step": 88
},
{
"epoch": 0.036,
"grad_norm": 2.2589375972747803,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.4426,
"step": 90
},
{
"epoch": 0.0368,
"grad_norm": 8.470863342285156,
"learning_rate": 7.360000000000001e-06,
"loss": 2.5704,
"step": 92
},
{
"epoch": 0.0376,
"grad_norm": 3.564351797103882,
"learning_rate": 7.520000000000001e-06,
"loss": 0.9683,
"step": 94
},
{
"epoch": 0.0384,
"grad_norm": 3.827845573425293,
"learning_rate": 7.680000000000001e-06,
"loss": 0.5685,
"step": 96
},
{
"epoch": 0.0392,
"grad_norm": 3.611189842224121,
"learning_rate": 7.840000000000001e-06,
"loss": 1.1535,
"step": 98
},
{
"epoch": 0.04,
"grad_norm": 1.9307868480682373,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6147,
"step": 100
},
{
"epoch": 0.0408,
"grad_norm": 3.3078556060791016,
"learning_rate": 8.16e-06,
"loss": 0.9248,
"step": 102
},
{
"epoch": 0.0416,
"grad_norm": 1.50742506980896,
"learning_rate": 8.32e-06,
"loss": 0.5147,
"step": 104
},
{
"epoch": 0.0424,
"grad_norm": 2.5260138511657715,
"learning_rate": 8.48e-06,
"loss": 0.5129,
"step": 106
},
{
"epoch": 0.0432,
"grad_norm": 0.7977801561355591,
"learning_rate": 8.64e-06,
"loss": 0.5833,
"step": 108
},
{
"epoch": 0.044,
"grad_norm": 3.389085054397583,
"learning_rate": 8.8e-06,
"loss": 0.4162,
"step": 110
},
{
"epoch": 0.0448,
"grad_norm": 3.584988832473755,
"learning_rate": 8.96e-06,
"loss": 1.2424,
"step": 112
},
{
"epoch": 0.0456,
"grad_norm": 0.4079441428184509,
"learning_rate": 9.12e-06,
"loss": 0.6532,
"step": 114
},
{
"epoch": 0.0464,
"grad_norm": 2.021636962890625,
"learning_rate": 9.280000000000001e-06,
"loss": 0.2637,
"step": 116
},
{
"epoch": 0.0472,
"grad_norm": 4.71144962310791,
"learning_rate": 9.440000000000001e-06,
"loss": 1.1383,
"step": 118
},
{
"epoch": 0.048,
"grad_norm": 1.831689715385437,
"learning_rate": 9.600000000000001e-06,
"loss": 0.7587,
"step": 120
},
{
"epoch": 0.0488,
"grad_norm": 2.6891868114471436,
"learning_rate": 9.760000000000001e-06,
"loss": 0.9849,
"step": 122
},
{
"epoch": 0.0496,
"grad_norm": 2.2791528701782227,
"learning_rate": 9.920000000000002e-06,
"loss": 0.74,
"step": 124
},
{
"epoch": 0.0504,
"grad_norm": 2.12807559967041,
"learning_rate": 1.008e-05,
"loss": 0.3742,
"step": 126
},
{
"epoch": 0.0512,
"grad_norm": 1.7115131616592407,
"learning_rate": 1.024e-05,
"loss": 0.8209,
"step": 128
},
{
"epoch": 0.052,
"grad_norm": 1.7355319261550903,
"learning_rate": 1.04e-05,
"loss": 1.1005,
"step": 130
},
{
"epoch": 0.0528,
"grad_norm": 1.7057139873504639,
"learning_rate": 1.056e-05,
"loss": 0.9056,
"step": 132
},
{
"epoch": 0.0536,
"grad_norm": 0.7381348609924316,
"learning_rate": 1.072e-05,
"loss": 0.4982,
"step": 134
},
{
"epoch": 0.0544,
"grad_norm": 3.6813762187957764,
"learning_rate": 1.0880000000000001e-05,
"loss": 0.8425,
"step": 136
},
{
"epoch": 0.0552,
"grad_norm": 0.6442511677742004,
"learning_rate": 1.1040000000000001e-05,
"loss": 0.4714,
"step": 138
},
{
"epoch": 0.056,
"grad_norm": 0.3334783613681793,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.1425,
"step": 140
},
{
"epoch": 0.0568,
"grad_norm": 2.7860682010650635,
"learning_rate": 1.136e-05,
"loss": 1.0082,
"step": 142
},
{
"epoch": 0.0576,
"grad_norm": 1.0623384714126587,
"learning_rate": 1.152e-05,
"loss": 0.1404,
"step": 144
},
{
"epoch": 0.0584,
"grad_norm": 0.49509289860725403,
"learning_rate": 1.168e-05,
"loss": 0.3523,
"step": 146
},
{
"epoch": 0.0592,
"grad_norm": 0.9187251925468445,
"learning_rate": 1.184e-05,
"loss": 0.7012,
"step": 148
},
{
"epoch": 0.06,
"grad_norm": 2.251713514328003,
"learning_rate": 1.2e-05,
"loss": 0.3998,
"step": 150
},
{
"epoch": 0.0608,
"grad_norm": 3.6301510334014893,
"learning_rate": 1.216e-05,
"loss": 0.991,
"step": 152
},
{
"epoch": 0.0616,
"grad_norm": 2.2427806854248047,
"learning_rate": 1.232e-05,
"loss": 1.6949,
"step": 154
},
{
"epoch": 0.0624,
"grad_norm": 1.5521266460418701,
"learning_rate": 1.248e-05,
"loss": 0.9281,
"step": 156
},
{
"epoch": 0.0632,
"grad_norm": 0.9885201454162598,
"learning_rate": 1.2640000000000001e-05,
"loss": 0.3829,
"step": 158
},
{
"epoch": 0.064,
"grad_norm": 1.02154541015625,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.1986,
"step": 160
},
{
"epoch": 0.0648,
"grad_norm": 2.9700698852539062,
"learning_rate": 1.2960000000000001e-05,
"loss": 0.5764,
"step": 162
},
{
"epoch": 0.0656,
"grad_norm": 1.9716620445251465,
"learning_rate": 1.3120000000000001e-05,
"loss": 0.4319,
"step": 164
},
{
"epoch": 0.0664,
"grad_norm": 0.23318687081336975,
"learning_rate": 1.3280000000000002e-05,
"loss": 0.7806,
"step": 166
},
{
"epoch": 0.0672,
"grad_norm": 1.2755334377288818,
"learning_rate": 1.3440000000000002e-05,
"loss": 0.2845,
"step": 168
},
{
"epoch": 0.068,
"grad_norm": 0.26743704080581665,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.1663,
"step": 170
},
{
"epoch": 0.0688,
"grad_norm": 1.2172751426696777,
"learning_rate": 1.376e-05,
"loss": 0.3756,
"step": 172
},
{
"epoch": 0.0696,
"grad_norm": 3.7812864780426025,
"learning_rate": 1.392e-05,
"loss": 0.4978,
"step": 174
},
{
"epoch": 0.0704,
"grad_norm": 3.4768078327178955,
"learning_rate": 1.408e-05,
"loss": 0.9643,
"step": 176
},
{
"epoch": 0.0712,
"grad_norm": 3.9793436527252197,
"learning_rate": 1.4240000000000001e-05,
"loss": 0.158,
"step": 178
},
{
"epoch": 0.072,
"grad_norm": 2.2694666385650635,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.881,
"step": 180
},
{
"epoch": 0.0728,
"grad_norm": 0.1139988824725151,
"learning_rate": 1.4560000000000001e-05,
"loss": 0.017,
"step": 182
},
{
"epoch": 0.0736,
"grad_norm": 2.4055392742156982,
"learning_rate": 1.4720000000000001e-05,
"loss": 0.1059,
"step": 184
},
{
"epoch": 0.0744,
"grad_norm": 1.4160810708999634,
"learning_rate": 1.4880000000000002e-05,
"loss": 0.1963,
"step": 186
},
{
"epoch": 0.0752,
"grad_norm": 1.9410396814346313,
"learning_rate": 1.5040000000000002e-05,
"loss": 0.1013,
"step": 188
},
{
"epoch": 0.076,
"grad_norm": 1.283490777015686,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.2272,
"step": 190
},
{
"epoch": 0.0768,
"grad_norm": 1.230212688446045,
"learning_rate": 1.5360000000000002e-05,
"loss": 0.7391,
"step": 192
},
{
"epoch": 0.0776,
"grad_norm": 9.068771362304688,
"learning_rate": 1.552e-05,
"loss": 0.5673,
"step": 194
},
{
"epoch": 0.0784,
"grad_norm": 3.8310606479644775,
"learning_rate": 1.5680000000000002e-05,
"loss": 1.1112,
"step": 196
},
{
"epoch": 0.0792,
"grad_norm": 3.0830278396606445,
"learning_rate": 1.584e-05,
"loss": 0.711,
"step": 198
},
{
"epoch": 0.08,
"grad_norm": 8.513973236083984,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.9858,
"step": 200
},
{
"epoch": 0.0808,
"grad_norm": 0.3588366210460663,
"learning_rate": 1.616e-05,
"loss": 0.149,
"step": 202
},
{
"epoch": 0.0816,
"grad_norm": 1.0215955972671509,
"learning_rate": 1.632e-05,
"loss": 0.7344,
"step": 204
},
{
"epoch": 0.0824,
"grad_norm": 1.8482775688171387,
"learning_rate": 1.648e-05,
"loss": 1.0866,
"step": 206
},
{
"epoch": 0.0832,
"grad_norm": 0.7888800501823425,
"learning_rate": 1.664e-05,
"loss": 0.5496,
"step": 208
},
{
"epoch": 0.084,
"grad_norm": 0.33580031991004944,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.2747,
"step": 210
},
{
"epoch": 0.0848,
"grad_norm": 1.4474271535873413,
"learning_rate": 1.696e-05,
"loss": 0.3155,
"step": 212
},
{
"epoch": 0.0856,
"grad_norm": 2.6667094230651855,
"learning_rate": 1.7120000000000002e-05,
"loss": 1.1201,
"step": 214
},
{
"epoch": 0.0864,
"grad_norm": 1.0721205472946167,
"learning_rate": 1.728e-05,
"loss": 0.3277,
"step": 216
},
{
"epoch": 0.0872,
"grad_norm": 0.2696588635444641,
"learning_rate": 1.7440000000000002e-05,
"loss": 0.5612,
"step": 218
},
{
"epoch": 0.088,
"grad_norm": 2.505485773086548,
"learning_rate": 1.76e-05,
"loss": 0.2816,
"step": 220
},
{
"epoch": 0.0888,
"grad_norm": 0.6796432137489319,
"learning_rate": 1.7760000000000003e-05,
"loss": 0.091,
"step": 222
},
{
"epoch": 0.0896,
"grad_norm": 0.23082202672958374,
"learning_rate": 1.792e-05,
"loss": 0.7718,
"step": 224
},
{
"epoch": 0.0904,
"grad_norm": 1.6087664365768433,
"learning_rate": 1.8080000000000003e-05,
"loss": 0.8094,
"step": 226
},
{
"epoch": 0.0912,
"grad_norm": 1.131905198097229,
"learning_rate": 1.824e-05,
"loss": 0.1958,
"step": 228
},
{
"epoch": 0.092,
"grad_norm": 1.3417667150497437,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.0899,
"step": 230
},
{
"epoch": 0.0928,
"grad_norm": 1.7206881046295166,
"learning_rate": 1.8560000000000002e-05,
"loss": 0.1773,
"step": 232
},
{
"epoch": 0.0936,
"grad_norm": 1.3308982849121094,
"learning_rate": 1.8720000000000004e-05,
"loss": 0.6628,
"step": 234
},
{
"epoch": 0.0944,
"grad_norm": 0.06212015450000763,
"learning_rate": 1.8880000000000002e-05,
"loss": 0.3871,
"step": 236
},
{
"epoch": 0.0952,
"grad_norm": 2.4711997509002686,
"learning_rate": 1.904e-05,
"loss": 0.2319,
"step": 238
},
{
"epoch": 0.096,
"grad_norm": 3.6728742122650146,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.5643,
"step": 240
},
{
"epoch": 0.0968,
"grad_norm": 0.1921927034854889,
"learning_rate": 1.936e-05,
"loss": 0.0931,
"step": 242
},
{
"epoch": 0.0976,
"grad_norm": 1.9750112295150757,
"learning_rate": 1.9520000000000003e-05,
"loss": 0.1632,
"step": 244
},
{
"epoch": 0.0984,
"grad_norm": 2.1710243225097656,
"learning_rate": 1.968e-05,
"loss": 2.1496,
"step": 246
},
{
"epoch": 0.0992,
"grad_norm": 0.6705166697502136,
"learning_rate": 1.9840000000000003e-05,
"loss": 0.2102,
"step": 248
},
{
"epoch": 0.1,
"grad_norm": 0.9208804368972778,
"learning_rate": 2e-05,
"loss": 0.363,
"step": 250
},
{
"epoch": 0.1008,
"grad_norm": 0.25396543741226196,
"learning_rate": 1.9999961008995607e-05,
"loss": 0.9275,
"step": 252
},
{
"epoch": 0.1016,
"grad_norm": 3.0983469486236572,
"learning_rate": 1.9999844036286483e-05,
"loss": 1.6902,
"step": 254
},
{
"epoch": 0.1024,
"grad_norm": 0.8619891405105591,
"learning_rate": 1.9999649082784807e-05,
"loss": 0.9756,
"step": 256
},
{
"epoch": 0.1032,
"grad_norm": 1.3311007022857666,
"learning_rate": 1.9999376150010868e-05,
"loss": 1.0115,
"step": 258
},
{
"epoch": 0.104,
"grad_norm": 1.6311815977096558,
"learning_rate": 1.9999025240093045e-05,
"loss": 1.3444,
"step": 260
},
{
"epoch": 0.1048,
"grad_norm": 1.097953200340271,
"learning_rate": 1.9998596355767805e-05,
"loss": 0.5742,
"step": 262
},
{
"epoch": 0.1056,
"grad_norm": 0.12066880613565445,
"learning_rate": 1.999808950037968e-05,
"loss": 0.1853,
"step": 264
},
{
"epoch": 0.1064,
"grad_norm": 1.4088952541351318,
"learning_rate": 1.9997504677881224e-05,
"loss": 0.585,
"step": 266
},
{
"epoch": 0.1072,
"grad_norm": 0.200229674577713,
"learning_rate": 1.9996841892833e-05,
"loss": 0.0933,
"step": 268
},
{
"epoch": 0.108,
"grad_norm": 1.871922254562378,
"learning_rate": 1.9996101150403543e-05,
"loss": 1.4538,
"step": 270
},
{
"epoch": 0.1088,
"grad_norm": 0.5457477569580078,
"learning_rate": 1.9995282456369313e-05,
"loss": 0.3239,
"step": 272
},
{
"epoch": 0.1096,
"grad_norm": 0.6347384452819824,
"learning_rate": 1.9994385817114644e-05,
"loss": 0.8208,
"step": 274
},
{
"epoch": 0.1104,
"grad_norm": 0.8534179925918579,
"learning_rate": 1.9993411239631713e-05,
"loss": 1.9646,
"step": 276
},
{
"epoch": 0.1112,
"grad_norm": 0.3291590213775635,
"learning_rate": 1.999235873152047e-05,
"loss": 0.0857,
"step": 278
},
{
"epoch": 0.112,
"grad_norm": 0.5380199551582336,
"learning_rate": 1.9991228300988586e-05,
"loss": 0.1706,
"step": 280
},
{
"epoch": 0.1128,
"grad_norm": 1.0301982164382935,
"learning_rate": 1.9990019956851384e-05,
"loss": 0.7458,
"step": 282
},
{
"epoch": 0.1136,
"grad_norm": 0.8207939863204956,
"learning_rate": 1.9988733708531772e-05,
"loss": 0.8598,
"step": 284
},
{
"epoch": 0.1144,
"grad_norm": 0.11704489588737488,
"learning_rate": 1.998736956606018e-05,
"loss": 0.0593,
"step": 286
},
{
"epoch": 0.1152,
"grad_norm": 0.8635666370391846,
"learning_rate": 1.9985927540074453e-05,
"loss": 0.4055,
"step": 288
},
{
"epoch": 0.116,
"grad_norm": 0.9759752154350281,
"learning_rate": 1.9984407641819812e-05,
"loss": 0.4182,
"step": 290
},
{
"epoch": 0.1168,
"grad_norm": 1.0212275981903076,
"learning_rate": 1.998280988314872e-05,
"loss": 0.3533,
"step": 292
},
{
"epoch": 0.1176,
"grad_norm": 0.36377808451652527,
"learning_rate": 1.9981134276520828e-05,
"loss": 0.1175,
"step": 294
},
{
"epoch": 0.1184,
"grad_norm": 1.5665260553359985,
"learning_rate": 1.9979380835002846e-05,
"loss": 0.2428,
"step": 296
},
{
"epoch": 0.1192,
"grad_norm": 1.0009765625,
"learning_rate": 1.997754957226847e-05,
"loss": 0.3959,
"step": 298
},
{
"epoch": 0.12,
"grad_norm": 0.43519535660743713,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.3812,
"step": 300
},
{
"epoch": 0.1208,
"grad_norm": 1.0461794137954712,
"learning_rate": 1.9973653640879486e-05,
"loss": 0.5793,
"step": 302
},
{
"epoch": 0.1216,
"grad_norm": 1.0484498739242554,
"learning_rate": 1.997158900260614e-05,
"loss": 0.3245,
"step": 304
},
{
"epoch": 0.1224,
"grad_norm": 0.18130800127983093,
"learning_rate": 1.9969446603878673e-05,
"loss": 0.0716,
"step": 306
},
{
"epoch": 0.1232,
"grad_norm": 0.14019177854061127,
"learning_rate": 1.9967226461403934e-05,
"loss": 0.0946,
"step": 308
},
{
"epoch": 0.124,
"grad_norm": 0.48715105652809143,
"learning_rate": 1.9964928592495046e-05,
"loss": 0.1536,
"step": 310
},
{
"epoch": 0.1248,
"grad_norm": 0.4148877263069153,
"learning_rate": 1.996255301507125e-05,
"loss": 0.1165,
"step": 312
},
{
"epoch": 0.1256,
"grad_norm": 0.0783982053399086,
"learning_rate": 1.9960099747657774e-05,
"loss": 0.0288,
"step": 314
},
{
"epoch": 0.1264,
"grad_norm": 4.088832378387451,
"learning_rate": 1.9957568809385693e-05,
"loss": 1.1324,
"step": 316
},
{
"epoch": 0.1272,
"grad_norm": 0.10389735549688339,
"learning_rate": 1.995496021999177e-05,
"loss": 0.0405,
"step": 318
},
{
"epoch": 0.128,
"grad_norm": 0.04948841407895088,
"learning_rate": 1.9952273999818312e-05,
"loss": 0.0372,
"step": 320
},
{
"epoch": 0.1288,
"grad_norm": 0.09340821206569672,
"learning_rate": 1.9949510169813006e-05,
"loss": 0.5878,
"step": 322
},
{
"epoch": 0.1296,
"grad_norm": 1.2787668704986572,
"learning_rate": 1.9946668751528745e-05,
"loss": 0.3489,
"step": 324
},
{
"epoch": 0.1304,
"grad_norm": 1.642128586769104,
"learning_rate": 1.994374976712348e-05,
"loss": 0.1129,
"step": 326
},
{
"epoch": 0.1312,
"grad_norm": 0.1789906769990921,
"learning_rate": 1.9940753239360047e-05,
"loss": 0.0982,
"step": 328
},
{
"epoch": 0.132,
"grad_norm": 1.3861182928085327,
"learning_rate": 1.9937679191605964e-05,
"loss": 0.2423,
"step": 330
},
{
"epoch": 0.1328,
"grad_norm": 0.9052829146385193,
"learning_rate": 1.9934527647833276e-05,
"loss": 0.1399,
"step": 332
},
{
"epoch": 0.1336,
"grad_norm": 1.6789326667785645,
"learning_rate": 1.9931298632618355e-05,
"loss": 0.2957,
"step": 334
},
{
"epoch": 0.1344,
"grad_norm": 1.9615066051483154,
"learning_rate": 1.9927992171141707e-05,
"loss": 1.2319,
"step": 336
},
{
"epoch": 0.1352,
"grad_norm": 0.2719954550266266,
"learning_rate": 1.9924608289187786e-05,
"loss": 0.3792,
"step": 338
},
{
"epoch": 0.136,
"grad_norm": 0.131216362118721,
"learning_rate": 1.9921147013144782e-05,
"loss": 0.0956,
"step": 340
},
{
"epoch": 0.1368,
"grad_norm": 0.2020615190267563,
"learning_rate": 1.9917608370004417e-05,
"loss": 0.0629,
"step": 342
},
{
"epoch": 0.1376,
"grad_norm": 0.49078741669654846,
"learning_rate": 1.9913992387361747e-05,
"loss": 0.096,
"step": 344
},
{
"epoch": 0.1384,
"grad_norm": 0.09738267213106155,
"learning_rate": 1.991029909341493e-05,
"loss": 0.3201,
"step": 346
},
{
"epoch": 0.1392,
"grad_norm": 3.8677468299865723,
"learning_rate": 1.990652851696501e-05,
"loss": 0.1698,
"step": 348
},
{
"epoch": 0.14,
"grad_norm": 0.10318366438150406,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.2468,
"step": 350
},
{
"epoch": 0.1408,
"grad_norm": 1.7756882905960083,
"learning_rate": 1.989875563477316e-05,
"loss": 0.3846,
"step": 352
},
{
"epoch": 0.1416,
"grad_norm": 8.583524703979492,
"learning_rate": 1.9894753389645723e-05,
"loss": 0.565,
"step": 354
},
{
"epoch": 0.1424,
"grad_norm": 0.26608502864837646,
"learning_rate": 1.9890673983243708e-05,
"loss": 0.059,
"step": 356
},
{
"epoch": 0.1432,
"grad_norm": 1.6263763904571533,
"learning_rate": 1.988651744737914e-05,
"loss": 0.9423,
"step": 358
},
{
"epoch": 0.144,
"grad_norm": 1.3715531826019287,
"learning_rate": 1.988228381446553e-05,
"loss": 0.2875,
"step": 360
},
{
"epoch": 0.1448,
"grad_norm": 1.6036182641983032,
"learning_rate": 1.987797311751759e-05,
"loss": 0.6963,
"step": 362
},
{
"epoch": 0.1456,
"grad_norm": 0.44622594118118286,
"learning_rate": 1.9873585390151003e-05,
"loss": 0.1357,
"step": 364
},
{
"epoch": 0.1464,
"grad_norm": 0.3719693720340729,
"learning_rate": 1.9869120666582153e-05,
"loss": 0.0775,
"step": 366
},
{
"epoch": 0.1472,
"grad_norm": 1.655290961265564,
"learning_rate": 1.9864578981627844e-05,
"loss": 0.316,
"step": 368
},
{
"epoch": 0.148,
"grad_norm": 0.5241470336914062,
"learning_rate": 1.985996037070505e-05,
"loss": 0.1641,
"step": 370
},
{
"epoch": 0.1488,
"grad_norm": 0.0686238631606102,
"learning_rate": 1.985526486983063e-05,
"loss": 0.3246,
"step": 372
},
{
"epoch": 0.1496,
"grad_norm": 0.04877380654215813,
"learning_rate": 1.9850492515621038e-05,
"loss": 0.0454,
"step": 374
},
{
"epoch": 0.1504,
"grad_norm": 0.033756960183382034,
"learning_rate": 1.9845643345292055e-05,
"loss": 0.0306,
"step": 376
},
{
"epoch": 0.1512,
"grad_norm": 0.050034862011671066,
"learning_rate": 1.9840717396658483e-05,
"loss": 0.0198,
"step": 378
},
{
"epoch": 0.152,
"grad_norm": 1.1677488088607788,
"learning_rate": 1.983571470813386e-05,
"loss": 0.9779,
"step": 380
},
{
"epoch": 0.1528,
"grad_norm": 2.3200299739837646,
"learning_rate": 1.9830635318730155e-05,
"loss": 0.2013,
"step": 382
},
{
"epoch": 0.1536,
"grad_norm": 0.37079155445098877,
"learning_rate": 1.982547926805747e-05,
"loss": 0.0332,
"step": 384
},
{
"epoch": 0.1544,
"grad_norm": 0.03381960466504097,
"learning_rate": 1.982024659632372e-05,
"loss": 0.0156,
"step": 386
},
{
"epoch": 0.1552,
"grad_norm": 1.5234084129333496,
"learning_rate": 1.981493734433433e-05,
"loss": 0.0724,
"step": 388
},
{
"epoch": 0.156,
"grad_norm": 1.1328762769699097,
"learning_rate": 1.9809551553491918e-05,
"loss": 0.955,
"step": 390
},
{
"epoch": 0.1568,
"grad_norm": 4.6101975440979,
"learning_rate": 1.980408926579596e-05,
"loss": 1.726,
"step": 392
},
{
"epoch": 0.1576,
"grad_norm": 0.9627692103385925,
"learning_rate": 1.979855052384247e-05,
"loss": 0.4459,
"step": 394
},
{
"epoch": 0.1584,
"grad_norm": 0.19565686583518982,
"learning_rate": 1.9792935370823676e-05,
"loss": 0.1905,
"step": 396
},
{
"epoch": 0.1592,
"grad_norm": 1.6848326921463013,
"learning_rate": 1.9787243850527663e-05,
"loss": 0.7807,
"step": 398
},
{
"epoch": 0.16,
"grad_norm": 1.4652438163757324,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.4939,
"step": 400
},
{
"epoch": 0.1608,
"grad_norm": 0.5617023706436157,
"learning_rate": 1.9775631886233655e-05,
"loss": 0.5734,
"step": 402
},
{
"epoch": 0.1616,
"grad_norm": 0.37801074981689453,
"learning_rate": 1.9769711532788083e-05,
"loss": 0.2663,
"step": 404
},
{
"epoch": 0.1624,
"grad_norm": 2.0784738063812256,
"learning_rate": 1.976371499316945e-05,
"loss": 0.7156,
"step": 406
},
{
"epoch": 0.1632,
"grad_norm": 0.24609781801700592,
"learning_rate": 1.9757642314139977e-05,
"loss": 0.1335,
"step": 408
},
{
"epoch": 0.164,
"grad_norm": 0.18215598165988922,
"learning_rate": 1.9751493543055634e-05,
"loss": 0.1607,
"step": 410
},
{
"epoch": 0.1648,
"grad_norm": 0.1584533452987671,
"learning_rate": 1.9745268727865774e-05,
"loss": 0.1947,
"step": 412
},
{
"epoch": 0.1656,
"grad_norm": 0.1543274223804474,
"learning_rate": 1.9738967917112752e-05,
"loss": 0.0954,
"step": 414
},
{
"epoch": 0.1664,
"grad_norm": 0.09421176463365555,
"learning_rate": 1.9732591159931564e-05,
"loss": 0.058,
"step": 416
},
{
"epoch": 0.1672,
"grad_norm": 0.037417419254779816,
"learning_rate": 1.9726138506049438e-05,
"loss": 0.0586,
"step": 418
},
{
"epoch": 0.168,
"grad_norm": 1.3091963529586792,
"learning_rate": 1.9719610005785466e-05,
"loss": 0.5727,
"step": 420
},
{
"epoch": 0.1688,
"grad_norm": 0.04180837422609329,
"learning_rate": 1.9713005710050203e-05,
"loss": 0.0303,
"step": 422
},
{
"epoch": 0.1696,
"grad_norm": 0.11628637462854385,
"learning_rate": 1.9706325670345276e-05,
"loss": 0.0236,
"step": 424
},
{
"epoch": 0.1704,
"grad_norm": 0.028453992679715157,
"learning_rate": 1.9699569938762975e-05,
"loss": 0.015,
"step": 426
},
{
"epoch": 0.1712,
"grad_norm": 1.2765767574310303,
"learning_rate": 1.9692738567985853e-05,
"loss": 0.6437,
"step": 428
},
{
"epoch": 0.172,
"grad_norm": 0.6149804592132568,
"learning_rate": 1.9685831611286312e-05,
"loss": 0.0557,
"step": 430
},
{
"epoch": 0.1728,
"grad_norm": 0.0872359424829483,
"learning_rate": 1.967884912252619e-05,
"loss": 0.1832,
"step": 432
},
{
"epoch": 0.1736,
"grad_norm": 0.031350962817668915,
"learning_rate": 1.967179115615633e-05,
"loss": 0.0422,
"step": 434
},
{
"epoch": 0.1744,
"grad_norm": 0.1513989269733429,
"learning_rate": 1.9664657767216176e-05,
"loss": 0.033,
"step": 436
},
{
"epoch": 0.1752,
"grad_norm": 0.05188186839222908,
"learning_rate": 1.9657449011333328e-05,
"loss": 0.0161,
"step": 438
},
{
"epoch": 0.176,
"grad_norm": 0.20884352922439575,
"learning_rate": 1.9650164944723116e-05,
"loss": 0.3403,
"step": 440
},
{
"epoch": 0.1768,
"grad_norm": 1.4495282173156738,
"learning_rate": 1.964280562418815e-05,
"loss": 0.1676,
"step": 442
},
{
"epoch": 0.1776,
"grad_norm": 0.01325430627912283,
"learning_rate": 1.963537110711789e-05,
"loss": 0.0142,
"step": 444
},
{
"epoch": 0.1784,
"grad_norm": 0.03731897845864296,
"learning_rate": 1.962786145148819e-05,
"loss": 0.013,
"step": 446
},
{
"epoch": 0.1792,
"grad_norm": 0.06609856337308884,
"learning_rate": 1.962027671586086e-05,
"loss": 0.0124,
"step": 448
},
{
"epoch": 0.18,
"grad_norm": 0.02560417540371418,
"learning_rate": 1.961261695938319e-05,
"loss": 0.0291,
"step": 450
},
{
"epoch": 0.1808,
"grad_norm": 0.016771750524640083,
"learning_rate": 1.96048822417875e-05,
"loss": 0.0089,
"step": 452
},
{
"epoch": 0.1816,
"grad_norm": 7.128602504730225,
"learning_rate": 1.9597072623390668e-05,
"loss": 1.0882,
"step": 454
},
{
"epoch": 0.1824,
"grad_norm": 0.03642188385128975,
"learning_rate": 1.958918816509367e-05,
"loss": 0.0113,
"step": 456
},
{
"epoch": 0.1832,
"grad_norm": 0.09811338037252426,
"learning_rate": 1.95812289283811e-05,
"loss": 0.016,
"step": 458
},
{
"epoch": 0.184,
"grad_norm": 0.04652848094701767,
"learning_rate": 1.9573194975320672e-05,
"loss": 0.1104,
"step": 460
},
{
"epoch": 0.1848,
"grad_norm": 1.6137455701828003,
"learning_rate": 1.956508636856278e-05,
"loss": 0.872,
"step": 462
},
{
"epoch": 0.1856,
"grad_norm": 1.2847460508346558,
"learning_rate": 1.9556903171339963e-05,
"loss": 0.9774,
"step": 464
},
{
"epoch": 0.1864,
"grad_norm": 0.3364383280277252,
"learning_rate": 1.9548645447466433e-05,
"loss": 0.0612,
"step": 466
},
{
"epoch": 0.1872,
"grad_norm": 1.6685212850570679,
"learning_rate": 1.954031326133758e-05,
"loss": 0.4306,
"step": 468
},
{
"epoch": 0.188,
"grad_norm": 0.12480071187019348,
"learning_rate": 1.9531906677929472e-05,
"loss": 0.1104,
"step": 470
},
{
"epoch": 0.1888,
"grad_norm": 0.5850446224212646,
"learning_rate": 1.9523425762798328e-05,
"loss": 0.1902,
"step": 472
},
{
"epoch": 0.1896,
"grad_norm": 0.14481854438781738,
"learning_rate": 1.951487058208003e-05,
"loss": 0.1135,
"step": 474
},
{
"epoch": 0.1904,
"grad_norm": 1.4755823612213135,
"learning_rate": 1.95062412024896e-05,
"loss": 0.9318,
"step": 476
},
{
"epoch": 0.1912,
"grad_norm": 0.31880441308021545,
"learning_rate": 1.949753769132067e-05,
"loss": 0.0736,
"step": 478
},
{
"epoch": 0.192,
"grad_norm": 0.3856407403945923,
"learning_rate": 1.9488760116444966e-05,
"loss": 0.4857,
"step": 480
},
{
"epoch": 0.1928,
"grad_norm": 1.6081260442733765,
"learning_rate": 1.9479908546311783e-05,
"loss": 0.2605,
"step": 482
},
{
"epoch": 0.1936,
"grad_norm": 1.051400065422058,
"learning_rate": 1.9470983049947446e-05,
"loss": 0.1468,
"step": 484
},
{
"epoch": 0.1944,
"grad_norm": 0.19811314344406128,
"learning_rate": 1.946198369695476e-05,
"loss": 0.5758,
"step": 486
},
{
"epoch": 0.1952,
"grad_norm": 0.7101706862449646,
"learning_rate": 1.9452910557512497e-05,
"loss": 0.765,
"step": 488
},
{
"epoch": 0.196,
"grad_norm": 1.4943866729736328,
"learning_rate": 1.944376370237481e-05,
"loss": 0.4381,
"step": 490
},
{
"epoch": 0.1968,
"grad_norm": 0.22989702224731445,
"learning_rate": 1.9434543202870726e-05,
"loss": 0.0615,
"step": 492
},
{
"epoch": 0.1976,
"grad_norm": 1.0281144380569458,
"learning_rate": 1.9425249130903544e-05,
"loss": 0.2893,
"step": 494
},
{
"epoch": 0.1984,
"grad_norm": 0.912899911403656,
"learning_rate": 1.9415881558950302e-05,
"loss": 0.1344,
"step": 496
},
{
"epoch": 0.1992,
"grad_norm": 0.4319363534450531,
"learning_rate": 1.9406440560061214e-05,
"loss": 0.162,
"step": 498
},
{
"epoch": 0.2,
"grad_norm": 1.5313297510147095,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.9079,
"step": 500
},
{
"epoch": 0.2008,
"grad_norm": 0.1665397733449936,
"learning_rate": 1.9387338576538743e-05,
"loss": 0.0523,
"step": 502
},
{
"epoch": 0.2016,
"grad_norm": 0.703409731388092,
"learning_rate": 1.937767774086646e-05,
"loss": 0.4771,
"step": 504
},
{
"epoch": 0.2024,
"grad_norm": 0.13862833380699158,
"learning_rate": 1.936794377617938e-05,
"loss": 0.0484,
"step": 506
},
{
"epoch": 0.2032,
"grad_norm": 0.12500865757465363,
"learning_rate": 1.935813675838491e-05,
"loss": 0.033,
"step": 508
},
{
"epoch": 0.204,
"grad_norm": 0.12923550605773926,
"learning_rate": 1.9348256763960146e-05,
"loss": 0.0503,
"step": 510
},
{
"epoch": 0.2048,
"grad_norm": 0.12034843116998672,
"learning_rate": 1.933830386995127e-05,
"loss": 0.2762,
"step": 512
},
{
"epoch": 0.2056,
"grad_norm": 1.1866302490234375,
"learning_rate": 1.9328278153972947e-05,
"loss": 0.0998,
"step": 514
},
{
"epoch": 0.2064,
"grad_norm": 0.048314038664102554,
"learning_rate": 1.9318179694207726e-05,
"loss": 0.0296,
"step": 516
},
{
"epoch": 0.2072,
"grad_norm": 0.2854752242565155,
"learning_rate": 1.9308008569405424e-05,
"loss": 0.0852,
"step": 518
},
{
"epoch": 0.208,
"grad_norm": 0.16209827363491058,
"learning_rate": 1.9297764858882516e-05,
"loss": 0.8268,
"step": 520
},
{
"epoch": 0.2088,
"grad_norm": 0.01691954769194126,
"learning_rate": 1.9287448642521513e-05,
"loss": 0.0093,
"step": 522
},
{
"epoch": 0.2096,
"grad_norm": 0.09849057346582413,
"learning_rate": 1.9277060000770342e-05,
"loss": 0.0657,
"step": 524
},
{
"epoch": 0.2104,
"grad_norm": 0.05499891936779022,
"learning_rate": 1.9266599014641724e-05,
"loss": 0.0259,
"step": 526
},
{
"epoch": 0.2112,
"grad_norm": 0.5297953486442566,
"learning_rate": 1.9256065765712524e-05,
"loss": 0.0539,
"step": 528
},
{
"epoch": 0.212,
"grad_norm": 0.053190361708402634,
"learning_rate": 1.9245460336123136e-05,
"loss": 0.035,
"step": 530
},
{
"epoch": 0.2128,
"grad_norm": 0.7372400760650635,
"learning_rate": 1.9234782808576823e-05,
"loss": 0.186,
"step": 532
},
{
"epoch": 0.2136,
"grad_norm": 0.02464686520397663,
"learning_rate": 1.9224033266339103e-05,
"loss": 0.0097,
"step": 534
},
{
"epoch": 0.2144,
"grad_norm": 3.046816825866699,
"learning_rate": 1.9213211793237056e-05,
"loss": 0.1614,
"step": 536
},
{
"epoch": 0.2152,
"grad_norm": 2.2198116779327393,
"learning_rate": 1.9202318473658707e-05,
"loss": 0.6094,
"step": 538
},
{
"epoch": 0.216,
"grad_norm": 1.2823199033737183,
"learning_rate": 1.9191353392552346e-05,
"loss": 0.1261,
"step": 540
},
{
"epoch": 0.2168,
"grad_norm": 0.019597092643380165,
"learning_rate": 1.9180316635425883e-05,
"loss": 0.4845,
"step": 542
},
{
"epoch": 0.2176,
"grad_norm": 0.011679578572511673,
"learning_rate": 1.9169208288346168e-05,
"loss": 0.0162,
"step": 544
},
{
"epoch": 0.2184,
"grad_norm": 1.827360987663269,
"learning_rate": 1.9158028437938316e-05,
"loss": 0.8432,
"step": 546
},
{
"epoch": 0.2192,
"grad_norm": 1.7069369554519653,
"learning_rate": 1.914677717138505e-05,
"loss": 0.321,
"step": 548
},
{
"epoch": 0.22,
"grad_norm": 0.022664044052362442,
"learning_rate": 1.913545457642601e-05,
"loss": 0.2938,
"step": 550
},
{
"epoch": 0.2208,
"grad_norm": 0.5914615392684937,
"learning_rate": 1.9124060741357065e-05,
"loss": 0.0618,
"step": 552
},
{
"epoch": 0.2216,
"grad_norm": 0.0440104641020298,
"learning_rate": 1.9112595755029625e-05,
"loss": 0.0429,
"step": 554
},
{
"epoch": 0.2224,
"grad_norm": 0.263875812292099,
"learning_rate": 1.9101059706849957e-05,
"loss": 1.002,
"step": 556
},
{
"epoch": 0.2232,
"grad_norm": 1.789616584777832,
"learning_rate": 1.908945268677849e-05,
"loss": 0.3908,
"step": 558
},
{
"epoch": 0.224,
"grad_norm": 3.8418707847595215,
"learning_rate": 1.907777478532909e-05,
"loss": 0.4097,
"step": 560
},
{
"epoch": 0.2248,
"grad_norm": 0.5954731702804565,
"learning_rate": 1.906602609356838e-05,
"loss": 0.1258,
"step": 562
},
{
"epoch": 0.2256,
"grad_norm": 3.1581459045410156,
"learning_rate": 1.905420670311502e-05,
"loss": 0.3185,
"step": 564
},
{
"epoch": 0.2264,
"grad_norm": 0.37356865406036377,
"learning_rate": 1.9042316706138987e-05,
"loss": 0.0718,
"step": 566
},
{
"epoch": 0.2272,
"grad_norm": 0.6604268550872803,
"learning_rate": 1.9030356195360875e-05,
"loss": 0.1169,
"step": 568
},
{
"epoch": 0.228,
"grad_norm": 1.148862361907959,
"learning_rate": 1.901832526405114e-05,
"loss": 0.1075,
"step": 570
},
{
"epoch": 0.2288,
"grad_norm": 4.533411026000977,
"learning_rate": 1.9006224006029404e-05,
"loss": 0.8169,
"step": 572
},
{
"epoch": 0.2296,
"grad_norm": 0.1384696662425995,
"learning_rate": 1.899405251566371e-05,
"loss": 0.0574,
"step": 574
},
{
"epoch": 0.2304,
"grad_norm": 4.587916851043701,
"learning_rate": 1.8981810887869784e-05,
"loss": 0.3013,
"step": 576
},
{
"epoch": 0.2312,
"grad_norm": 0.07302756607532501,
"learning_rate": 1.8969499218110302e-05,
"loss": 0.0345,
"step": 578
},
{
"epoch": 0.232,
"grad_norm": 0.9006990194320679,
"learning_rate": 1.895711760239413e-05,
"loss": 0.4378,
"step": 580
},
{
"epoch": 0.2328,
"grad_norm": 0.07447630912065506,
"learning_rate": 1.89446661372756e-05,
"loss": 0.11,
"step": 582
},
{
"epoch": 0.2336,
"grad_norm": 0.10830947756767273,
"learning_rate": 1.893214491985374e-05,
"loss": 0.0505,
"step": 584
},
{
"epoch": 0.2344,
"grad_norm": 0.0925409272313118,
"learning_rate": 1.8919554047771508e-05,
"loss": 0.0449,
"step": 586
},
{
"epoch": 0.2352,
"grad_norm": 2.633633613586426,
"learning_rate": 1.890689361921507e-05,
"loss": 0.7599,
"step": 588
},
{
"epoch": 0.236,
"grad_norm": 0.013573708944022655,
"learning_rate": 1.889416373291298e-05,
"loss": 0.2034,
"step": 590
},
{
"epoch": 0.2368,
"grad_norm": 0.03778607025742531,
"learning_rate": 1.8881364488135448e-05,
"loss": 0.5998,
"step": 592
},
{
"epoch": 0.2376,
"grad_norm": 0.4637905955314636,
"learning_rate": 1.886849598469356e-05,
"loss": 0.0662,
"step": 594
},
{
"epoch": 0.2384,
"grad_norm": 0.058186739683151245,
"learning_rate": 1.8855558322938492e-05,
"loss": 0.2836,
"step": 596
},
{
"epoch": 0.2392,
"grad_norm": 0.09846732765436172,
"learning_rate": 1.8842551603760725e-05,
"loss": 0.1087,
"step": 598
},
{
"epoch": 0.24,
"grad_norm": 0.21428197622299194,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.229,
"step": 600
},
{
"epoch": 0.2408,
"grad_norm": 0.3262503743171692,
"learning_rate": 1.881633139939087e-05,
"loss": 0.104,
"step": 602
},
{
"epoch": 0.2416,
"grad_norm": 0.9335662126541138,
"learning_rate": 1.8803118118669203e-05,
"loss": 0.5139,
"step": 604
},
{
"epoch": 0.2424,
"grad_norm": 0.027328329160809517,
"learning_rate": 1.878983618946409e-05,
"loss": 0.0222,
"step": 606
},
{
"epoch": 0.2432,
"grad_norm": 0.048500385135412216,
"learning_rate": 1.8776485715350672e-05,
"loss": 0.0422,
"step": 608
},
{
"epoch": 0.244,
"grad_norm": 0.3864876925945282,
"learning_rate": 1.8763066800438638e-05,
"loss": 0.1505,
"step": 610
},
{
"epoch": 0.2448,
"grad_norm": 1.1042886972427368,
"learning_rate": 1.874957954937138e-05,
"loss": 0.6224,
"step": 612
},
{
"epoch": 0.2456,
"grad_norm": 0.10346169769763947,
"learning_rate": 1.8736024067325188e-05,
"loss": 0.0375,
"step": 614
},
{
"epoch": 0.2464,
"grad_norm": 0.01699395291507244,
"learning_rate": 1.8722400460008437e-05,
"loss": 0.0352,
"step": 616
},
{
"epoch": 0.2472,
"grad_norm": 1.2863647937774658,
"learning_rate": 1.8708708833660755e-05,
"loss": 0.5494,
"step": 618
},
{
"epoch": 0.248,
"grad_norm": 2.1446785926818848,
"learning_rate": 1.869494929505219e-05,
"loss": 0.3612,
"step": 620
},
{
"epoch": 0.2488,
"grad_norm": 0.16956354677677155,
"learning_rate": 1.8681121951482397e-05,
"loss": 0.0355,
"step": 622
},
{
"epoch": 0.2496,
"grad_norm": 1.0704134702682495,
"learning_rate": 1.8667226910779767e-05,
"loss": 0.3591,
"step": 624
},
{
"epoch": 0.2504,
"grad_norm": 0.17657175660133362,
"learning_rate": 1.8653264281300622e-05,
"loss": 0.0436,
"step": 626
},
{
"epoch": 0.2512,
"grad_norm": 0.16666510701179504,
"learning_rate": 1.8639234171928355e-05,
"loss": 0.0334,
"step": 628
},
{
"epoch": 0.252,
"grad_norm": 0.14208954572677612,
"learning_rate": 1.8625136692072577e-05,
"loss": 0.2292,
"step": 630
},
{
"epoch": 0.2528,
"grad_norm": 1.2962373495101929,
"learning_rate": 1.8610971951668265e-05,
"loss": 0.2592,
"step": 632
},
{
"epoch": 0.2536,
"grad_norm": 0.09842484444379807,
"learning_rate": 1.8596740061174912e-05,
"loss": 0.0721,
"step": 634
},
{
"epoch": 0.2544,
"grad_norm": 0.07477198541164398,
"learning_rate": 1.8582441131575658e-05,
"loss": 0.0468,
"step": 636
},
{
"epoch": 0.2552,
"grad_norm": 1.2548961639404297,
"learning_rate": 1.856807527437643e-05,
"loss": 0.3335,
"step": 638
},
{
"epoch": 0.256,
"grad_norm": 0.20232835412025452,
"learning_rate": 1.855364260160507e-05,
"loss": 0.4149,
"step": 640
},
{
"epoch": 0.2568,
"grad_norm": 0.03189453110098839,
"learning_rate": 1.8539143225810453e-05,
"loss": 0.174,
"step": 642
},
{
"epoch": 0.2576,
"grad_norm": 0.026663975790143013,
"learning_rate": 1.8524577260061628e-05,
"loss": 0.0207,
"step": 644
},
{
"epoch": 0.2584,
"grad_norm": 0.03870954364538193,
"learning_rate": 1.850994481794692e-05,
"loss": 0.0197,
"step": 646
},
{
"epoch": 0.2592,
"grad_norm": 0.05131769925355911,
"learning_rate": 1.8495246013573057e-05,
"loss": 0.0187,
"step": 648
},
{
"epoch": 0.26,
"grad_norm": 0.278886079788208,
"learning_rate": 1.848048096156426e-05,
"loss": 0.1129,
"step": 650
},
{
"epoch": 0.2608,
"grad_norm": 0.030908726155757904,
"learning_rate": 1.8465649777061377e-05,
"loss": 0.0288,
"step": 652
},
{
"epoch": 0.2616,
"grad_norm": 1.7882804870605469,
"learning_rate": 1.8450752575720967e-05,
"loss": 0.3299,
"step": 654
},
{
"epoch": 0.2624,
"grad_norm": 0.9408004879951477,
"learning_rate": 1.843578947371439e-05,
"loss": 0.1842,
"step": 656
},
{
"epoch": 0.2632,
"grad_norm": 0.03317919746041298,
"learning_rate": 1.8420760587726925e-05,
"loss": 0.0249,
"step": 658
},
{
"epoch": 0.264,
"grad_norm": 0.0062853083945810795,
"learning_rate": 1.8405666034956842e-05,
"loss": 0.0765,
"step": 660
},
{
"epoch": 0.2648,
"grad_norm": 1.696451187133789,
"learning_rate": 1.8390505933114503e-05,
"loss": 0.4928,
"step": 662
},
{
"epoch": 0.2656,
"grad_norm": 0.11740172654390335,
"learning_rate": 1.837528040042142e-05,
"loss": 0.0175,
"step": 664
},
{
"epoch": 0.2664,
"grad_norm": 0.437165230512619,
"learning_rate": 1.8359989555609355e-05,
"loss": 0.0775,
"step": 666
},
{
"epoch": 0.2672,
"grad_norm": 0.01961176097393036,
"learning_rate": 1.834463351791939e-05,
"loss": 0.012,
"step": 668
},
{
"epoch": 0.268,
"grad_norm": 2.6758432388305664,
"learning_rate": 1.8329212407100996e-05,
"loss": 0.1287,
"step": 670
},
{
"epoch": 0.2688,
"grad_norm": 0.007790145929902792,
"learning_rate": 1.8313726343411085e-05,
"loss": 0.0071,
"step": 672
},
{
"epoch": 0.2696,
"grad_norm": 0.01610707677900791,
"learning_rate": 1.82981754476131e-05,
"loss": 0.004,
"step": 674
},
{
"epoch": 0.2704,
"grad_norm": 0.10419867932796478,
"learning_rate": 1.8282559840976043e-05,
"loss": 0.022,
"step": 676
},
{
"epoch": 0.2712,
"grad_norm": 0.020517872646450996,
"learning_rate": 1.8266879645273557e-05,
"loss": 0.0028,
"step": 678
},
{
"epoch": 0.272,
"grad_norm": 0.011518670246005058,
"learning_rate": 1.8251134982782952e-05,
"loss": 0.0121,
"step": 680
},
{
"epoch": 0.2728,
"grad_norm": 3.1481823921203613,
"learning_rate": 1.8235325976284276e-05,
"loss": 0.9131,
"step": 682
},
{
"epoch": 0.2736,
"grad_norm": 0.021789977326989174,
"learning_rate": 1.8219452749059332e-05,
"loss": 0.0169,
"step": 684
},
{
"epoch": 0.2744,
"grad_norm": 2.266319990158081,
"learning_rate": 1.8203515424890738e-05,
"loss": 0.4778,
"step": 686
},
{
"epoch": 0.2752,
"grad_norm": 2.55071759223938,
"learning_rate": 1.8187514128060946e-05,
"loss": 0.4595,
"step": 688
},
{
"epoch": 0.276,
"grad_norm": 1.0234826803207397,
"learning_rate": 1.8171448983351284e-05,
"loss": 0.6759,
"step": 690
},
{
"epoch": 0.2768,
"grad_norm": 0.5817314982414246,
"learning_rate": 1.8155320116040983e-05,
"loss": 0.2023,
"step": 692
},
{
"epoch": 0.2776,
"grad_norm": 0.07609284669160843,
"learning_rate": 1.8139127651906183e-05,
"loss": 0.2312,
"step": 694
},
{
"epoch": 0.2784,
"grad_norm": 0.1080310121178627,
"learning_rate": 1.812287171721897e-05,
"loss": 0.1537,
"step": 696
},
{
"epoch": 0.2792,
"grad_norm": 0.1361207813024521,
"learning_rate": 1.81065524387464e-05,
"loss": 0.0511,
"step": 698
},
{
"epoch": 0.28,
"grad_norm": 0.11058028042316437,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.0885,
"step": 700
},
{
"epoch": 0.2808,
"grad_norm": 0.6568836569786072,
"learning_rate": 1.8073724359982184e-05,
"loss": 0.3678,
"step": 702
},
{
"epoch": 0.2816,
"grad_norm": 0.250731885433197,
"learning_rate": 1.8057215815690494e-05,
"loss": 0.5069,
"step": 704
},
{
"epoch": 0.2824,
"grad_norm": 0.0678286999464035,
"learning_rate": 1.8040644439611348e-05,
"loss": 0.0418,
"step": 706
},
{
"epoch": 0.2832,
"grad_norm": 1.9648799896240234,
"learning_rate": 1.802401036097167e-05,
"loss": 0.4493,
"step": 708
},
{
"epoch": 0.284,
"grad_norm": 0.21381056308746338,
"learning_rate": 1.8007313709487334e-05,
"loss": 0.1236,
"step": 710
},
{
"epoch": 0.2848,
"grad_norm": 0.5156528949737549,
"learning_rate": 1.79905546153622e-05,
"loss": 0.1819,
"step": 712
},
{
"epoch": 0.2856,
"grad_norm": 0.09592005610466003,
"learning_rate": 1.7973733209287036e-05,
"loss": 0.1251,
"step": 714
},
{
"epoch": 0.2864,
"grad_norm": 0.42804062366485596,
"learning_rate": 1.7956849622438554e-05,
"loss": 0.1349,
"step": 716
},
{
"epoch": 0.2872,
"grad_norm": 0.13698697090148926,
"learning_rate": 1.7939903986478354e-05,
"loss": 0.0416,
"step": 718
},
{
"epoch": 0.288,
"grad_norm": 0.01778518594801426,
"learning_rate": 1.792289643355191e-05,
"loss": 0.0129,
"step": 720
},
{
"epoch": 0.2888,
"grad_norm": 0.2408394068479538,
"learning_rate": 1.7905827096287532e-05,
"loss": 0.0963,
"step": 722
},
{
"epoch": 0.2896,
"grad_norm": 0.025463353842496872,
"learning_rate": 1.7888696107795343e-05,
"loss": 0.6419,
"step": 724
},
{
"epoch": 0.2904,
"grad_norm": 1.5628716945648193,
"learning_rate": 1.7871503601666233e-05,
"loss": 0.4731,
"step": 726
},
{
"epoch": 0.2912,
"grad_norm": 0.07346764206886292,
"learning_rate": 1.785424971197082e-05,
"loss": 0.0206,
"step": 728
},
{
"epoch": 0.292,
"grad_norm": 5.0355963706970215,
"learning_rate": 1.78369345732584e-05,
"loss": 0.2513,
"step": 730
},
{
"epoch": 0.2928,
"grad_norm": 1.0326207876205444,
"learning_rate": 1.7819558320555902e-05,
"loss": 1.2558,
"step": 732
},
{
"epoch": 0.2936,
"grad_norm": 0.24173852801322937,
"learning_rate": 1.780212108936684e-05,
"loss": 0.0767,
"step": 734
},
{
"epoch": 0.2944,
"grad_norm": 0.20275752246379852,
"learning_rate": 1.7784623015670237e-05,
"loss": 0.059,
"step": 736
},
{
"epoch": 0.2952,
"grad_norm": 0.1756519377231598,
"learning_rate": 1.7767064235919594e-05,
"loss": 0.1225,
"step": 738
},
{
"epoch": 0.296,
"grad_norm": 0.09066800028085709,
"learning_rate": 1.7749444887041797e-05,
"loss": 0.2309,
"step": 740
},
{
"epoch": 0.2968,
"grad_norm": 2.341961622238159,
"learning_rate": 1.7731765106436073e-05,
"loss": 0.5643,
"step": 742
},
{
"epoch": 0.2976,
"grad_norm": 0.09298836439847946,
"learning_rate": 1.7714025031972904e-05,
"loss": 0.272,
"step": 744
},
{
"epoch": 0.2984,
"grad_norm": 0.15143701434135437,
"learning_rate": 1.7696224801992947e-05,
"loss": 0.0548,
"step": 746
},
{
"epoch": 0.2992,
"grad_norm": 1.298582911491394,
"learning_rate": 1.767836455530598e-05,
"loss": 0.2634,
"step": 748
},
{
"epoch": 0.3,
"grad_norm": 0.12245524674654007,
"learning_rate": 1.766044443118978e-05,
"loss": 0.113,
"step": 750
},
{
"epoch": 0.3008,
"grad_norm": 0.08619975298643112,
"learning_rate": 1.764246456938909e-05,
"loss": 0.0373,
"step": 752
},
{
"epoch": 0.3016,
"grad_norm": 1.6767048835754395,
"learning_rate": 1.762442511011448e-05,
"loss": 0.3271,
"step": 754
},
{
"epoch": 0.3024,
"grad_norm": 0.0463847815990448,
"learning_rate": 1.7606326194041274e-05,
"loss": 0.3972,
"step": 756
},
{
"epoch": 0.3032,
"grad_norm": 0.018276596441864967,
"learning_rate": 1.7588167962308458e-05,
"loss": 0.0141,
"step": 758
},
{
"epoch": 0.304,
"grad_norm": 0.1315947324037552,
"learning_rate": 1.7569950556517566e-05,
"loss": 0.0556,
"step": 760
},
{
"epoch": 0.3048,
"grad_norm": 0.5924662947654724,
"learning_rate": 1.7551674118731592e-05,
"loss": 0.1275,
"step": 762
},
{
"epoch": 0.3056,
"grad_norm": 0.23044385015964508,
"learning_rate": 1.7533338791473872e-05,
"loss": 0.0416,
"step": 764
},
{
"epoch": 0.3064,
"grad_norm": 1.4832113981246948,
"learning_rate": 1.7514944717726962e-05,
"loss": 0.4879,
"step": 766
},
{
"epoch": 0.3072,
"grad_norm": 0.08857022970914841,
"learning_rate": 1.749649204093155e-05,
"loss": 0.042,
"step": 768
},
{
"epoch": 0.308,
"grad_norm": 0.24603064358234406,
"learning_rate": 1.747798090498532e-05,
"loss": 0.2623,
"step": 770
},
{
"epoch": 0.3088,
"grad_norm": 0.08749913424253464,
"learning_rate": 1.7459411454241822e-05,
"loss": 0.0188,
"step": 772
},
{
"epoch": 0.3096,
"grad_norm": 0.10140682011842728,
"learning_rate": 1.7440783833509366e-05,
"loss": 0.0401,
"step": 774
},
{
"epoch": 0.3104,
"grad_norm": 0.027609799057245255,
"learning_rate": 1.7422098188049885e-05,
"loss": 0.0173,
"step": 776
},
{
"epoch": 0.3112,
"grad_norm": 1.5163832902908325,
"learning_rate": 1.7403354663577782e-05,
"loss": 0.654,
"step": 778
},
{
"epoch": 0.312,
"grad_norm": 0.4432819187641144,
"learning_rate": 1.7384553406258842e-05,
"loss": 0.0778,
"step": 780
},
{
"epoch": 0.3128,
"grad_norm": 0.006748716812580824,
"learning_rate": 1.7365694562709034e-05,
"loss": 0.0119,
"step": 782
},
{
"epoch": 0.3136,
"grad_norm": 0.023698939010500908,
"learning_rate": 1.7346778279993417e-05,
"loss": 0.013,
"step": 784
},
{
"epoch": 0.3144,
"grad_norm": 0.01780467852950096,
"learning_rate": 1.732780470562496e-05,
"loss": 0.015,
"step": 786
},
{
"epoch": 0.3152,
"grad_norm": 1.4911342859268188,
"learning_rate": 1.7308773987563406e-05,
"loss": 0.1485,
"step": 788
},
{
"epoch": 0.316,
"grad_norm": 0.030542919412255287,
"learning_rate": 1.7289686274214116e-05,
"loss": 0.0136,
"step": 790
},
{
"epoch": 0.3168,
"grad_norm": 0.039021674543619156,
"learning_rate": 1.727054171442692e-05,
"loss": 0.0715,
"step": 792
},
{
"epoch": 0.3176,
"grad_norm": 0.24402689933776855,
"learning_rate": 1.7251340457494934e-05,
"loss": 0.5099,
"step": 794
},
{
"epoch": 0.3184,
"grad_norm": 2.137239456176758,
"learning_rate": 1.7232082653153422e-05,
"loss": 0.5133,
"step": 796
},
{
"epoch": 0.3192,
"grad_norm": 2.8100578784942627,
"learning_rate": 1.721276845157861e-05,
"loss": 0.428,
"step": 798
},
{
"epoch": 0.32,
"grad_norm": 0.1050364151597023,
"learning_rate": 1.7193398003386514e-05,
"loss": 0.0221,
"step": 800
},
{
"epoch": 0.3208,
"grad_norm": 0.02302808314561844,
"learning_rate": 1.717397145963179e-05,
"loss": 0.0524,
"step": 802
},
{
"epoch": 0.3216,
"grad_norm": 0.5185064077377319,
"learning_rate": 1.715448897180652e-05,
"loss": 0.0452,
"step": 804
},
{
"epoch": 0.3224,
"grad_norm": 0.49271076917648315,
"learning_rate": 1.7134950691839063e-05,
"loss": 0.1355,
"step": 806
},
{
"epoch": 0.3232,
"grad_norm": 0.2875131666660309,
"learning_rate": 1.7115356772092858e-05,
"loss": 0.5409,
"step": 808
},
{
"epoch": 0.324,
"grad_norm": 2.0039379596710205,
"learning_rate": 1.709570736536521e-05,
"loss": 0.2663,
"step": 810
},
{
"epoch": 0.3248,
"grad_norm": 1.235532522201538,
"learning_rate": 1.7076002624886156e-05,
"loss": 0.391,
"step": 812
},
{
"epoch": 0.3256,
"grad_norm": 0.9183419942855835,
"learning_rate": 1.705624270431721e-05,
"loss": 0.1418,
"step": 814
},
{
"epoch": 0.3264,
"grad_norm": 0.5461841821670532,
"learning_rate": 1.7036427757750205e-05,
"loss": 0.1603,
"step": 816
},
{
"epoch": 0.3272,
"grad_norm": 0.373760849237442,
"learning_rate": 1.7016557939706075e-05,
"loss": 0.0709,
"step": 818
},
{
"epoch": 0.328,
"grad_norm": 0.40094730257987976,
"learning_rate": 1.6996633405133656e-05,
"loss": 0.0659,
"step": 820
},
{
"epoch": 0.3288,
"grad_norm": 0.05968625098466873,
"learning_rate": 1.6976654309408464e-05,
"loss": 0.0307,
"step": 822
},
{
"epoch": 0.3296,
"grad_norm": 0.22513960301876068,
"learning_rate": 1.695662080833151e-05,
"loss": 0.5364,
"step": 824
},
{
"epoch": 0.3304,
"grad_norm": 0.2083749920129776,
"learning_rate": 1.693653305812805e-05,
"loss": 0.0446,
"step": 826
},
{
"epoch": 0.3312,
"grad_norm": 0.00946067925542593,
"learning_rate": 1.6916391215446403e-05,
"loss": 0.0212,
"step": 828
},
{
"epoch": 0.332,
"grad_norm": 0.08354925364255905,
"learning_rate": 1.68961954373567e-05,
"loss": 0.0427,
"step": 830
},
{
"epoch": 0.3328,
"grad_norm": 0.04629332199692726,
"learning_rate": 1.6875945881349676e-05,
"loss": 0.0264,
"step": 832
},
{
"epoch": 0.3336,
"grad_norm": 0.05332889407873154,
"learning_rate": 1.6855642705335438e-05,
"loss": 0.0174,
"step": 834
},
{
"epoch": 0.3344,
"grad_norm": 0.04525836184620857,
"learning_rate": 1.6835286067642228e-05,
"loss": 0.0188,
"step": 836
},
{
"epoch": 0.3352,
"grad_norm": 1.4626930952072144,
"learning_rate": 1.68148761270152e-05,
"loss": 0.3748,
"step": 838
},
{
"epoch": 0.336,
"grad_norm": 0.023244062438607216,
"learning_rate": 1.6794413042615168e-05,
"loss": 0.1176,
"step": 840
},
{
"epoch": 0.3368,
"grad_norm": 0.06457129120826721,
"learning_rate": 1.6773896974017373e-05,
"loss": 0.5326,
"step": 842
},
{
"epoch": 0.3376,
"grad_norm": 0.0086558498442173,
"learning_rate": 1.6753328081210244e-05,
"loss": 0.1256,
"step": 844
},
{
"epoch": 0.3384,
"grad_norm": 0.005909595172852278,
"learning_rate": 1.6732706524594138e-05,
"loss": 0.0242,
"step": 846
},
{
"epoch": 0.3392,
"grad_norm": 0.0692615732550621,
"learning_rate": 1.6712032464980094e-05,
"loss": 0.0237,
"step": 848
},
{
"epoch": 0.34,
"grad_norm": 2.266417980194092,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.3703,
"step": 850
},
{
"epoch": 0.3408,
"grad_norm": 2.5376994609832764,
"learning_rate": 1.6670527482048246e-05,
"loss": 0.3166,
"step": 852
},
{
"epoch": 0.3416,
"grad_norm": 0.02030259743332863,
"learning_rate": 1.6649696882394635e-05,
"loss": 0.0218,
"step": 854
},
{
"epoch": 0.3424,
"grad_norm": 0.09955969452857971,
"learning_rate": 1.6628814427068954e-05,
"loss": 0.0158,
"step": 856
},
{
"epoch": 0.3432,
"grad_norm": 1.0117660760879517,
"learning_rate": 1.6607880278916778e-05,
"loss": 0.4415,
"step": 858
},
{
"epoch": 0.344,
"grad_norm": 0.7470207214355469,
"learning_rate": 1.6586894601186804e-05,
"loss": 0.0473,
"step": 860
},
{
"epoch": 0.3448,
"grad_norm": 2.56488299369812,
"learning_rate": 1.6565857557529567e-05,
"loss": 0.5677,
"step": 862
},
{
"epoch": 0.3456,
"grad_norm": 0.06736627966165543,
"learning_rate": 1.654476931199615e-05,
"loss": 0.0131,
"step": 864
},
{
"epoch": 0.3464,
"grad_norm": 0.2571139931678772,
"learning_rate": 1.652363002903693e-05,
"loss": 0.1469,
"step": 866
},
{
"epoch": 0.3472,
"grad_norm": 0.21657270193099976,
"learning_rate": 1.650243987350029e-05,
"loss": 0.0489,
"step": 868
},
{
"epoch": 0.348,
"grad_norm": 0.02142491564154625,
"learning_rate": 1.6481199010631312e-05,
"loss": 0.0332,
"step": 870
},
{
"epoch": 0.3488,
"grad_norm": 0.1039031520485878,
"learning_rate": 1.6459907606070513e-05,
"loss": 0.2827,
"step": 872
},
{
"epoch": 0.3496,
"grad_norm": 0.07285499572753906,
"learning_rate": 1.643856582585254e-05,
"loss": 0.0334,
"step": 874
},
{
"epoch": 0.3504,
"grad_norm": 0.0627899169921875,
"learning_rate": 1.6417173836404888e-05,
"loss": 0.0265,
"step": 876
},
{
"epoch": 0.3512,
"grad_norm": 0.021432699635624886,
"learning_rate": 1.6395731804546582e-05,
"loss": 0.0292,
"step": 878
},
{
"epoch": 0.352,
"grad_norm": 0.07570278644561768,
"learning_rate": 1.63742398974869e-05,
"loss": 0.0153,
"step": 880
},
{
"epoch": 0.3528,
"grad_norm": 0.0861375704407692,
"learning_rate": 1.6352698282824045e-05,
"loss": 0.0148,
"step": 882
},
{
"epoch": 0.3536,
"grad_norm": 0.12202827632427216,
"learning_rate": 1.6331107128543856e-05,
"loss": 0.0365,
"step": 884
},
{
"epoch": 0.3544,
"grad_norm": 0.049717921763658524,
"learning_rate": 1.6309466603018497e-05,
"loss": 0.0202,
"step": 886
},
{
"epoch": 0.3552,
"grad_norm": 2.4151523113250732,
"learning_rate": 1.628777687500513e-05,
"loss": 0.4901,
"step": 888
},
{
"epoch": 0.356,
"grad_norm": 0.03276515007019043,
"learning_rate": 1.6266038113644605e-05,
"loss": 0.0243,
"step": 890
},
{
"epoch": 0.3568,
"grad_norm": 0.09225738793611526,
"learning_rate": 1.624425048846016e-05,
"loss": 0.0916,
"step": 892
},
{
"epoch": 0.3576,
"grad_norm": 0.06391174346208572,
"learning_rate": 1.6222414169356066e-05,
"loss": 0.0105,
"step": 894
},
{
"epoch": 0.3584,
"grad_norm": 0.03721031919121742,
"learning_rate": 1.620052932661633e-05,
"loss": 0.0123,
"step": 896
},
{
"epoch": 0.3592,
"grad_norm": 0.2955784201622009,
"learning_rate": 1.6178596130903345e-05,
"loss": 1.2386,
"step": 898
},
{
"epoch": 0.36,
"grad_norm": 0.05584976449608803,
"learning_rate": 1.6156614753256583e-05,
"loss": 0.0926,
"step": 900
},
{
"epoch": 0.3608,
"grad_norm": 1.0190123319625854,
"learning_rate": 1.6134585365091243e-05,
"loss": 0.1162,
"step": 902
},
{
"epoch": 0.3616,
"grad_norm": 0.09299857914447784,
"learning_rate": 1.611250813819692e-05,
"loss": 0.0425,
"step": 904
},
{
"epoch": 0.3624,
"grad_norm": 0.057083725929260254,
"learning_rate": 1.6090383244736256e-05,
"loss": 0.0206,
"step": 906
},
{
"epoch": 0.3632,
"grad_norm": 0.006012015510350466,
"learning_rate": 1.6068210857243625e-05,
"loss": 0.0242,
"step": 908
},
{
"epoch": 0.364,
"grad_norm": 0.04129006341099739,
"learning_rate": 1.6045991148623752e-05,
"loss": 0.0422,
"step": 910
},
{
"epoch": 0.3648,
"grad_norm": 0.049417100846767426,
"learning_rate": 1.6023724292150387e-05,
"loss": 0.0371,
"step": 912
},
{
"epoch": 0.3656,
"grad_norm": 0.13303126394748688,
"learning_rate": 1.6001410461464955e-05,
"loss": 0.2441,
"step": 914
},
{
"epoch": 0.3664,
"grad_norm": 0.1420883685350418,
"learning_rate": 1.597904983057519e-05,
"loss": 0.0301,
"step": 916
},
{
"epoch": 0.3672,
"grad_norm": 0.02255750633776188,
"learning_rate": 1.5956642573853784e-05,
"loss": 0.1678,
"step": 918
},
{
"epoch": 0.368,
"grad_norm": 2.711224317550659,
"learning_rate": 1.5934188866037017e-05,
"loss": 0.2587,
"step": 920
},
{
"epoch": 0.3688,
"grad_norm": 0.015457335859537125,
"learning_rate": 1.591168888222342e-05,
"loss": 0.0161,
"step": 922
},
{
"epoch": 0.3696,
"grad_norm": 0.021741507574915886,
"learning_rate": 1.5889142797872387e-05,
"loss": 0.0163,
"step": 924
},
{
"epoch": 0.3704,
"grad_norm": 2.3807716369628906,
"learning_rate": 1.5866550788802815e-05,
"loss": 0.2696,
"step": 926
},
{
"epoch": 0.3712,
"grad_norm": 1.5936973094940186,
"learning_rate": 1.5843913031191722e-05,
"loss": 0.4635,
"step": 928
},
{
"epoch": 0.372,
"grad_norm": 0.18653613328933716,
"learning_rate": 1.5821229701572897e-05,
"loss": 0.0413,
"step": 930
},
{
"epoch": 0.3728,
"grad_norm": 0.048368774354457855,
"learning_rate": 1.5798500976835493e-05,
"loss": 0.0504,
"step": 932
},
{
"epoch": 0.3736,
"grad_norm": 1.1611881256103516,
"learning_rate": 1.5775727034222675e-05,
"loss": 0.1021,
"step": 934
},
{
"epoch": 0.3744,
"grad_norm": 0.03345046192407608,
"learning_rate": 1.575290805133023e-05,
"loss": 0.0152,
"step": 936
},
{
"epoch": 0.3752,
"grad_norm": 0.035641297698020935,
"learning_rate": 1.5730044206105156e-05,
"loss": 0.0111,
"step": 938
},
{
"epoch": 0.376,
"grad_norm": 0.1988631784915924,
"learning_rate": 1.570713567684432e-05,
"loss": 0.0267,
"step": 940
},
{
"epoch": 0.3768,
"grad_norm": 0.10902003198862076,
"learning_rate": 1.568418264219303e-05,
"loss": 0.063,
"step": 942
},
{
"epoch": 0.3776,
"grad_norm": 0.25664544105529785,
"learning_rate": 1.5661185281143666e-05,
"loss": 0.6095,
"step": 944
},
{
"epoch": 0.3784,
"grad_norm": 0.02122955210506916,
"learning_rate": 1.5638143773034268e-05,
"loss": 0.0116,
"step": 946
},
{
"epoch": 0.3792,
"grad_norm": 0.05375152826309204,
"learning_rate": 1.5615058297547144e-05,
"loss": 0.2848,
"step": 948
},
{
"epoch": 0.38,
"grad_norm": 0.5861246585845947,
"learning_rate": 1.5591929034707468e-05,
"loss": 0.0492,
"step": 950
},
{
"epoch": 0.3808,
"grad_norm": 0.2694084048271179,
"learning_rate": 1.556875616488188e-05,
"loss": 0.0587,
"step": 952
},
{
"epoch": 0.3816,
"grad_norm": 0.1376236230134964,
"learning_rate": 1.5545539868777075e-05,
"loss": 0.094,
"step": 954
},
{
"epoch": 0.3824,
"grad_norm": 0.03570554405450821,
"learning_rate": 1.5522280327438388e-05,
"loss": 0.0584,
"step": 956
},
{
"epoch": 0.3832,
"grad_norm": 0.01179492473602295,
"learning_rate": 1.54989777222484e-05,
"loss": 0.4026,
"step": 958
},
{
"epoch": 0.384,
"grad_norm": 0.4097414016723633,
"learning_rate": 1.5475632234925505e-05,
"loss": 0.0395,
"step": 960
},
{
"epoch": 0.3848,
"grad_norm": 0.021319517865777016,
"learning_rate": 1.5452244047522504e-05,
"loss": 0.0063,
"step": 962
},
{
"epoch": 0.3856,
"grad_norm": 0.008077614940702915,
"learning_rate": 1.5428813342425177e-05,
"loss": 0.0087,
"step": 964
},
{
"epoch": 0.3864,
"grad_norm": 0.23517374694347382,
"learning_rate": 1.540534030235087e-05,
"loss": 0.0319,
"step": 966
},
{
"epoch": 0.3872,
"grad_norm": 0.5178306102752686,
"learning_rate": 1.5381825110347072e-05,
"loss": 0.077,
"step": 968
},
{
"epoch": 0.388,
"grad_norm": 0.15628471970558167,
"learning_rate": 1.5358267949789968e-05,
"loss": 0.026,
"step": 970
},
{
"epoch": 0.3888,
"grad_norm": 0.32474571466445923,
"learning_rate": 1.533466900438303e-05,
"loss": 0.049,
"step": 972
},
{
"epoch": 0.3896,
"grad_norm": 0.024446366354823112,
"learning_rate": 1.5311028458155567e-05,
"loss": 0.1731,
"step": 974
},
{
"epoch": 0.3904,
"grad_norm": 0.1003560721874237,
"learning_rate": 1.528734649546132e-05,
"loss": 0.0142,
"step": 976
},
{
"epoch": 0.3912,
"grad_norm": 0.05430926755070686,
"learning_rate": 1.526362330097698e-05,
"loss": 0.1951,
"step": 978
},
{
"epoch": 0.392,
"grad_norm": 0.0365879088640213,
"learning_rate": 1.5239859059700794e-05,
"loss": 0.0093,
"step": 980
},
{
"epoch": 0.3928,
"grad_norm": 0.006135095842182636,
"learning_rate": 1.5216053956951081e-05,
"loss": 0.0098,
"step": 982
},
{
"epoch": 0.3936,
"grad_norm": 0.25733840465545654,
"learning_rate": 1.5192208178364815e-05,
"loss": 0.0337,
"step": 984
},
{
"epoch": 0.3944,
"grad_norm": 1.6840555667877197,
"learning_rate": 1.5168321909896171e-05,
"loss": 0.0434,
"step": 986
},
{
"epoch": 0.3952,
"grad_norm": 0.12561841309070587,
"learning_rate": 1.5144395337815066e-05,
"loss": 1.176,
"step": 988
},
{
"epoch": 0.396,
"grad_norm": 0.03084614872932434,
"learning_rate": 1.5120428648705716e-05,
"loss": 0.0051,
"step": 990
},
{
"epoch": 0.3968,
"grad_norm": 0.08889269083738327,
"learning_rate": 1.5096422029465178e-05,
"loss": 0.0162,
"step": 992
},
{
"epoch": 0.3976,
"grad_norm": 0.012536651454865932,
"learning_rate": 1.5072375667301893e-05,
"loss": 0.0087,
"step": 994
},
{
"epoch": 0.3984,
"grad_norm": 0.08808624744415283,
"learning_rate": 1.504828974973422e-05,
"loss": 0.254,
"step": 996
},
{
"epoch": 0.3992,
"grad_norm": 0.00895402766764164,
"learning_rate": 1.5024164464588982e-05,
"loss": 0.0067,
"step": 998
},
{
"epoch": 0.4,
"grad_norm": 0.01615263894200325,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0474,
"step": 1000
},
{
"epoch": 0.4008,
"grad_norm": 0.11077135056257248,
"learning_rate": 1.4975796544406627e-05,
"loss": 0.0484,
"step": 1002
},
{
"epoch": 0.4016,
"grad_norm": 1.5227055549621582,
"learning_rate": 1.4951554286552266e-05,
"loss": 0.88,
"step": 1004
},
{
"epoch": 0.4024,
"grad_norm": 0.11261726170778275,
"learning_rate": 1.4927273415482916e-05,
"loss": 0.0309,
"step": 1006
},
{
"epoch": 0.4032,
"grad_norm": 3.3870084285736084,
"learning_rate": 1.4902954120545687e-05,
"loss": 0.2266,
"step": 1008
},
{
"epoch": 0.404,
"grad_norm": 0.6803337335586548,
"learning_rate": 1.4878596591387329e-05,
"loss": 0.0631,
"step": 1010
},
{
"epoch": 0.4048,
"grad_norm": 0.2337927520275116,
"learning_rate": 1.485420101795274e-05,
"loss": 0.0601,
"step": 1012
},
{
"epoch": 0.4056,
"grad_norm": 2.0759363174438477,
"learning_rate": 1.4829767590483508e-05,
"loss": 0.1077,
"step": 1014
},
{
"epoch": 0.4064,
"grad_norm": 0.16511759161949158,
"learning_rate": 1.4805296499516408e-05,
"loss": 0.0283,
"step": 1016
},
{
"epoch": 0.4072,
"grad_norm": 0.013055981136858463,
"learning_rate": 1.4780787935881925e-05,
"loss": 0.0067,
"step": 1018
},
{
"epoch": 0.408,
"grad_norm": 1.3341008424758911,
"learning_rate": 1.4756242090702756e-05,
"loss": 0.1319,
"step": 1020
},
{
"epoch": 0.4088,
"grad_norm": 0.21409232914447784,
"learning_rate": 1.4731659155392332e-05,
"loss": 0.0223,
"step": 1022
},
{
"epoch": 0.4096,
"grad_norm": 0.007527098525315523,
"learning_rate": 1.470703932165333e-05,
"loss": 0.008,
"step": 1024
},
{
"epoch": 0.4104,
"grad_norm": 0.02161354571580887,
"learning_rate": 1.4682382781476146e-05,
"loss": 0.0093,
"step": 1026
},
{
"epoch": 0.4112,
"grad_norm": 3.1243057250976562,
"learning_rate": 1.4657689727137443e-05,
"loss": 0.2774,
"step": 1028
},
{
"epoch": 0.412,
"grad_norm": 0.7081921696662903,
"learning_rate": 1.463296035119862e-05,
"loss": 0.3953,
"step": 1030
},
{
"epoch": 0.4128,
"grad_norm": 0.23352546989917755,
"learning_rate": 1.4608194846504311e-05,
"loss": 0.0156,
"step": 1032
},
{
"epoch": 0.4136,
"grad_norm": 0.44725605845451355,
"learning_rate": 1.4583393406180898e-05,
"loss": 0.0288,
"step": 1034
},
{
"epoch": 0.4144,
"grad_norm": 0.10920588672161102,
"learning_rate": 1.4558556223635004e-05,
"loss": 0.0283,
"step": 1036
},
{
"epoch": 0.4152,
"grad_norm": 0.004958951845765114,
"learning_rate": 1.4533683492551954e-05,
"loss": 0.0138,
"step": 1038
},
{
"epoch": 0.416,
"grad_norm": 0.009044544771313667,
"learning_rate": 1.4508775406894308e-05,
"loss": 0.0509,
"step": 1040
},
{
"epoch": 0.4168,
"grad_norm": 4.271718502044678,
"learning_rate": 1.4483832160900326e-05,
"loss": 0.6421,
"step": 1042
},
{
"epoch": 0.4176,
"grad_norm": 0.005885216407477856,
"learning_rate": 1.4458853949082443e-05,
"loss": 0.0097,
"step": 1044
},
{
"epoch": 0.4184,
"grad_norm": 0.004338196478784084,
"learning_rate": 1.4433840966225772e-05,
"loss": 0.0071,
"step": 1046
},
{
"epoch": 0.4192,
"grad_norm": 0.0037080624606460333,
"learning_rate": 1.4408793407386587e-05,
"loss": 0.012,
"step": 1048
},
{
"epoch": 0.42,
"grad_norm": 0.032303668558597565,
"learning_rate": 1.4383711467890776e-05,
"loss": 0.0057,
"step": 1050
},
{
"epoch": 0.4208,
"grad_norm": 0.006096419878304005,
"learning_rate": 1.4358595343332342e-05,
"loss": 0.0036,
"step": 1052
},
{
"epoch": 0.4216,
"grad_norm": 0.018376469612121582,
"learning_rate": 1.4333445229571874e-05,
"loss": 0.0039,
"step": 1054
},
{
"epoch": 0.4224,
"grad_norm": 0.05468170344829559,
"learning_rate": 1.4308261322735006e-05,
"loss": 0.0417,
"step": 1056
},
{
"epoch": 0.4232,
"grad_norm": 0.033594775944948196,
"learning_rate": 1.4283043819210905e-05,
"loss": 0.0169,
"step": 1058
},
{
"epoch": 0.424,
"grad_norm": 0.006937976460903883,
"learning_rate": 1.4257792915650728e-05,
"loss": 0.0705,
"step": 1060
},
{
"epoch": 0.4248,
"grad_norm": 0.01067473366856575,
"learning_rate": 1.4232508808966097e-05,
"loss": 0.0099,
"step": 1062
},
{
"epoch": 0.4256,
"grad_norm": 0.032820601016283035,
"learning_rate": 1.420719169632755e-05,
"loss": 0.0063,
"step": 1064
},
{
"epoch": 0.4264,
"grad_norm": 4.574073314666748,
"learning_rate": 1.4181841775163014e-05,
"loss": 1.646,
"step": 1066
},
{
"epoch": 0.4272,
"grad_norm": 2.232372999191284,
"learning_rate": 1.415645924315628e-05,
"loss": 0.0774,
"step": 1068
},
{
"epoch": 0.428,
"grad_norm": 0.007644816767424345,
"learning_rate": 1.413104429824542e-05,
"loss": 0.0042,
"step": 1070
},
{
"epoch": 0.4288,
"grad_norm": 0.041802894324064255,
"learning_rate": 1.4105597138621281e-05,
"loss": 0.0084,
"step": 1072
},
{
"epoch": 0.4296,
"grad_norm": 1.1193236112594604,
"learning_rate": 1.4080117962725929e-05,
"loss": 0.5387,
"step": 1074
},
{
"epoch": 0.4304,
"grad_norm": 0.04370833560824394,
"learning_rate": 1.4054606969251095e-05,
"loss": 0.0382,
"step": 1076
},
{
"epoch": 0.4312,
"grad_norm": 0.9142640233039856,
"learning_rate": 1.4029064357136628e-05,
"loss": 0.8189,
"step": 1078
},
{
"epoch": 0.432,
"grad_norm": 0.30186763405799866,
"learning_rate": 1.4003490325568953e-05,
"loss": 0.0461,
"step": 1080
},
{
"epoch": 0.4328,
"grad_norm": 0.05774744972586632,
"learning_rate": 1.39778850739795e-05,
"loss": 0.6106,
"step": 1082
},
{
"epoch": 0.4336,
"grad_norm": 0.3426492214202881,
"learning_rate": 1.3952248802043166e-05,
"loss": 0.0866,
"step": 1084
},
{
"epoch": 0.4344,
"grad_norm": 0.5425324440002441,
"learning_rate": 1.3926581709676752e-05,
"loss": 0.1086,
"step": 1086
},
{
"epoch": 0.4352,
"grad_norm": 0.10301554948091507,
"learning_rate": 1.3900883997037398e-05,
"loss": 0.0468,
"step": 1088
},
{
"epoch": 0.436,
"grad_norm": 1.1397227048873901,
"learning_rate": 1.3875155864521031e-05,
"loss": 0.2403,
"step": 1090
},
{
"epoch": 0.4368,
"grad_norm": 0.02772480994462967,
"learning_rate": 1.3849397512760797e-05,
"loss": 0.1271,
"step": 1092
},
{
"epoch": 0.4376,
"grad_norm": 0.10023550689220428,
"learning_rate": 1.3823609142625492e-05,
"loss": 0.0979,
"step": 1094
},
{
"epoch": 0.4384,
"grad_norm": 1.5255858898162842,
"learning_rate": 1.3797790955218014e-05,
"loss": 0.7019,
"step": 1096
},
{
"epoch": 0.4392,
"grad_norm": 0.0757753774523735,
"learning_rate": 1.3771943151873768e-05,
"loss": 0.0422,
"step": 1098
},
{
"epoch": 0.44,
"grad_norm": 0.053413309156894684,
"learning_rate": 1.3746065934159123e-05,
"loss": 0.4052,
"step": 1100
},
{
"epoch": 0.4408,
"grad_norm": 0.061455827206373215,
"learning_rate": 1.3720159503869816e-05,
"loss": 0.0402,
"step": 1102
},
{
"epoch": 0.4416,
"grad_norm": 0.3662703335285187,
"learning_rate": 1.3694224063029396e-05,
"loss": 0.0559,
"step": 1104
},
{
"epoch": 0.4424,
"grad_norm": 0.09608737379312515,
"learning_rate": 1.3668259813887644e-05,
"loss": 0.0703,
"step": 1106
},
{
"epoch": 0.4432,
"grad_norm": 0.09362676739692688,
"learning_rate": 1.3642266958918985e-05,
"loss": 0.0191,
"step": 1108
},
{
"epoch": 0.444,
"grad_norm": 0.03751781955361366,
"learning_rate": 1.3616245700820922e-05,
"loss": 0.1236,
"step": 1110
},
{
"epoch": 0.4448,
"grad_norm": 2.1613173484802246,
"learning_rate": 1.3590196242512463e-05,
"loss": 0.6019,
"step": 1112
},
{
"epoch": 0.4456,
"grad_norm": 0.03422972559928894,
"learning_rate": 1.3564118787132507e-05,
"loss": 0.0138,
"step": 1114
},
{
"epoch": 0.4464,
"grad_norm": 0.01046049501746893,
"learning_rate": 1.3538013538038295e-05,
"loss": 0.0101,
"step": 1116
},
{
"epoch": 0.4472,
"grad_norm": 0.7763115167617798,
"learning_rate": 1.3511880698803801e-05,
"loss": 0.1013,
"step": 1118
},
{
"epoch": 0.448,
"grad_norm": 0.2062385082244873,
"learning_rate": 1.3485720473218153e-05,
"loss": 0.0343,
"step": 1120
},
{
"epoch": 0.4488,
"grad_norm": 0.3189225196838379,
"learning_rate": 1.3459533065284049e-05,
"loss": 0.0661,
"step": 1122
},
{
"epoch": 0.4496,
"grad_norm": 0.9145419597625732,
"learning_rate": 1.3433318679216154e-05,
"loss": 0.555,
"step": 1124
},
{
"epoch": 0.4504,
"grad_norm": 1.2541166543960571,
"learning_rate": 1.340707751943952e-05,
"loss": 0.5902,
"step": 1126
},
{
"epoch": 0.4512,
"grad_norm": 0.02262170799076557,
"learning_rate": 1.3380809790587975e-05,
"loss": 0.0191,
"step": 1128
},
{
"epoch": 0.452,
"grad_norm": 0.4805641174316406,
"learning_rate": 1.3354515697502552e-05,
"loss": 0.0719,
"step": 1130
},
{
"epoch": 0.4528,
"grad_norm": 0.0759739875793457,
"learning_rate": 1.3328195445229869e-05,
"loss": 0.035,
"step": 1132
},
{
"epoch": 0.4536,
"grad_norm": 0.0279012992978096,
"learning_rate": 1.3301849239020537e-05,
"loss": 0.0732,
"step": 1134
},
{
"epoch": 0.4544,
"grad_norm": 0.040366217494010925,
"learning_rate": 1.327547728432757e-05,
"loss": 0.036,
"step": 1136
},
{
"epoch": 0.4552,
"grad_norm": 1.5210012197494507,
"learning_rate": 1.3249079786804765e-05,
"loss": 0.1844,
"step": 1138
},
{
"epoch": 0.456,
"grad_norm": 0.26963767409324646,
"learning_rate": 1.3222656952305113e-05,
"loss": 0.0458,
"step": 1140
},
{
"epoch": 0.4568,
"grad_norm": 2.5474393367767334,
"learning_rate": 1.319620898687918e-05,
"loss": 0.4129,
"step": 1142
},
{
"epoch": 0.4576,
"grad_norm": 0.021995197981595993,
"learning_rate": 1.316973609677352e-05,
"loss": 0.0168,
"step": 1144
},
{
"epoch": 0.4584,
"grad_norm": 0.0323915109038353,
"learning_rate": 1.3143238488429042e-05,
"loss": 0.109,
"step": 1146
},
{
"epoch": 0.4592,
"grad_norm": 0.050105806440114975,
"learning_rate": 1.3116716368479418e-05,
"loss": 0.0305,
"step": 1148
},
{
"epoch": 0.46,
"grad_norm": 0.13928213715553284,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.0237,
"step": 1150
},
{
"epoch": 0.4608,
"grad_norm": 0.06462297588586807,
"learning_rate": 1.306359942125356e-05,
"loss": 0.0192,
"step": 1152
},
{
"epoch": 0.4616,
"grad_norm": 0.022758983075618744,
"learning_rate": 1.3037005008193944e-05,
"loss": 0.7191,
"step": 1154
},
{
"epoch": 0.4624,
"grad_norm": 0.03799004480242729,
"learning_rate": 1.3010386911959207e-05,
"loss": 0.0214,
"step": 1156
},
{
"epoch": 0.4632,
"grad_norm": 0.2102702260017395,
"learning_rate": 1.2983745340122604e-05,
"loss": 0.1163,
"step": 1158
},
{
"epoch": 0.464,
"grad_norm": 0.08294696360826492,
"learning_rate": 1.2957080500440469e-05,
"loss": 0.0268,
"step": 1160
},
{
"epoch": 0.4648,
"grad_norm": 0.04941265285015106,
"learning_rate": 1.2930392600850574e-05,
"loss": 0.0154,
"step": 1162
},
{
"epoch": 0.4656,
"grad_norm": 0.05030106008052826,
"learning_rate": 1.2903681849470528e-05,
"loss": 0.1079,
"step": 1164
},
{
"epoch": 0.4664,
"grad_norm": 0.11340274661779404,
"learning_rate": 1.287694845459613e-05,
"loss": 0.018,
"step": 1166
},
{
"epoch": 0.4672,
"grad_norm": 0.06605678051710129,
"learning_rate": 1.2850192624699762e-05,
"loss": 0.0336,
"step": 1168
},
{
"epoch": 0.468,
"grad_norm": 1.597208023071289,
"learning_rate": 1.2823414568428767e-05,
"loss": 0.7895,
"step": 1170
},
{
"epoch": 0.4688,
"grad_norm": 0.12116717547178268,
"learning_rate": 1.27966144946038e-05,
"loss": 0.026,
"step": 1172
},
{
"epoch": 0.4696,
"grad_norm": 0.04143265634775162,
"learning_rate": 1.2769792612217224e-05,
"loss": 0.1011,
"step": 1174
},
{
"epoch": 0.4704,
"grad_norm": 0.09404078125953674,
"learning_rate": 1.2742949130431468e-05,
"loss": 0.0211,
"step": 1176
},
{
"epoch": 0.4712,
"grad_norm": 0.024886123836040497,
"learning_rate": 1.2716084258577388e-05,
"loss": 0.012,
"step": 1178
},
{
"epoch": 0.472,
"grad_norm": 0.05635695159435272,
"learning_rate": 1.2689198206152657e-05,
"loss": 0.0137,
"step": 1180
},
{
"epoch": 0.4728,
"grad_norm": 0.09039817750453949,
"learning_rate": 1.2662291182820115e-05,
"loss": 0.0537,
"step": 1182
},
{
"epoch": 0.4736,
"grad_norm": 0.41814690828323364,
"learning_rate": 1.263536339840613e-05,
"loss": 0.0814,
"step": 1184
},
{
"epoch": 0.4744,
"grad_norm": 0.23532457649707794,
"learning_rate": 1.2608415062898971e-05,
"loss": 0.0322,
"step": 1186
},
{
"epoch": 0.4752,
"grad_norm": 0.009926537983119488,
"learning_rate": 1.2581446386447178e-05,
"loss": 0.1281,
"step": 1188
},
{
"epoch": 0.476,
"grad_norm": 0.4470027983188629,
"learning_rate": 1.2554457579357906e-05,
"loss": 0.0478,
"step": 1190
},
{
"epoch": 0.4768,
"grad_norm": 0.05581334978342056,
"learning_rate": 1.2527448852095295e-05,
"loss": 0.0226,
"step": 1192
},
{
"epoch": 0.4776,
"grad_norm": 0.06944628804922104,
"learning_rate": 1.2500420415278822e-05,
"loss": 0.0229,
"step": 1194
},
{
"epoch": 0.4784,
"grad_norm": 0.11777735501527786,
"learning_rate": 1.2473372479681671e-05,
"loss": 0.0189,
"step": 1196
},
{
"epoch": 0.4792,
"grad_norm": 0.37742823362350464,
"learning_rate": 1.2446305256229074e-05,
"loss": 0.2176,
"step": 1198
},
{
"epoch": 0.48,
"grad_norm": 1.1584476232528687,
"learning_rate": 1.2419218955996677e-05,
"loss": 0.5342,
"step": 1200
},
{
"epoch": 0.4808,
"grad_norm": 0.21147167682647705,
"learning_rate": 1.2392113790208895e-05,
"loss": 0.027,
"step": 1202
},
{
"epoch": 0.4816,
"grad_norm": 0.061478614807128906,
"learning_rate": 1.236498997023725e-05,
"loss": 0.027,
"step": 1204
},
{
"epoch": 0.4824,
"grad_norm": 0.02769525721669197,
"learning_rate": 1.2337847707598738e-05,
"loss": 0.0325,
"step": 1206
},
{
"epoch": 0.4832,
"grad_norm": 0.6708433628082275,
"learning_rate": 1.2310687213954182e-05,
"loss": 0.1057,
"step": 1208
},
{
"epoch": 0.484,
"grad_norm": 0.08236505091190338,
"learning_rate": 1.2283508701106559e-05,
"loss": 0.0192,
"step": 1210
},
{
"epoch": 0.4848,
"grad_norm": 0.01167634129524231,
"learning_rate": 1.2256312380999376e-05,
"loss": 0.3463,
"step": 1212
},
{
"epoch": 0.4856,
"grad_norm": 0.03433239459991455,
"learning_rate": 1.2229098465715005e-05,
"loss": 0.3755,
"step": 1214
},
{
"epoch": 0.4864,
"grad_norm": 1.2798956632614136,
"learning_rate": 1.2201867167473015e-05,
"loss": 0.3444,
"step": 1216
},
{
"epoch": 0.4872,
"grad_norm": 0.003344225697219372,
"learning_rate": 1.217461869862855e-05,
"loss": 0.0047,
"step": 1218
},
{
"epoch": 0.488,
"grad_norm": 0.006934499368071556,
"learning_rate": 1.2147353271670634e-05,
"loss": 0.0031,
"step": 1220
},
{
"epoch": 0.4888,
"grad_norm": 0.04046230763196945,
"learning_rate": 1.212007109922055e-05,
"loss": 0.0566,
"step": 1222
},
{
"epoch": 0.4896,
"grad_norm": 1.6989425420761108,
"learning_rate": 1.2092772394030153e-05,
"loss": 0.3364,
"step": 1224
},
{
"epoch": 0.4904,
"grad_norm": 0.6641682982444763,
"learning_rate": 1.2065457368980236e-05,
"loss": 0.0774,
"step": 1226
},
{
"epoch": 0.4912,
"grad_norm": 0.07043974846601486,
"learning_rate": 1.203812623707885e-05,
"loss": 0.0186,
"step": 1228
},
{
"epoch": 0.492,
"grad_norm": 0.01750028319656849,
"learning_rate": 1.2010779211459649e-05,
"loss": 0.1607,
"step": 1230
},
{
"epoch": 0.4928,
"grad_norm": 0.14301519095897675,
"learning_rate": 1.1983416505380234e-05,
"loss": 0.0485,
"step": 1232
},
{
"epoch": 0.4936,
"grad_norm": 0.046129919588565826,
"learning_rate": 1.1956038332220484e-05,
"loss": 0.0111,
"step": 1234
},
{
"epoch": 0.4944,
"grad_norm": 1.4883394241333008,
"learning_rate": 1.192864490548089e-05,
"loss": 0.5768,
"step": 1236
},
{
"epoch": 0.4952,
"grad_norm": 1.122119665145874,
"learning_rate": 1.1901236438780902e-05,
"loss": 0.1481,
"step": 1238
},
{
"epoch": 0.496,
"grad_norm": 0.19744625687599182,
"learning_rate": 1.187381314585725e-05,
"loss": 0.1095,
"step": 1240
},
{
"epoch": 0.4968,
"grad_norm": 0.06524749845266342,
"learning_rate": 1.184637524056227e-05,
"loss": 0.1115,
"step": 1242
},
{
"epoch": 0.4976,
"grad_norm": 0.024867022410035133,
"learning_rate": 1.181892293686227e-05,
"loss": 0.0095,
"step": 1244
},
{
"epoch": 0.4984,
"grad_norm": 0.20855647325515747,
"learning_rate": 1.1791456448835825e-05,
"loss": 0.0315,
"step": 1246
},
{
"epoch": 0.4992,
"grad_norm": 0.009501822292804718,
"learning_rate": 1.1763975990672125e-05,
"loss": 0.0212,
"step": 1248
},
{
"epoch": 0.5,
"grad_norm": 0.030127333477139473,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.0104,
"step": 1250
},
{
"epoch": 0.5008,
"grad_norm": 0.793460488319397,
"learning_rate": 1.1708974021232768e-05,
"loss": 0.1835,
"step": 1252
},
{
"epoch": 0.5016,
"grad_norm": 0.1162368431687355,
"learning_rate": 1.1681452938873516e-05,
"loss": 0.0165,
"step": 1254
},
{
"epoch": 0.5024,
"grad_norm": 0.03160820156335831,
"learning_rate": 1.1653918744206478e-05,
"loss": 0.0659,
"step": 1256
},
{
"epoch": 0.5032,
"grad_norm": 2.536365509033203,
"learning_rate": 1.1626371651948839e-05,
"loss": 0.1661,
"step": 1258
},
{
"epoch": 0.504,
"grad_norm": 0.06747017800807953,
"learning_rate": 1.159881187691835e-05,
"loss": 0.0591,
"step": 1260
},
{
"epoch": 0.5048,
"grad_norm": 0.10115383565425873,
"learning_rate": 1.157123963403168e-05,
"loss": 0.0216,
"step": 1262
},
{
"epoch": 0.5056,
"grad_norm": 0.017214270308613777,
"learning_rate": 1.1543655138302714e-05,
"loss": 0.0077,
"step": 1264
},
{
"epoch": 0.5064,
"grad_norm": 0.21995119750499725,
"learning_rate": 1.1516058604840891e-05,
"loss": 0.0389,
"step": 1266
},
{
"epoch": 0.5072,
"grad_norm": 0.01604562997817993,
"learning_rate": 1.1488450248849523e-05,
"loss": 0.0223,
"step": 1268
},
{
"epoch": 0.508,
"grad_norm": 0.031004084274172783,
"learning_rate": 1.1460830285624119e-05,
"loss": 0.0159,
"step": 1270
},
{
"epoch": 0.5088,
"grad_norm": 2.251960039138794,
"learning_rate": 1.1433198930550694e-05,
"loss": 0.2915,
"step": 1272
},
{
"epoch": 0.5096,
"grad_norm": 0.03171626105904579,
"learning_rate": 1.140555639910411e-05,
"loss": 0.0069,
"step": 1274
},
{
"epoch": 0.5104,
"grad_norm": 0.791187584400177,
"learning_rate": 1.137790290684638e-05,
"loss": 0.21,
"step": 1276
},
{
"epoch": 0.5112,
"grad_norm": 0.016608070582151413,
"learning_rate": 1.1350238669424993e-05,
"loss": 0.0064,
"step": 1278
},
{
"epoch": 0.512,
"grad_norm": 0.0038915143813937902,
"learning_rate": 1.1322563902571227e-05,
"loss": 0.002,
"step": 1280
},
{
"epoch": 0.5128,
"grad_norm": 0.2625635862350464,
"learning_rate": 1.129487882209847e-05,
"loss": 0.0156,
"step": 1282
},
{
"epoch": 0.5136,
"grad_norm": 0.037254203110933304,
"learning_rate": 1.1267183643900548e-05,
"loss": 0.0056,
"step": 1284
},
{
"epoch": 0.5144,
"grad_norm": 0.08309903740882874,
"learning_rate": 1.1239478583950019e-05,
"loss": 0.0056,
"step": 1286
},
{
"epoch": 0.5152,
"grad_norm": 1.2419265508651733,
"learning_rate": 1.1211763858296507e-05,
"loss": 0.1099,
"step": 1288
},
{
"epoch": 0.516,
"grad_norm": 0.008148876950144768,
"learning_rate": 1.1184039683065014e-05,
"loss": 0.0064,
"step": 1290
},
{
"epoch": 0.5168,
"grad_norm": 0.04519444331526756,
"learning_rate": 1.1156306274454218e-05,
"loss": 0.0066,
"step": 1292
},
{
"epoch": 0.5176,
"grad_norm": 0.0015114143025130033,
"learning_rate": 1.1128563848734817e-05,
"loss": 0.001,
"step": 1294
},
{
"epoch": 0.5184,
"grad_norm": 0.006196278613060713,
"learning_rate": 1.1100812622247823e-05,
"loss": 0.0032,
"step": 1296
},
{
"epoch": 0.5192,
"grad_norm": 0.0075135682709515095,
"learning_rate": 1.1073052811402867e-05,
"loss": 0.6506,
"step": 1298
},
{
"epoch": 0.52,
"grad_norm": 0.030847422778606415,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.016,
"step": 1300
},
{
"epoch": 0.5208,
"grad_norm": 0.07433111220598221,
"learning_rate": 1.1017508302610665e-05,
"loss": 0.0134,
"step": 1302
},
{
"epoch": 0.5216,
"grad_norm": 0.20086438953876495,
"learning_rate": 1.0989724037810651e-05,
"loss": 0.0139,
"step": 1304
},
{
"epoch": 0.5224,
"grad_norm": 0.015072687529027462,
"learning_rate": 1.0961932054943778e-05,
"loss": 0.0192,
"step": 1306
},
{
"epoch": 0.5232,
"grad_norm": 0.5555530190467834,
"learning_rate": 1.0934132570737508e-05,
"loss": 0.0359,
"step": 1308
},
{
"epoch": 0.524,
"grad_norm": 0.0621090903878212,
"learning_rate": 1.0906325801977804e-05,
"loss": 0.0126,
"step": 1310
},
{
"epoch": 0.5248,
"grad_norm": 0.013316777534782887,
"learning_rate": 1.0878511965507435e-05,
"loss": 0.0085,
"step": 1312
},
{
"epoch": 0.5256,
"grad_norm": 0.037804149091243744,
"learning_rate": 1.0850691278224282e-05,
"loss": 1.6496,
"step": 1314
},
{
"epoch": 0.5264,
"grad_norm": 0.04232160374522209,
"learning_rate": 1.0822863957079657e-05,
"loss": 0.0068,
"step": 1316
},
{
"epoch": 0.5272,
"grad_norm": 0.020469985902309418,
"learning_rate": 1.07950302190766e-05,
"loss": 0.1708,
"step": 1318
},
{
"epoch": 0.528,
"grad_norm": 0.04445105418562889,
"learning_rate": 1.0767190281268187e-05,
"loss": 0.1707,
"step": 1320
},
{
"epoch": 0.5288,
"grad_norm": 1.9354827404022217,
"learning_rate": 1.0739344360755853e-05,
"loss": 0.1453,
"step": 1322
},
{
"epoch": 0.5296,
"grad_norm": 0.0484926700592041,
"learning_rate": 1.071149267468767e-05,
"loss": 0.0299,
"step": 1324
},
{
"epoch": 0.5304,
"grad_norm": 0.02645016647875309,
"learning_rate": 1.0683635440256689e-05,
"loss": 0.0162,
"step": 1326
},
{
"epoch": 0.5312,
"grad_norm": 0.05882829427719116,
"learning_rate": 1.0655772874699217e-05,
"loss": 0.0311,
"step": 1328
},
{
"epoch": 0.532,
"grad_norm": 0.03740588575601578,
"learning_rate": 1.0627905195293135e-05,
"loss": 0.0254,
"step": 1330
},
{
"epoch": 0.5328,
"grad_norm": 0.21913115680217743,
"learning_rate": 1.0600032619356208e-05,
"loss": 0.0732,
"step": 1332
},
{
"epoch": 0.5336,
"grad_norm": 0.02981780469417572,
"learning_rate": 1.0572155364244383e-05,
"loss": 0.0603,
"step": 1334
},
{
"epoch": 0.5344,
"grad_norm": 0.3409574627876282,
"learning_rate": 1.0544273647350091e-05,
"loss": 0.0482,
"step": 1336
},
{
"epoch": 0.5352,
"grad_norm": 1.3873035907745361,
"learning_rate": 1.0516387686100566e-05,
"loss": 0.2645,
"step": 1338
},
{
"epoch": 0.536,
"grad_norm": 0.9133062958717346,
"learning_rate": 1.0488497697956134e-05,
"loss": 0.6023,
"step": 1340
},
{
"epoch": 0.5368,
"grad_norm": 0.014215713366866112,
"learning_rate": 1.0460603900408523e-05,
"loss": 0.0249,
"step": 1342
},
{
"epoch": 0.5376,
"grad_norm": 0.013020013459026814,
"learning_rate": 1.0432706510979172e-05,
"loss": 0.0572,
"step": 1344
},
{
"epoch": 0.5384,
"grad_norm": 0.07060626894235611,
"learning_rate": 1.0404805747217525e-05,
"loss": 0.0259,
"step": 1346
},
{
"epoch": 0.5392,
"grad_norm": 0.16781005263328552,
"learning_rate": 1.0376901826699349e-05,
"loss": 0.0441,
"step": 1348
},
{
"epoch": 0.54,
"grad_norm": 0.0290207639336586,
"learning_rate": 1.0348994967025012e-05,
"loss": 0.9059,
"step": 1350
},
{
"epoch": 0.5408,
"grad_norm": 0.03659132868051529,
"learning_rate": 1.0321085385817818e-05,
"loss": 0.6523,
"step": 1352
},
{
"epoch": 0.5416,
"grad_norm": 0.10811670869588852,
"learning_rate": 1.0293173300722286e-05,
"loss": 0.0472,
"step": 1354
},
{
"epoch": 0.5424,
"grad_norm": 0.04781627655029297,
"learning_rate": 1.026525892940246e-05,
"loss": 0.0203,
"step": 1356
},
{
"epoch": 0.5432,
"grad_norm": 0.0950162261724472,
"learning_rate": 1.0237342489540221e-05,
"loss": 0.5411,
"step": 1358
},
{
"epoch": 0.544,
"grad_norm": 0.059019722044467926,
"learning_rate": 1.0209424198833571e-05,
"loss": 0.049,
"step": 1360
},
{
"epoch": 0.5448,
"grad_norm": 0.0552980937063694,
"learning_rate": 1.0181504274994949e-05,
"loss": 0.5069,
"step": 1362
},
{
"epoch": 0.5456,
"grad_norm": 0.037155695259571075,
"learning_rate": 1.0153582935749531e-05,
"loss": 0.0366,
"step": 1364
},
{
"epoch": 0.5464,
"grad_norm": 0.32015174627304077,
"learning_rate": 1.0125660398833528e-05,
"loss": 0.1187,
"step": 1366
},
{
"epoch": 0.5472,
"grad_norm": 0.0431252084672451,
"learning_rate": 1.0097736881992492e-05,
"loss": 0.0293,
"step": 1368
},
{
"epoch": 0.548,
"grad_norm": 0.05704091861844063,
"learning_rate": 1.0069812602979617e-05,
"loss": 0.0485,
"step": 1370
},
{
"epoch": 0.5488,
"grad_norm": 0.0650290697813034,
"learning_rate": 1.0041887779554041e-05,
"loss": 0.0455,
"step": 1372
},
{
"epoch": 0.5496,
"grad_norm": 2.1595537662506104,
"learning_rate": 1.0013962629479145e-05,
"loss": 0.1577,
"step": 1374
},
{
"epoch": 0.5504,
"grad_norm": 0.05115671455860138,
"learning_rate": 9.986037370520856e-06,
"loss": 0.0318,
"step": 1376
},
{
"epoch": 0.5512,
"grad_norm": 0.06018679961562157,
"learning_rate": 9.958112220445964e-06,
"loss": 0.0382,
"step": 1378
},
{
"epoch": 0.552,
"grad_norm": 0.0702604129910469,
"learning_rate": 9.930187397020385e-06,
"loss": 0.5824,
"step": 1380
},
{
"epoch": 0.5528,
"grad_norm": 0.038693517446517944,
"learning_rate": 9.902263118007513e-06,
"loss": 0.0264,
"step": 1382
},
{
"epoch": 0.5536,
"grad_norm": 0.018939625471830368,
"learning_rate": 9.874339601166474e-06,
"loss": 0.0496,
"step": 1384
},
{
"epoch": 0.5544,
"grad_norm": 0.043414074927568436,
"learning_rate": 9.84641706425047e-06,
"loss": 0.0181,
"step": 1386
},
{
"epoch": 0.5552,
"grad_norm": 0.5982060432434082,
"learning_rate": 9.818495725005053e-06,
"loss": 0.0623,
"step": 1388
},
{
"epoch": 0.556,
"grad_norm": 0.13789403438568115,
"learning_rate": 9.790575801166432e-06,
"loss": 0.3926,
"step": 1390
},
{
"epoch": 0.5568,
"grad_norm": 0.029673095792531967,
"learning_rate": 9.762657510459784e-06,
"loss": 0.3696,
"step": 1392
},
{
"epoch": 0.5576,
"grad_norm": 1.172825813293457,
"learning_rate": 9.73474107059754e-06,
"loss": 0.8355,
"step": 1394
},
{
"epoch": 0.5584,
"grad_norm": 1.2738497257232666,
"learning_rate": 9.706826699277719e-06,
"loss": 0.2447,
"step": 1396
},
{
"epoch": 0.5592,
"grad_norm": 1.911895751953125,
"learning_rate": 9.678914614182185e-06,
"loss": 0.2138,
"step": 1398
},
{
"epoch": 0.56,
"grad_norm": 0.17634525895118713,
"learning_rate": 9.651005032974994e-06,
"loss": 0.0485,
"step": 1400
},
{
"epoch": 0.5608,
"grad_norm": 0.03239009529352188,
"learning_rate": 9.623098173300655e-06,
"loss": 0.0351,
"step": 1402
},
{
"epoch": 0.5616,
"grad_norm": 0.12052161246538162,
"learning_rate": 9.595194252782476e-06,
"loss": 0.0429,
"step": 1404
},
{
"epoch": 0.5624,
"grad_norm": 0.06105445697903633,
"learning_rate": 9.567293489020831e-06,
"loss": 0.0241,
"step": 1406
},
{
"epoch": 0.5632,
"grad_norm": 0.10784406960010529,
"learning_rate": 9.539396099591477e-06,
"loss": 0.0291,
"step": 1408
},
{
"epoch": 0.564,
"grad_norm": 0.5862268209457397,
"learning_rate": 9.511502302043867e-06,
"loss": 0.0746,
"step": 1410
},
{
"epoch": 0.5648,
"grad_norm": 2.016383409500122,
"learning_rate": 9.483612313899436e-06,
"loss": 0.1858,
"step": 1412
},
{
"epoch": 0.5656,
"grad_norm": 0.05418021231889725,
"learning_rate": 9.45572635264991e-06,
"loss": 0.0362,
"step": 1414
},
{
"epoch": 0.5664,
"grad_norm": 1.7044521570205688,
"learning_rate": 9.42784463575562e-06,
"loss": 0.2979,
"step": 1416
},
{
"epoch": 0.5672,
"grad_norm": 0.09087449312210083,
"learning_rate": 9.399967380643795e-06,
"loss": 0.0365,
"step": 1418
},
{
"epoch": 0.568,
"grad_norm": 0.16950923204421997,
"learning_rate": 9.372094804706867e-06,
"loss": 0.0382,
"step": 1420
},
{
"epoch": 0.5688,
"grad_norm": 0.1982814371585846,
"learning_rate": 9.344227125300788e-06,
"loss": 0.0395,
"step": 1422
},
{
"epoch": 0.5696,
"grad_norm": 1.310996413230896,
"learning_rate": 9.316364559743315e-06,
"loss": 0.5231,
"step": 1424
},
{
"epoch": 0.5704,
"grad_norm": 0.03387328237295151,
"learning_rate": 9.288507325312334e-06,
"loss": 0.6156,
"step": 1426
},
{
"epoch": 0.5712,
"grad_norm": 0.009328456595540047,
"learning_rate": 9.260655639244152e-06,
"loss": 0.0184,
"step": 1428
},
{
"epoch": 0.572,
"grad_norm": 0.07381106168031693,
"learning_rate": 9.232809718731815e-06,
"loss": 0.0261,
"step": 1430
},
{
"epoch": 0.5728,
"grad_norm": 0.050106290727853775,
"learning_rate": 9.204969780923404e-06,
"loss": 0.0132,
"step": 1432
},
{
"epoch": 0.5736,
"grad_norm": 0.010202400386333466,
"learning_rate": 9.177136042920344e-06,
"loss": 0.0187,
"step": 1434
},
{
"epoch": 0.5744,
"grad_norm": 0.15420140326023102,
"learning_rate": 9.14930872177572e-06,
"loss": 0.062,
"step": 1436
},
{
"epoch": 0.5752,
"grad_norm": 0.028073610737919807,
"learning_rate": 9.121488034492569e-06,
"loss": 0.0171,
"step": 1438
},
{
"epoch": 0.576,
"grad_norm": 0.018873965367674828,
"learning_rate": 9.093674198022201e-06,
"loss": 0.019,
"step": 1440
},
{
"epoch": 0.5768,
"grad_norm": 0.8036087155342102,
"learning_rate": 9.065867429262497e-06,
"loss": 1.608,
"step": 1442
},
{
"epoch": 0.5776,
"grad_norm": 0.19082395732402802,
"learning_rate": 9.038067945056229e-06,
"loss": 0.0394,
"step": 1444
},
{
"epoch": 0.5784,
"grad_norm": 0.07714349031448364,
"learning_rate": 9.01027596218935e-06,
"loss": 0.0772,
"step": 1446
},
{
"epoch": 0.5792,
"grad_norm": 0.06311635673046112,
"learning_rate": 8.982491697389339e-06,
"loss": 0.032,
"step": 1448
},
{
"epoch": 0.58,
"grad_norm": 0.13729214668273926,
"learning_rate": 8.954715367323468e-06,
"loss": 0.0404,
"step": 1450
},
{
"epoch": 0.5808,
"grad_norm": 0.6665918827056885,
"learning_rate": 8.926947188597133e-06,
"loss": 0.2674,
"step": 1452
},
{
"epoch": 0.5816,
"grad_norm": 0.11501887440681458,
"learning_rate": 8.89918737775218e-06,
"loss": 0.066,
"step": 1454
},
{
"epoch": 0.5824,
"grad_norm": 0.04181879013776779,
"learning_rate": 8.871436151265183e-06,
"loss": 0.0176,
"step": 1456
},
{
"epoch": 0.5832,
"grad_norm": 0.07718206197023392,
"learning_rate": 8.843693725545787e-06,
"loss": 0.3476,
"step": 1458
},
{
"epoch": 0.584,
"grad_norm": 0.17924383282661438,
"learning_rate": 8.815960316934991e-06,
"loss": 0.0852,
"step": 1460
},
{
"epoch": 0.5848,
"grad_norm": 0.016910772770643234,
"learning_rate": 8.788236141703498e-06,
"loss": 0.0112,
"step": 1462
},
{
"epoch": 0.5856,
"grad_norm": 0.20013374090194702,
"learning_rate": 8.760521416049983e-06,
"loss": 0.0349,
"step": 1464
},
{
"epoch": 0.5864,
"grad_norm": 0.0546087808907032,
"learning_rate": 8.732816356099455e-06,
"loss": 0.0229,
"step": 1466
},
{
"epoch": 0.5872,
"grad_norm": 0.1050657406449318,
"learning_rate": 8.705121177901532e-06,
"loss": 0.0342,
"step": 1468
},
{
"epoch": 0.588,
"grad_norm": 0.00835899356752634,
"learning_rate": 8.677436097428775e-06,
"loss": 0.2512,
"step": 1470
},
{
"epoch": 0.5888,
"grad_norm": 0.12563996016979218,
"learning_rate": 8.649761330575009e-06,
"loss": 0.0289,
"step": 1472
},
{
"epoch": 0.5896,
"grad_norm": 2.070195436477661,
"learning_rate": 8.62209709315362e-06,
"loss": 0.5447,
"step": 1474
},
{
"epoch": 0.5904,
"grad_norm": 0.20926696062088013,
"learning_rate": 8.594443600895892e-06,
"loss": 0.0291,
"step": 1476
},
{
"epoch": 0.5912,
"grad_norm": 0.03596784546971321,
"learning_rate": 8.566801069449307e-06,
"loss": 0.0246,
"step": 1478
},
{
"epoch": 0.592,
"grad_norm": 0.03978172317147255,
"learning_rate": 8.539169714375885e-06,
"loss": 0.1662,
"step": 1480
},
{
"epoch": 0.5928,
"grad_norm": 0.030970891937613487,
"learning_rate": 8.511549751150478e-06,
"loss": 0.1755,
"step": 1482
},
{
"epoch": 0.5936,
"grad_norm": 0.029246820136904716,
"learning_rate": 8.483941395159114e-06,
"loss": 0.0162,
"step": 1484
},
{
"epoch": 0.5944,
"grad_norm": 1.2437587976455688,
"learning_rate": 8.45634486169729e-06,
"loss": 0.2451,
"step": 1486
},
{
"epoch": 0.5952,
"grad_norm": 0.01518918015062809,
"learning_rate": 8.428760365968327e-06,
"loss": 0.0139,
"step": 1488
},
{
"epoch": 0.596,
"grad_norm": 0.058590278029441833,
"learning_rate": 8.401188123081653e-06,
"loss": 0.1158,
"step": 1490
},
{
"epoch": 0.5968,
"grad_norm": 0.09938734769821167,
"learning_rate": 8.373628348051165e-06,
"loss": 0.0162,
"step": 1492
},
{
"epoch": 0.5976,
"grad_norm": 0.221556156873703,
"learning_rate": 8.346081255793524e-06,
"loss": 0.0708,
"step": 1494
},
{
"epoch": 0.5984,
"grad_norm": 0.9571221470832825,
"learning_rate": 8.318547061126485e-06,
"loss": 0.3115,
"step": 1496
},
{
"epoch": 0.5992,
"grad_norm": 0.38105764985084534,
"learning_rate": 8.291025978767236e-06,
"loss": 0.2224,
"step": 1498
},
{
"epoch": 0.6,
"grad_norm": 1.5868642330169678,
"learning_rate": 8.263518223330698e-06,
"loss": 0.2413,
"step": 1500
},
{
"epoch": 0.6008,
"grad_norm": 0.23962773382663727,
"learning_rate": 8.236024009327879e-06,
"loss": 0.0339,
"step": 1502
},
{
"epoch": 0.6016,
"grad_norm": 0.5392745137214661,
"learning_rate": 8.208543551164178e-06,
"loss": 0.0817,
"step": 1504
},
{
"epoch": 0.6024,
"grad_norm": 0.06242334842681885,
"learning_rate": 8.181077063137733e-06,
"loss": 0.5676,
"step": 1506
},
{
"epoch": 0.6032,
"grad_norm": 0.05859392136335373,
"learning_rate": 8.153624759437733e-06,
"loss": 0.0143,
"step": 1508
},
{
"epoch": 0.604,
"grad_norm": 0.20578180253505707,
"learning_rate": 8.126186854142752e-06,
"loss": 0.0336,
"step": 1510
},
{
"epoch": 0.6048,
"grad_norm": 2.3235504627227783,
"learning_rate": 8.098763561219101e-06,
"loss": 0.5843,
"step": 1512
},
{
"epoch": 0.6056,
"grad_norm": 0.014721088111400604,
"learning_rate": 8.07135509451911e-06,
"loss": 0.0786,
"step": 1514
},
{
"epoch": 0.6064,
"grad_norm": 0.029024440795183182,
"learning_rate": 8.04396166777952e-06,
"loss": 0.0238,
"step": 1516
},
{
"epoch": 0.6072,
"grad_norm": 0.007215852849185467,
"learning_rate": 8.016583494619769e-06,
"loss": 0.0082,
"step": 1518
},
{
"epoch": 0.608,
"grad_norm": 0.8240259885787964,
"learning_rate": 7.989220788540356e-06,
"loss": 0.1647,
"step": 1520
},
{
"epoch": 0.6088,
"grad_norm": 0.3796282708644867,
"learning_rate": 7.961873762921153e-06,
"loss": 0.0807,
"step": 1522
},
{
"epoch": 0.6096,
"grad_norm": 0.03532170131802559,
"learning_rate": 7.934542631019767e-06,
"loss": 0.5003,
"step": 1524
},
{
"epoch": 0.6104,
"grad_norm": 0.018144795671105385,
"learning_rate": 7.907227605969849e-06,
"loss": 0.0202,
"step": 1526
},
{
"epoch": 0.6112,
"grad_norm": 0.06521397829055786,
"learning_rate": 7.879928900779457e-06,
"loss": 0.0214,
"step": 1528
},
{
"epoch": 0.612,
"grad_norm": 0.010673885233700275,
"learning_rate": 7.852646728329368e-06,
"loss": 0.2308,
"step": 1530
},
{
"epoch": 0.6128,
"grad_norm": 0.07566576451063156,
"learning_rate": 7.825381301371452e-06,
"loss": 0.0437,
"step": 1532
},
{
"epoch": 0.6136,
"grad_norm": 0.01657886430621147,
"learning_rate": 7.798132832526986e-06,
"loss": 0.0092,
"step": 1534
},
{
"epoch": 0.6144,
"grad_norm": 0.04421677067875862,
"learning_rate": 7.770901534284996e-06,
"loss": 0.2318,
"step": 1536
},
{
"epoch": 0.6152,
"grad_norm": 0.3701765239238739,
"learning_rate": 7.743687619000625e-06,
"loss": 0.0437,
"step": 1538
},
{
"epoch": 0.616,
"grad_norm": 0.006094436626881361,
"learning_rate": 7.716491298893443e-06,
"loss": 0.0216,
"step": 1540
},
{
"epoch": 0.6168,
"grad_norm": 0.27293258905410767,
"learning_rate": 7.689312786045823e-06,
"loss": 0.0417,
"step": 1542
},
{
"epoch": 0.6176,
"grad_norm": 0.2565436363220215,
"learning_rate": 7.662152292401265e-06,
"loss": 0.0669,
"step": 1544
},
{
"epoch": 0.6184,
"grad_norm": 0.9066870212554932,
"learning_rate": 7.635010029762755e-06,
"loss": 0.1723,
"step": 1546
},
{
"epoch": 0.6192,
"grad_norm": 0.040010981261730194,
"learning_rate": 7.6078862097911075e-06,
"loss": 0.0133,
"step": 1548
},
{
"epoch": 0.62,
"grad_norm": 0.7351746559143066,
"learning_rate": 7.580781044003324e-06,
"loss": 0.0992,
"step": 1550
},
{
"epoch": 0.6208,
"grad_norm": 0.6215207576751709,
"learning_rate": 7.553694743770928e-06,
"loss": 0.6671,
"step": 1552
},
{
"epoch": 0.6216,
"grad_norm": 0.0056650955229997635,
"learning_rate": 7.526627520318329e-06,
"loss": 0.0339,
"step": 1554
},
{
"epoch": 0.6224,
"grad_norm": 0.008909706026315689,
"learning_rate": 7.49957958472118e-06,
"loss": 0.0059,
"step": 1556
},
{
"epoch": 0.6232,
"grad_norm": 1.9135671854019165,
"learning_rate": 7.472551147904708e-06,
"loss": 0.9156,
"step": 1558
},
{
"epoch": 0.624,
"grad_norm": 0.7585664391517639,
"learning_rate": 7.445542420642097e-06,
"loss": 0.103,
"step": 1560
},
{
"epoch": 0.6248,
"grad_norm": 0.07106231898069382,
"learning_rate": 7.418553613552824e-06,
"loss": 0.0217,
"step": 1562
},
{
"epoch": 0.6256,
"grad_norm": 1.5444700717926025,
"learning_rate": 7.391584937101034e-06,
"loss": 0.1937,
"step": 1564
},
{
"epoch": 0.6264,
"grad_norm": 0.006505718920379877,
"learning_rate": 7.364636601593875e-06,
"loss": 0.014,
"step": 1566
},
{
"epoch": 0.6272,
"grad_norm": 0.3871181309223175,
"learning_rate": 7.33770881717989e-06,
"loss": 0.048,
"step": 1568
},
{
"epoch": 0.628,
"grad_norm": 0.03062368370592594,
"learning_rate": 7.310801793847344e-06,
"loss": 0.0133,
"step": 1570
},
{
"epoch": 0.6288,
"grad_norm": 0.2106819897890091,
"learning_rate": 7.283915741422611e-06,
"loss": 0.2936,
"step": 1572
},
{
"epoch": 0.6296,
"grad_norm": 0.9387131929397583,
"learning_rate": 7.257050869568536e-06,
"loss": 0.3943,
"step": 1574
},
{
"epoch": 0.6304,
"grad_norm": 0.18858036398887634,
"learning_rate": 7.2302073877827775e-06,
"loss": 0.2703,
"step": 1576
},
{
"epoch": 0.6312,
"grad_norm": 0.34716084599494934,
"learning_rate": 7.203385505396203e-06,
"loss": 0.0523,
"step": 1578
},
{
"epoch": 0.632,
"grad_norm": 1.6694183349609375,
"learning_rate": 7.176585431571235e-06,
"loss": 0.4431,
"step": 1580
},
{
"epoch": 0.6328,
"grad_norm": 0.07224322855472565,
"learning_rate": 7.149807375300239e-06,
"loss": 0.1626,
"step": 1582
},
{
"epoch": 0.6336,
"grad_norm": 0.03537129983305931,
"learning_rate": 7.123051545403874e-06,
"loss": 0.0113,
"step": 1584
},
{
"epoch": 0.6344,
"grad_norm": 0.0862552747130394,
"learning_rate": 7.096318150529476e-06,
"loss": 0.1374,
"step": 1586
},
{
"epoch": 0.6352,
"grad_norm": 0.3428609073162079,
"learning_rate": 7.069607399149427e-06,
"loss": 0.0467,
"step": 1588
},
{
"epoch": 0.636,
"grad_norm": 0.8518670201301575,
"learning_rate": 7.042919499559538e-06,
"loss": 0.179,
"step": 1590
},
{
"epoch": 0.6368,
"grad_norm": 0.017964590340852737,
"learning_rate": 7.016254659877398e-06,
"loss": 0.0767,
"step": 1592
},
{
"epoch": 0.6376,
"grad_norm": 0.042712751775979996,
"learning_rate": 6.9896130880407965e-06,
"loss": 0.0153,
"step": 1594
},
{
"epoch": 0.6384,
"grad_norm": 0.3006531894207001,
"learning_rate": 6.962994991806059e-06,
"loss": 0.0977,
"step": 1596
},
{
"epoch": 0.6392,
"grad_norm": 0.010738243348896503,
"learning_rate": 6.9364005787464406e-06,
"loss": 0.1063,
"step": 1598
},
{
"epoch": 0.64,
"grad_norm": 0.017853038385510445,
"learning_rate": 6.909830056250527e-06,
"loss": 0.1487,
"step": 1600
},
{
"epoch": 0.6408,
"grad_norm": 0.38184410333633423,
"learning_rate": 6.883283631520582e-06,
"loss": 0.0403,
"step": 1602
},
{
"epoch": 0.6416,
"grad_norm": 0.026657408103346825,
"learning_rate": 6.856761511570963e-06,
"loss": 0.0139,
"step": 1604
},
{
"epoch": 0.6424,
"grad_norm": 0.04037747532129288,
"learning_rate": 6.830263903226483e-06,
"loss": 0.0218,
"step": 1606
},
{
"epoch": 0.6432,
"grad_norm": 0.07742556184530258,
"learning_rate": 6.803791013120822e-06,
"loss": 0.0117,
"step": 1608
},
{
"epoch": 0.644,
"grad_norm": 1.956659197807312,
"learning_rate": 6.777343047694891e-06,
"loss": 0.3377,
"step": 1610
},
{
"epoch": 0.6448,
"grad_norm": 0.04879617318511009,
"learning_rate": 6.750920213195238e-06,
"loss": 0.1139,
"step": 1612
},
{
"epoch": 0.6456,
"grad_norm": 3.4786341190338135,
"learning_rate": 6.7245227156724324e-06,
"loss": 0.0851,
"step": 1614
},
{
"epoch": 0.6464,
"grad_norm": 0.018991775810718536,
"learning_rate": 6.698150760979463e-06,
"loss": 0.0136,
"step": 1616
},
{
"epoch": 0.6472,
"grad_norm": 1.7258912324905396,
"learning_rate": 6.671804554770135e-06,
"loss": 0.3864,
"step": 1618
},
{
"epoch": 0.648,
"grad_norm": 0.27930495142936707,
"learning_rate": 6.645484302497452e-06,
"loss": 0.0221,
"step": 1620
},
{
"epoch": 0.6488,
"grad_norm": 0.023989146575331688,
"learning_rate": 6.6191902094120295e-06,
"loss": 0.0198,
"step": 1622
},
{
"epoch": 0.6496,
"grad_norm": 0.026416273787617683,
"learning_rate": 6.5929224805604845e-06,
"loss": 0.0195,
"step": 1624
},
{
"epoch": 0.6504,
"grad_norm": 0.021918591111898422,
"learning_rate": 6.566681320783849e-06,
"loss": 0.0088,
"step": 1626
},
{
"epoch": 0.6512,
"grad_norm": 0.005978676024824381,
"learning_rate": 6.540466934715953e-06,
"loss": 0.0158,
"step": 1628
},
{
"epoch": 0.652,
"grad_norm": 0.0690799206495285,
"learning_rate": 6.5142795267818505e-06,
"loss": 0.0426,
"step": 1630
},
{
"epoch": 0.6528,
"grad_norm": 0.06811723858118057,
"learning_rate": 6.488119301196201e-06,
"loss": 0.0474,
"step": 1632
},
{
"epoch": 0.6536,
"grad_norm": 0.3594679534435272,
"learning_rate": 6.461986461961706e-06,
"loss": 0.0359,
"step": 1634
},
{
"epoch": 0.6544,
"grad_norm": 0.00533846952021122,
"learning_rate": 6.435881212867494e-06,
"loss": 0.0055,
"step": 1636
},
{
"epoch": 0.6552,
"grad_norm": 0.4549589455127716,
"learning_rate": 6.409803757487539e-06,
"loss": 0.0631,
"step": 1638
},
{
"epoch": 0.656,
"grad_norm": 0.02013424225151539,
"learning_rate": 6.383754299179079e-06,
"loss": 0.0046,
"step": 1640
},
{
"epoch": 0.6568,
"grad_norm": 2.2123377323150635,
"learning_rate": 6.357733041081018e-06,
"loss": 0.482,
"step": 1642
},
{
"epoch": 0.6576,
"grad_norm": 0.0015303940745070577,
"learning_rate": 6.33174018611236e-06,
"loss": 0.0028,
"step": 1644
},
{
"epoch": 0.6584,
"grad_norm": 0.2125168740749359,
"learning_rate": 6.305775936970606e-06,
"loss": 0.019,
"step": 1646
},
{
"epoch": 0.6592,
"grad_norm": 1.2157135009765625,
"learning_rate": 6.27984049613019e-06,
"loss": 0.4542,
"step": 1648
},
{
"epoch": 0.66,
"grad_norm": 0.030942708253860474,
"learning_rate": 6.25393406584088e-06,
"loss": 0.0049,
"step": 1650
},
{
"epoch": 0.6608,
"grad_norm": 0.0280720554292202,
"learning_rate": 6.228056848126236e-06,
"loss": 0.0107,
"step": 1652
},
{
"epoch": 0.6616,
"grad_norm": 0.6233726739883423,
"learning_rate": 6.202209044781991e-06,
"loss": 0.0566,
"step": 1654
},
{
"epoch": 0.6624,
"grad_norm": 0.014730525203049183,
"learning_rate": 6.176390857374508e-06,
"loss": 0.0088,
"step": 1656
},
{
"epoch": 0.6632,
"grad_norm": 1.784373164176941,
"learning_rate": 6.150602487239207e-06,
"loss": 0.7035,
"step": 1658
},
{
"epoch": 0.664,
"grad_norm": 0.12886707484722137,
"learning_rate": 6.124844135478971e-06,
"loss": 0.0122,
"step": 1660
},
{
"epoch": 0.6648,
"grad_norm": 0.04210919514298439,
"learning_rate": 6.099116002962604e-06,
"loss": 0.1507,
"step": 1662
},
{
"epoch": 0.6656,
"grad_norm": 0.05602734535932541,
"learning_rate": 6.073418290323251e-06,
"loss": 0.0145,
"step": 1664
},
{
"epoch": 0.6664,
"grad_norm": 0.04478934034705162,
"learning_rate": 6.047751197956838e-06,
"loss": 0.0079,
"step": 1666
},
{
"epoch": 0.6672,
"grad_norm": 0.010290348902344704,
"learning_rate": 6.022114926020504e-06,
"loss": 0.0495,
"step": 1668
},
{
"epoch": 0.668,
"grad_norm": 0.01751306839287281,
"learning_rate": 5.996509674431053e-06,
"loss": 0.0056,
"step": 1670
},
{
"epoch": 0.6688,
"grad_norm": 0.1504068821668625,
"learning_rate": 5.970935642863375e-06,
"loss": 0.0234,
"step": 1672
},
{
"epoch": 0.6696,
"grad_norm": 0.0935196727514267,
"learning_rate": 5.94539303074891e-06,
"loss": 0.0214,
"step": 1674
},
{
"epoch": 0.6704,
"grad_norm": 1.2794703245162964,
"learning_rate": 5.9198820372740726e-06,
"loss": 0.8835,
"step": 1676
},
{
"epoch": 0.6712,
"grad_norm": 0.028086047619581223,
"learning_rate": 5.894402861378721e-06,
"loss": 0.0076,
"step": 1678
},
{
"epoch": 0.672,
"grad_norm": 0.2483760118484497,
"learning_rate": 5.868955701754584e-06,
"loss": 0.5054,
"step": 1680
},
{
"epoch": 0.6728,
"grad_norm": 0.016343148425221443,
"learning_rate": 5.843540756843722e-06,
"loss": 0.1617,
"step": 1682
},
{
"epoch": 0.6736,
"grad_norm": 0.05262403190135956,
"learning_rate": 5.818158224836987e-06,
"loss": 0.0523,
"step": 1684
},
{
"epoch": 0.6744,
"grad_norm": 0.11089430004358292,
"learning_rate": 5.792808303672454e-06,
"loss": 0.4788,
"step": 1686
},
{
"epoch": 0.6752,
"grad_norm": 0.0768204778432846,
"learning_rate": 5.7674911910339094e-06,
"loss": 0.0263,
"step": 1688
},
{
"epoch": 0.676,
"grad_norm": 0.7316690683364868,
"learning_rate": 5.742207084349274e-06,
"loss": 0.064,
"step": 1690
},
{
"epoch": 0.6768,
"grad_norm": 0.12514914572238922,
"learning_rate": 5.716956180789098e-06,
"loss": 0.0287,
"step": 1692
},
{
"epoch": 0.6776,
"grad_norm": 0.12199829518795013,
"learning_rate": 5.691738677265e-06,
"loss": 0.1082,
"step": 1694
},
{
"epoch": 0.6784,
"grad_norm": 0.15735796093940735,
"learning_rate": 5.666554770428129e-06,
"loss": 0.0281,
"step": 1696
},
{
"epoch": 0.6792,
"grad_norm": 2.5249431133270264,
"learning_rate": 5.641404656667661e-06,
"loss": 0.8569,
"step": 1698
},
{
"epoch": 0.68,
"grad_norm": 0.12266776710748672,
"learning_rate": 5.616288532109225e-06,
"loss": 0.0213,
"step": 1700
},
{
"epoch": 0.6808,
"grad_norm": 0.07756412774324417,
"learning_rate": 5.591206592613416e-06,
"loss": 0.02,
"step": 1702
},
{
"epoch": 0.6816,
"grad_norm": 0.025011537596583366,
"learning_rate": 5.5661590337742255e-06,
"loss": 0.0081,
"step": 1704
},
{
"epoch": 0.6824,
"grad_norm": 0.24261169135570526,
"learning_rate": 5.5411460509175605e-06,
"loss": 0.0702,
"step": 1706
},
{
"epoch": 0.6832,
"grad_norm": 0.04961364343762398,
"learning_rate": 5.516167839099679e-06,
"loss": 0.0143,
"step": 1708
},
{
"epoch": 0.684,
"grad_norm": 0.0382879376411438,
"learning_rate": 5.491224593105695e-06,
"loss": 0.0174,
"step": 1710
},
{
"epoch": 0.6848,
"grad_norm": 0.024480927735567093,
"learning_rate": 5.466316507448049e-06,
"loss": 0.021,
"step": 1712
},
{
"epoch": 0.6856,
"grad_norm": 0.355844646692276,
"learning_rate": 5.441443776365003e-06,
"loss": 0.0201,
"step": 1714
},
{
"epoch": 0.6864,
"grad_norm": 0.10836853832006454,
"learning_rate": 5.416606593819102e-06,
"loss": 0.018,
"step": 1716
},
{
"epoch": 0.6872,
"grad_norm": 0.1412193775177002,
"learning_rate": 5.391805153495693e-06,
"loss": 0.0284,
"step": 1718
},
{
"epoch": 0.688,
"grad_norm": 0.15320326387882233,
"learning_rate": 5.367039648801386e-06,
"loss": 0.0345,
"step": 1720
},
{
"epoch": 0.6888,
"grad_norm": 0.07995325326919556,
"learning_rate": 5.342310272862558e-06,
"loss": 0.0102,
"step": 1722
},
{
"epoch": 0.6896,
"grad_norm": 0.04907960444688797,
"learning_rate": 5.317617218523856e-06,
"loss": 0.0391,
"step": 1724
},
{
"epoch": 0.6904,
"grad_norm": 1.2691978216171265,
"learning_rate": 5.292960678346674e-06,
"loss": 0.402,
"step": 1726
},
{
"epoch": 0.6912,
"grad_norm": 0.008955995552241802,
"learning_rate": 5.26834084460767e-06,
"loss": 0.0053,
"step": 1728
},
{
"epoch": 0.692,
"grad_norm": 0.25324296951293945,
"learning_rate": 5.243757909297247e-06,
"loss": 0.035,
"step": 1730
},
{
"epoch": 0.6928,
"grad_norm": 0.026979120448231697,
"learning_rate": 5.219212064118079e-06,
"loss": 0.0089,
"step": 1732
},
{
"epoch": 0.6936,
"grad_norm": 0.015840429812669754,
"learning_rate": 5.194703500483593e-06,
"loss": 0.0049,
"step": 1734
},
{
"epoch": 0.6944,
"grad_norm": 1.4288071393966675,
"learning_rate": 5.1702324095164955e-06,
"loss": 0.899,
"step": 1736
},
{
"epoch": 0.6952,
"grad_norm": 1.0387349128723145,
"learning_rate": 5.145798982047261e-06,
"loss": 0.7296,
"step": 1738
},
{
"epoch": 0.696,
"grad_norm": 0.4432249963283539,
"learning_rate": 5.121403408612672e-06,
"loss": 0.0525,
"step": 1740
},
{
"epoch": 0.6968,
"grad_norm": 0.021967697888612747,
"learning_rate": 5.0970458794543135e-06,
"loss": 0.0092,
"step": 1742
},
{
"epoch": 0.6976,
"grad_norm": 1.2726808786392212,
"learning_rate": 5.072726584517086e-06,
"loss": 0.1722,
"step": 1744
},
{
"epoch": 0.6984,
"grad_norm": 0.01904178597033024,
"learning_rate": 5.048445713447738e-06,
"loss": 0.0209,
"step": 1746
},
{
"epoch": 0.6992,
"grad_norm": 0.5619332194328308,
"learning_rate": 5.024203455593375e-06,
"loss": 0.769,
"step": 1748
},
{
"epoch": 0.7,
"grad_norm": 1.2965996265411377,
"learning_rate": 5.000000000000003e-06,
"loss": 0.1219,
"step": 1750
},
{
"epoch": 0.7008,
"grad_norm": 0.08266191929578781,
"learning_rate": 4.97583553541102e-06,
"loss": 0.0208,
"step": 1752
},
{
"epoch": 0.7016,
"grad_norm": 0.016945960000157356,
"learning_rate": 4.951710250265785e-06,
"loss": 0.0105,
"step": 1754
},
{
"epoch": 0.7024,
"grad_norm": 0.06696458905935287,
"learning_rate": 4.927624332698109e-06,
"loss": 0.0239,
"step": 1756
},
{
"epoch": 0.7032,
"grad_norm": 0.024396728724241257,
"learning_rate": 4.903577970534823e-06,
"loss": 0.0168,
"step": 1758
},
{
"epoch": 0.704,
"grad_norm": 0.3054494857788086,
"learning_rate": 4.879571351294287e-06,
"loss": 0.0534,
"step": 1760
},
{
"epoch": 0.7048,
"grad_norm": 0.038238685578107834,
"learning_rate": 4.855604662184935e-06,
"loss": 0.3647,
"step": 1762
},
{
"epoch": 0.7056,
"grad_norm": 0.23634418845176697,
"learning_rate": 4.831678090103832e-06,
"loss": 0.0938,
"step": 1764
},
{
"epoch": 0.7064,
"grad_norm": 0.15138259530067444,
"learning_rate": 4.807791821635186e-06,
"loss": 0.0674,
"step": 1766
},
{
"epoch": 0.7072,
"grad_norm": 0.24614623188972473,
"learning_rate": 4.783946043048922e-06,
"loss": 0.0925,
"step": 1768
},
{
"epoch": 0.708,
"grad_norm": 2.123792886734009,
"learning_rate": 4.76014094029921e-06,
"loss": 0.9384,
"step": 1770
},
{
"epoch": 0.7088,
"grad_norm": 0.10990063846111298,
"learning_rate": 4.736376699023023e-06,
"loss": 0.0253,
"step": 1772
},
{
"epoch": 0.7096,
"grad_norm": 0.3349180221557617,
"learning_rate": 4.712653504538684e-06,
"loss": 0.195,
"step": 1774
},
{
"epoch": 0.7104,
"grad_norm": 0.20643967390060425,
"learning_rate": 4.688971541844436e-06,
"loss": 0.0572,
"step": 1776
},
{
"epoch": 0.7112,
"grad_norm": 0.09784634411334991,
"learning_rate": 4.6653309956169745e-06,
"loss": 0.0524,
"step": 1778
},
{
"epoch": 0.712,
"grad_norm": 0.34553948044776917,
"learning_rate": 4.641732050210032e-06,
"loss": 0.0434,
"step": 1780
},
{
"epoch": 0.7128,
"grad_norm": 1.177024483680725,
"learning_rate": 4.618174889652928e-06,
"loss": 0.2077,
"step": 1782
},
{
"epoch": 0.7136,
"grad_norm": 0.10425437986850739,
"learning_rate": 4.59465969764913e-06,
"loss": 0.0273,
"step": 1784
},
{
"epoch": 0.7144,
"grad_norm": 0.09616199135780334,
"learning_rate": 4.571186657574828e-06,
"loss": 0.0605,
"step": 1786
},
{
"epoch": 0.7152,
"grad_norm": 0.3832583427429199,
"learning_rate": 4.5477559524775e-06,
"loss": 0.0672,
"step": 1788
},
{
"epoch": 0.716,
"grad_norm": 0.02167525887489319,
"learning_rate": 4.524367765074499e-06,
"loss": 0.0105,
"step": 1790
},
{
"epoch": 0.7168,
"grad_norm": 0.04896343871951103,
"learning_rate": 4.501022277751602e-06,
"loss": 0.0176,
"step": 1792
},
{
"epoch": 0.7176,
"grad_norm": 0.03762371465563774,
"learning_rate": 4.477719672561615e-06,
"loss": 0.0242,
"step": 1794
},
{
"epoch": 0.7184,
"grad_norm": 3.2954201698303223,
"learning_rate": 4.4544601312229295e-06,
"loss": 0.3976,
"step": 1796
},
{
"epoch": 0.7192,
"grad_norm": 2.2384305000305176,
"learning_rate": 4.4312438351181246e-06,
"loss": 0.2907,
"step": 1798
},
{
"epoch": 0.72,
"grad_norm": 0.017522338777780533,
"learning_rate": 4.408070965292534e-06,
"loss": 0.4971,
"step": 1800
},
{
"epoch": 0.7208,
"grad_norm": 0.04784021154046059,
"learning_rate": 4.384941702452856e-06,
"loss": 0.6106,
"step": 1802
},
{
"epoch": 0.7216,
"grad_norm": 1.3355712890625,
"learning_rate": 4.361856226965733e-06,
"loss": 0.1558,
"step": 1804
},
{
"epoch": 0.7224,
"grad_norm": 0.026937812566757202,
"learning_rate": 4.338814718856333e-06,
"loss": 0.0143,
"step": 1806
},
{
"epoch": 0.7232,
"grad_norm": 0.14190848171710968,
"learning_rate": 4.315817357806974e-06,
"loss": 0.4526,
"step": 1808
},
{
"epoch": 0.724,
"grad_norm": 0.10695420950651169,
"learning_rate": 4.292864323155684e-06,
"loss": 0.0318,
"step": 1810
},
{
"epoch": 0.7248,
"grad_norm": 0.988207221031189,
"learning_rate": 4.26995579389485e-06,
"loss": 0.1699,
"step": 1812
},
{
"epoch": 0.7256,
"grad_norm": 0.03641045466065407,
"learning_rate": 4.247091948669775e-06,
"loss": 0.0399,
"step": 1814
},
{
"epoch": 0.7264,
"grad_norm": 0.032800428569316864,
"learning_rate": 4.224272965777326e-06,
"loss": 0.015,
"step": 1816
},
{
"epoch": 0.7272,
"grad_norm": 0.0268462635576725,
"learning_rate": 4.201499023164508e-06,
"loss": 0.0272,
"step": 1818
},
{
"epoch": 0.728,
"grad_norm": 0.05160361900925636,
"learning_rate": 4.178770298427107e-06,
"loss": 0.068,
"step": 1820
},
{
"epoch": 0.7288,
"grad_norm": 1.727393627166748,
"learning_rate": 4.15608696880828e-06,
"loss": 0.3799,
"step": 1822
},
{
"epoch": 0.7296,
"grad_norm": 0.08573580533266068,
"learning_rate": 4.133449211197188e-06,
"loss": 0.0329,
"step": 1824
},
{
"epoch": 0.7304,
"grad_norm": 0.025401653721928596,
"learning_rate": 4.110857202127615e-06,
"loss": 0.0221,
"step": 1826
},
{
"epoch": 0.7312,
"grad_norm": 0.37113156914711,
"learning_rate": 4.08831111777658e-06,
"loss": 0.0818,
"step": 1828
},
{
"epoch": 0.732,
"grad_norm": 0.06260918080806732,
"learning_rate": 4.065811133962987e-06,
"loss": 0.0284,
"step": 1830
},
{
"epoch": 0.7328,
"grad_norm": 0.09040958434343338,
"learning_rate": 4.04335742614622e-06,
"loss": 0.0748,
"step": 1832
},
{
"epoch": 0.7336,
"grad_norm": 0.14594818651676178,
"learning_rate": 4.020950169424815e-06,
"loss": 0.0702,
"step": 1834
},
{
"epoch": 0.7344,
"grad_norm": 0.03950029984116554,
"learning_rate": 3.998589538535046e-06,
"loss": 0.0142,
"step": 1836
},
{
"epoch": 0.7352,
"grad_norm": 0.07695923000574112,
"learning_rate": 3.976275707849616e-06,
"loss": 0.0313,
"step": 1838
},
{
"epoch": 0.736,
"grad_norm": 0.02458553947508335,
"learning_rate": 3.954008851376252e-06,
"loss": 0.0769,
"step": 1840
},
{
"epoch": 0.7368,
"grad_norm": 0.6668182611465454,
"learning_rate": 3.931789142756377e-06,
"loss": 0.1822,
"step": 1842
},
{
"epoch": 0.7376,
"grad_norm": 1.2924126386642456,
"learning_rate": 3.9096167552637454e-06,
"loss": 0.1353,
"step": 1844
},
{
"epoch": 0.7384,
"grad_norm": 0.021815890446305275,
"learning_rate": 3.887491861803085e-06,
"loss": 0.0674,
"step": 1846
},
{
"epoch": 0.7392,
"grad_norm": 0.043568458408117294,
"learning_rate": 3.86541463490876e-06,
"loss": 0.0284,
"step": 1848
},
{
"epoch": 0.74,
"grad_norm": 0.013898522593080997,
"learning_rate": 3.8433852467434175e-06,
"loss": 0.0049,
"step": 1850
},
{
"epoch": 0.7408,
"grad_norm": 0.03106304258108139,
"learning_rate": 3.821403869096658e-06,
"loss": 0.2808,
"step": 1852
},
{
"epoch": 0.7416,
"grad_norm": 0.0395389050245285,
"learning_rate": 3.7994706733836738e-06,
"loss": 0.0115,
"step": 1854
},
{
"epoch": 0.7424,
"grad_norm": 0.027892421931028366,
"learning_rate": 3.7775858306439374e-06,
"loss": 0.0126,
"step": 1856
},
{
"epoch": 0.7432,
"grad_norm": 0.04609684646129608,
"learning_rate": 3.7557495115398446e-06,
"loss": 0.0117,
"step": 1858
},
{
"epoch": 0.744,
"grad_norm": 0.218703493475914,
"learning_rate": 3.7339618863553983e-06,
"loss": 0.042,
"step": 1860
},
{
"epoch": 0.7448,
"grad_norm": 0.03932720422744751,
"learning_rate": 3.7122231249948747e-06,
"loss": 0.0253,
"step": 1862
},
{
"epoch": 0.7456,
"grad_norm": 0.04857787489891052,
"learning_rate": 3.6905333969815038e-06,
"loss": 0.0178,
"step": 1864
},
{
"epoch": 0.7464,
"grad_norm": 0.12067477405071259,
"learning_rate": 3.6688928714561444e-06,
"loss": 0.0707,
"step": 1866
},
{
"epoch": 0.7472,
"grad_norm": 2.37345814704895,
"learning_rate": 3.6473017171759563e-06,
"loss": 0.4737,
"step": 1868
},
{
"epoch": 0.748,
"grad_norm": 0.13440009951591492,
"learning_rate": 3.625760102513103e-06,
"loss": 0.0347,
"step": 1870
},
{
"epoch": 0.7488,
"grad_norm": 3.3364574909210205,
"learning_rate": 3.604268195453421e-06,
"loss": 0.1831,
"step": 1872
},
{
"epoch": 0.7496,
"grad_norm": 0.6941204071044922,
"learning_rate": 3.582826163595119e-06,
"loss": 0.0828,
"step": 1874
},
{
"epoch": 0.7504,
"grad_norm": 2.122856616973877,
"learning_rate": 3.5614341741474633e-06,
"loss": 0.6934,
"step": 1876
},
{
"epoch": 0.7512,
"grad_norm": 1.272242546081543,
"learning_rate": 3.540092393929494e-06,
"loss": 0.1293,
"step": 1878
},
{
"epoch": 0.752,
"grad_norm": 0.18253956735134125,
"learning_rate": 3.5188009893686916e-06,
"loss": 0.3014,
"step": 1880
},
{
"epoch": 0.7528,
"grad_norm": 0.1802951991558075,
"learning_rate": 3.4975601264997094e-06,
"loss": 0.0245,
"step": 1882
},
{
"epoch": 0.7536,
"grad_norm": 0.06385163217782974,
"learning_rate": 3.476369970963072e-06,
"loss": 0.3051,
"step": 1884
},
{
"epoch": 0.7544,
"grad_norm": 0.04664032533764839,
"learning_rate": 3.455230688003852e-06,
"loss": 0.0243,
"step": 1886
},
{
"epoch": 0.7552,
"grad_norm": 0.09623868763446808,
"learning_rate": 3.4341424424704373e-06,
"loss": 0.0285,
"step": 1888
},
{
"epoch": 0.756,
"grad_norm": 0.12530668079853058,
"learning_rate": 3.4131053988131947e-06,
"loss": 0.1282,
"step": 1890
},
{
"epoch": 0.7568,
"grad_norm": 0.03179166465997696,
"learning_rate": 3.3921197210832235e-06,
"loss": 0.0231,
"step": 1892
},
{
"epoch": 0.7576,
"grad_norm": 0.3028177320957184,
"learning_rate": 3.3711855729310482e-06,
"loss": 0.0521,
"step": 1894
},
{
"epoch": 0.7584,
"grad_norm": 0.01785534806549549,
"learning_rate": 3.3503031176053657e-06,
"loss": 0.0962,
"step": 1896
},
{
"epoch": 0.7592,
"grad_norm": 0.4004635214805603,
"learning_rate": 3.3294725179517573e-06,
"loss": 0.0461,
"step": 1898
},
{
"epoch": 0.76,
"grad_norm": 0.02001468650996685,
"learning_rate": 3.308693936411421e-06,
"loss": 0.0133,
"step": 1900
},
{
"epoch": 0.7608,
"grad_norm": 0.019235266372561455,
"learning_rate": 3.287967535019908e-06,
"loss": 0.006,
"step": 1902
},
{
"epoch": 0.7616,
"grad_norm": 0.0368565134704113,
"learning_rate": 3.2672934754058615e-06,
"loss": 0.0194,
"step": 1904
},
{
"epoch": 0.7624,
"grad_norm": 0.045688826590776443,
"learning_rate": 3.2466719187897555e-06,
"loss": 0.0148,
"step": 1906
},
{
"epoch": 0.7632,
"grad_norm": 0.1634262502193451,
"learning_rate": 3.2261030259826287e-06,
"loss": 0.0357,
"step": 1908
},
{
"epoch": 0.764,
"grad_norm": 0.03429366648197174,
"learning_rate": 3.2055869573848374e-06,
"loss": 0.1542,
"step": 1910
},
{
"epoch": 0.7648,
"grad_norm": 0.04100421443581581,
"learning_rate": 3.1851238729848033e-06,
"loss": 0.0103,
"step": 1912
},
{
"epoch": 0.7656,
"grad_norm": 0.07986348867416382,
"learning_rate": 3.164713932357776e-06,
"loss": 0.0197,
"step": 1914
},
{
"epoch": 0.7664,
"grad_norm": 0.015995606780052185,
"learning_rate": 3.144357294664565e-06,
"loss": 0.0145,
"step": 1916
},
{
"epoch": 0.7672,
"grad_norm": 1.615556240081787,
"learning_rate": 3.124054118650327e-06,
"loss": 0.7627,
"step": 1918
},
{
"epoch": 0.768,
"grad_norm": 0.1365402489900589,
"learning_rate": 3.103804562643302e-06,
"loss": 0.04,
"step": 1920
},
{
"epoch": 0.7688,
"grad_norm": 0.2867192327976227,
"learning_rate": 3.0836087845536e-06,
"loss": 0.0462,
"step": 1922
},
{
"epoch": 0.7696,
"grad_norm": 1.3241535425186157,
"learning_rate": 3.063466941871952e-06,
"loss": 0.6603,
"step": 1924
},
{
"epoch": 0.7704,
"grad_norm": 0.1195574626326561,
"learning_rate": 3.043379191668492e-06,
"loss": 0.3676,
"step": 1926
},
{
"epoch": 0.7712,
"grad_norm": 1.3540464639663696,
"learning_rate": 3.023345690591537e-06,
"loss": 0.0907,
"step": 1928
},
{
"epoch": 0.772,
"grad_norm": 0.03964836522936821,
"learning_rate": 3.003366594866345e-06,
"loss": 0.1992,
"step": 1930
},
{
"epoch": 0.7728,
"grad_norm": 0.16762642562389374,
"learning_rate": 2.983442060293926e-06,
"loss": 0.182,
"step": 1932
},
{
"epoch": 0.7736,
"grad_norm": 0.02284320630133152,
"learning_rate": 2.963572242249799e-06,
"loss": 0.0122,
"step": 1934
},
{
"epoch": 0.7744,
"grad_norm": 0.05577899515628815,
"learning_rate": 2.9437572956827965e-06,
"loss": 0.039,
"step": 1936
},
{
"epoch": 0.7752,
"grad_norm": 0.08313470333814621,
"learning_rate": 2.9239973751138495e-06,
"loss": 0.0276,
"step": 1938
},
{
"epoch": 0.776,
"grad_norm": 0.06292706727981567,
"learning_rate": 2.9042926346347932e-06,
"loss": 0.0117,
"step": 1940
},
{
"epoch": 0.7768,
"grad_norm": 2.362842559814453,
"learning_rate": 2.884643227907147e-06,
"loss": 1.182,
"step": 1942
},
{
"epoch": 0.7776,
"grad_norm": 1.178203821182251,
"learning_rate": 2.8650493081609344e-06,
"loss": 0.312,
"step": 1944
},
{
"epoch": 0.7784,
"grad_norm": 0.3593979775905609,
"learning_rate": 2.8455110281934804e-06,
"loss": 0.0304,
"step": 1946
},
{
"epoch": 0.7792,
"grad_norm": 0.2723308205604553,
"learning_rate": 2.8260285403682153e-06,
"loss": 0.0387,
"step": 1948
},
{
"epoch": 0.78,
"grad_norm": 0.010247028432786465,
"learning_rate": 2.8066019966134907e-06,
"loss": 0.0151,
"step": 1950
},
{
"epoch": 0.7808,
"grad_norm": 1.3679031133651733,
"learning_rate": 2.7872315484213954e-06,
"loss": 0.1826,
"step": 1952
},
{
"epoch": 0.7816,
"grad_norm": 0.024432742968201637,
"learning_rate": 2.7679173468465813e-06,
"loss": 0.0401,
"step": 1954
},
{
"epoch": 0.7824,
"grad_norm": 0.026111546903848648,
"learning_rate": 2.7486595425050667e-06,
"loss": 0.2,
"step": 1956
},
{
"epoch": 0.7832,
"grad_norm": 1.0953105688095093,
"learning_rate": 2.7294582855730835e-06,
"loss": 0.4462,
"step": 1958
},
{
"epoch": 0.784,
"grad_norm": 2.070159673690796,
"learning_rate": 2.7103137257858867e-06,
"loss": 0.5674,
"step": 1960
},
{
"epoch": 0.7848,
"grad_norm": 0.26604995131492615,
"learning_rate": 2.6912260124366007e-06,
"loss": 0.0452,
"step": 1962
},
{
"epoch": 0.7856,
"grad_norm": 0.9278347492218018,
"learning_rate": 2.672195294375045e-06,
"loss": 0.4562,
"step": 1964
},
{
"epoch": 0.7864,
"grad_norm": 0.44804051518440247,
"learning_rate": 2.6532217200065856e-06,
"loss": 0.1851,
"step": 1966
},
{
"epoch": 0.7872,
"grad_norm": 1.6803971529006958,
"learning_rate": 2.634305437290968e-06,
"loss": 0.4855,
"step": 1968
},
{
"epoch": 0.788,
"grad_norm": 0.14506347477436066,
"learning_rate": 2.615446593741161e-06,
"loss": 0.0346,
"step": 1970
},
{
"epoch": 0.7888,
"grad_norm": 0.01994413323700428,
"learning_rate": 2.596645336422219e-06,
"loss": 0.0191,
"step": 1972
},
{
"epoch": 0.7896,
"grad_norm": 1.5493515729904175,
"learning_rate": 2.577901811950121e-06,
"loss": 0.4661,
"step": 1974
},
{
"epoch": 0.7904,
"grad_norm": 0.20212967693805695,
"learning_rate": 2.5592161664906366e-06,
"loss": 0.1535,
"step": 1976
},
{
"epoch": 0.7912,
"grad_norm": 0.20568671822547913,
"learning_rate": 2.5405885457581793e-06,
"loss": 0.0685,
"step": 1978
},
{
"epoch": 0.792,
"grad_norm": 0.2657265365123749,
"learning_rate": 2.522019095014683e-06,
"loss": 0.5426,
"step": 1980
},
{
"epoch": 0.7928,
"grad_norm": 0.39118292927742004,
"learning_rate": 2.5035079590684496e-06,
"loss": 0.0688,
"step": 1982
},
{
"epoch": 0.7936,
"grad_norm": 0.2671131193637848,
"learning_rate": 2.48505528227304e-06,
"loss": 0.0788,
"step": 1984
},
{
"epoch": 0.7944,
"grad_norm": 0.055703867226839066,
"learning_rate": 2.4666612085261344e-06,
"loss": 0.0482,
"step": 1986
},
{
"epoch": 0.7952,
"grad_norm": 0.03526290878653526,
"learning_rate": 2.4483258812684096e-06,
"loss": 0.0153,
"step": 1988
},
{
"epoch": 0.796,
"grad_norm": 1.1341354846954346,
"learning_rate": 2.4300494434824373e-06,
"loss": 0.6923,
"step": 1990
},
{
"epoch": 0.7968,
"grad_norm": 0.07913788408041,
"learning_rate": 2.411832037691545e-06,
"loss": 0.0221,
"step": 1992
},
{
"epoch": 0.7976,
"grad_norm": 0.10583814233541489,
"learning_rate": 2.3936738059587284e-06,
"loss": 0.0335,
"step": 1994
},
{
"epoch": 0.7984,
"grad_norm": 0.019925588741898537,
"learning_rate": 2.37557488988552e-06,
"loss": 0.0187,
"step": 1996
},
{
"epoch": 0.7992,
"grad_norm": 0.020932232961058617,
"learning_rate": 2.35753543061091e-06,
"loss": 0.0373,
"step": 1998
},
{
"epoch": 0.8,
"grad_norm": 0.03722110390663147,
"learning_rate": 2.339555568810221e-06,
"loss": 0.0309,
"step": 2000
},
{
"epoch": 0.8008,
"grad_norm": 0.6024539470672607,
"learning_rate": 2.321635444694028e-06,
"loss": 0.1057,
"step": 2002
},
{
"epoch": 0.8016,
"grad_norm": 0.28481170535087585,
"learning_rate": 2.3037751980070557e-06,
"loss": 0.0379,
"step": 2004
},
{
"epoch": 0.8024,
"grad_norm": 0.13168196380138397,
"learning_rate": 2.2859749680270983e-06,
"loss": 0.0436,
"step": 2006
},
{
"epoch": 0.8032,
"grad_norm": 0.02613903023302555,
"learning_rate": 2.2682348935639274e-06,
"loss": 0.0622,
"step": 2008
},
{
"epoch": 0.804,
"grad_norm": 0.3884766399860382,
"learning_rate": 2.2505551129582047e-06,
"loss": 0.0609,
"step": 2010
},
{
"epoch": 0.8048,
"grad_norm": 0.014485559426248074,
"learning_rate": 2.2329357640804118e-06,
"loss": 0.024,
"step": 2012
},
{
"epoch": 0.8056,
"grad_norm": 0.09305766969919205,
"learning_rate": 2.215376984329767e-06,
"loss": 0.0157,
"step": 2014
},
{
"epoch": 0.8064,
"grad_norm": 0.23493258655071259,
"learning_rate": 2.1978789106331666e-06,
"loss": 0.1795,
"step": 2016
},
{
"epoch": 0.8072,
"grad_norm": 0.2084125131368637,
"learning_rate": 2.1804416794441e-06,
"loss": 0.0342,
"step": 2018
},
{
"epoch": 0.808,
"grad_norm": 0.08282342553138733,
"learning_rate": 2.163065426741603e-06,
"loss": 0.0217,
"step": 2020
},
{
"epoch": 0.8088,
"grad_norm": 0.10017091780900955,
"learning_rate": 2.1457502880291815e-06,
"loss": 0.3079,
"step": 2022
},
{
"epoch": 0.8096,
"grad_norm": 0.11336953938007355,
"learning_rate": 2.128496398333768e-06,
"loss": 0.0351,
"step": 2024
},
{
"epoch": 0.8104,
"grad_norm": 2.011279344558716,
"learning_rate": 2.1113038922046603e-06,
"loss": 0.6162,
"step": 2026
},
{
"epoch": 0.8112,
"grad_norm": 0.02046182006597519,
"learning_rate": 2.09417290371247e-06,
"loss": 0.0658,
"step": 2028
},
{
"epoch": 0.812,
"grad_norm": 0.0351959727704525,
"learning_rate": 2.0771035664480944e-06,
"loss": 0.7813,
"step": 2030
},
{
"epoch": 0.8128,
"grad_norm": 0.031053269281983376,
"learning_rate": 2.0600960135216463e-06,
"loss": 0.0579,
"step": 2032
},
{
"epoch": 0.8136,
"grad_norm": 0.027455810457468033,
"learning_rate": 2.0431503775614457e-06,
"loss": 0.0735,
"step": 2034
},
{
"epoch": 0.8144,
"grad_norm": 0.030600672587752342,
"learning_rate": 2.026266790712965e-06,
"loss": 0.0096,
"step": 2036
},
{
"epoch": 0.8152,
"grad_norm": 0.11856792867183685,
"learning_rate": 2.009445384637805e-06,
"loss": 0.0198,
"step": 2038
},
{
"epoch": 0.816,
"grad_norm": 0.0909661203622818,
"learning_rate": 1.9926862905126663e-06,
"loss": 0.0384,
"step": 2040
},
{
"epoch": 0.8168,
"grad_norm": 0.008265670388936996,
"learning_rate": 1.9759896390283362e-06,
"loss": 0.092,
"step": 2042
},
{
"epoch": 0.8176,
"grad_norm": 0.035349734127521515,
"learning_rate": 1.959355560388654e-06,
"loss": 0.0277,
"step": 2044
},
{
"epoch": 0.8184,
"grad_norm": 0.058402419090270996,
"learning_rate": 1.9427841843095063e-06,
"loss": 0.0175,
"step": 2046
},
{
"epoch": 0.8192,
"grad_norm": 0.11960252374410629,
"learning_rate": 1.9262756400178163e-06,
"loss": 0.4295,
"step": 2048
},
{
"epoch": 0.82,
"grad_norm": 0.08432687819004059,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.0149,
"step": 2050
},
{
"epoch": 0.8208,
"grad_norm": 1.8159247636795044,
"learning_rate": 1.8934475612536019e-06,
"loss": 0.2313,
"step": 2052
},
{
"epoch": 0.8216,
"grad_norm": 0.7769438028335571,
"learning_rate": 1.8771282827810278e-06,
"loss": 0.066,
"step": 2054
},
{
"epoch": 0.8224,
"grad_norm": 0.3301757276058197,
"learning_rate": 1.8608723480938207e-06,
"loss": 0.0571,
"step": 2056
},
{
"epoch": 0.8232,
"grad_norm": 0.009649352170526981,
"learning_rate": 1.8446798839590186e-06,
"loss": 0.0781,
"step": 2058
},
{
"epoch": 0.824,
"grad_norm": 0.014373435638844967,
"learning_rate": 1.8285510166487154e-06,
"loss": 0.0129,
"step": 2060
},
{
"epoch": 0.8248,
"grad_norm": 0.08050563931465149,
"learning_rate": 1.812485871939056e-06,
"loss": 0.3065,
"step": 2062
},
{
"epoch": 0.8256,
"grad_norm": 0.0180222075432539,
"learning_rate": 1.7964845751092663e-06,
"loss": 0.0089,
"step": 2064
},
{
"epoch": 0.8264,
"grad_norm": 0.15466023981571198,
"learning_rate": 1.7805472509406695e-06,
"loss": 0.0321,
"step": 2066
},
{
"epoch": 0.8272,
"grad_norm": 0.07331327348947525,
"learning_rate": 1.7646740237157256e-06,
"loss": 0.021,
"step": 2068
},
{
"epoch": 0.828,
"grad_norm": 0.1734917163848877,
"learning_rate": 1.7488650172170496e-06,
"loss": 0.0255,
"step": 2070
},
{
"epoch": 0.8288,
"grad_norm": 0.023930072784423828,
"learning_rate": 1.7331203547264452e-06,
"loss": 0.0887,
"step": 2072
},
{
"epoch": 0.8296,
"grad_norm": 0.027445461601018906,
"learning_rate": 1.7174401590239587e-06,
"loss": 0.1292,
"step": 2074
},
{
"epoch": 0.8304,
"grad_norm": 0.07482447475194931,
"learning_rate": 1.7018245523869038e-06,
"loss": 0.0384,
"step": 2076
},
{
"epoch": 0.8312,
"grad_norm": 0.03953075036406517,
"learning_rate": 1.686273656588917e-06,
"loss": 0.0082,
"step": 2078
},
{
"epoch": 0.832,
"grad_norm": 0.05215131863951683,
"learning_rate": 1.6707875928990059e-06,
"loss": 0.0412,
"step": 2080
},
{
"epoch": 0.8328,
"grad_norm": 0.061881113797426224,
"learning_rate": 1.6553664820806102e-06,
"loss": 0.04,
"step": 2082
},
{
"epoch": 0.8336,
"grad_norm": 0.1969269961118698,
"learning_rate": 1.6400104443906463e-06,
"loss": 0.0356,
"step": 2084
},
{
"epoch": 0.8344,
"grad_norm": 0.0532572865486145,
"learning_rate": 1.6247195995785836e-06,
"loss": 0.0115,
"step": 2086
},
{
"epoch": 0.8352,
"grad_norm": 0.07264053821563721,
"learning_rate": 1.6094940668855008e-06,
"loss": 0.0126,
"step": 2088
},
{
"epoch": 0.836,
"grad_norm": 1.00359308719635,
"learning_rate": 1.5943339650431578e-06,
"loss": 0.1388,
"step": 2090
},
{
"epoch": 0.8368,
"grad_norm": 0.0406976044178009,
"learning_rate": 1.579239412273078e-06,
"loss": 0.0194,
"step": 2092
},
{
"epoch": 0.8376,
"grad_norm": 0.02532949112355709,
"learning_rate": 1.5642105262856122e-06,
"loss": 0.0081,
"step": 2094
},
{
"epoch": 0.8384,
"grad_norm": 0.10549493879079819,
"learning_rate": 1.5492474242790368e-06,
"loss": 0.075,
"step": 2096
},
{
"epoch": 0.8392,
"grad_norm": 0.029608091339468956,
"learning_rate": 1.5343502229386209e-06,
"loss": 0.022,
"step": 2098
},
{
"epoch": 0.84,
"grad_norm": 2.5157110691070557,
"learning_rate": 1.5195190384357405e-06,
"loss": 0.1403,
"step": 2100
},
{
"epoch": 0.8408,
"grad_norm": 0.13249818980693817,
"learning_rate": 1.5047539864269477e-06,
"loss": 0.1111,
"step": 2102
},
{
"epoch": 0.8416,
"grad_norm": 0.09822747856378555,
"learning_rate": 1.490055182053083e-06,
"loss": 0.0193,
"step": 2104
},
{
"epoch": 0.8424,
"grad_norm": 0.7339903712272644,
"learning_rate": 1.4754227399383758e-06,
"loss": 0.0544,
"step": 2106
},
{
"epoch": 0.8432,
"grad_norm": 0.03530171886086464,
"learning_rate": 1.4608567741895496e-06,
"loss": 0.0135,
"step": 2108
},
{
"epoch": 0.844,
"grad_norm": 0.5551167726516724,
"learning_rate": 1.446357398394934e-06,
"loss": 0.1178,
"step": 2110
},
{
"epoch": 0.8448,
"grad_norm": 0.15902993083000183,
"learning_rate": 1.4319247256235713e-06,
"loss": 0.1147,
"step": 2112
},
{
"epoch": 0.8456,
"grad_norm": 0.7204644083976746,
"learning_rate": 1.4175588684243447e-06,
"loss": 0.1095,
"step": 2114
},
{
"epoch": 0.8464,
"grad_norm": 0.05895598977804184,
"learning_rate": 1.40325993882509e-06,
"loss": 0.0297,
"step": 2116
},
{
"epoch": 0.8472,
"grad_norm": 0.9869866371154785,
"learning_rate": 1.3890280483317375e-06,
"loss": 0.1489,
"step": 2118
},
{
"epoch": 0.848,
"grad_norm": 0.005029854364693165,
"learning_rate": 1.3748633079274254e-06,
"loss": 0.1327,
"step": 2120
},
{
"epoch": 0.8488,
"grad_norm": 0.14541175961494446,
"learning_rate": 1.3607658280716474e-06,
"loss": 0.1304,
"step": 2122
},
{
"epoch": 0.8496,
"grad_norm": 0.051523029804229736,
"learning_rate": 1.3467357186993802e-06,
"loss": 0.0182,
"step": 2124
},
{
"epoch": 0.8504,
"grad_norm": 0.13095594942569733,
"learning_rate": 1.3327730892202384e-06,
"loss": 0.4018,
"step": 2126
},
{
"epoch": 0.8512,
"grad_norm": 0.07890293002128601,
"learning_rate": 1.3188780485176089e-06,
"loss": 0.0119,
"step": 2128
},
{
"epoch": 0.852,
"grad_norm": 0.0385555699467659,
"learning_rate": 1.30505070494781e-06,
"loss": 0.0285,
"step": 2130
},
{
"epoch": 0.8528,
"grad_norm": 0.022745564579963684,
"learning_rate": 1.2912911663392468e-06,
"loss": 0.0219,
"step": 2132
},
{
"epoch": 0.8536,
"grad_norm": 0.04321039468050003,
"learning_rate": 1.277599539991563e-06,
"loss": 0.061,
"step": 2134
},
{
"epoch": 0.8544,
"grad_norm": 0.014432685449719429,
"learning_rate": 1.2639759326748136e-06,
"loss": 0.0409,
"step": 2136
},
{
"epoch": 0.8552,
"grad_norm": 0.019150329753756523,
"learning_rate": 1.2504204506286244e-06,
"loss": 0.0051,
"step": 2138
},
{
"epoch": 0.856,
"grad_norm": 1.3626798391342163,
"learning_rate": 1.2369331995613664e-06,
"loss": 0.1418,
"step": 2140
},
{
"epoch": 0.8568,
"grad_norm": 0.02268528752028942,
"learning_rate": 1.223514284649331e-06,
"loss": 0.2405,
"step": 2142
},
{
"epoch": 0.8576,
"grad_norm": 0.5977320671081543,
"learning_rate": 1.210163810535917e-06,
"loss": 0.0792,
"step": 2144
},
{
"epoch": 0.8584,
"grad_norm": 0.01674368605017662,
"learning_rate": 1.196881881330798e-06,
"loss": 0.0047,
"step": 2146
},
{
"epoch": 0.8592,
"grad_norm": 0.22216765582561493,
"learning_rate": 1.1836686006091313e-06,
"loss": 0.4924,
"step": 2148
},
{
"epoch": 0.86,
"grad_norm": 0.04747781902551651,
"learning_rate": 1.1705240714107301e-06,
"loss": 0.0192,
"step": 2150
},
{
"epoch": 0.8608,
"grad_norm": 0.12776315212249756,
"learning_rate": 1.1574483962392768e-06,
"loss": 0.031,
"step": 2152
},
{
"epoch": 0.8616,
"grad_norm": 0.022700203582644463,
"learning_rate": 1.1444416770615118e-06,
"loss": 0.0062,
"step": 2154
},
{
"epoch": 0.8624,
"grad_norm": 0.07675009965896606,
"learning_rate": 1.1315040153064416e-06,
"loss": 0.0334,
"step": 2156
},
{
"epoch": 0.8632,
"grad_norm": 0.0223082285374403,
"learning_rate": 1.1186355118645552e-06,
"loss": 0.0168,
"step": 2158
},
{
"epoch": 0.864,
"grad_norm": 0.034395378082990646,
"learning_rate": 1.1058362670870248e-06,
"loss": 0.0086,
"step": 2160
},
{
"epoch": 0.8648,
"grad_norm": 0.015471206046640873,
"learning_rate": 1.093106380784934e-06,
"loss": 0.1392,
"step": 2162
},
{
"epoch": 0.8656,
"grad_norm": 0.013013385236263275,
"learning_rate": 1.0804459522284927e-06,
"loss": 0.0584,
"step": 2164
},
{
"epoch": 0.8664,
"grad_norm": 0.03496384248137474,
"learning_rate": 1.0678550801462662e-06,
"loss": 0.0052,
"step": 2166
},
{
"epoch": 0.8672,
"grad_norm": 0.25604620575904846,
"learning_rate": 1.0553338627244026e-06,
"loss": 0.0329,
"step": 2168
},
{
"epoch": 0.868,
"grad_norm": 1.7641575336456299,
"learning_rate": 1.042882397605871e-06,
"loss": 0.1352,
"step": 2170
},
{
"epoch": 0.8688,
"grad_norm": 0.083366259932518,
"learning_rate": 1.0305007818897006e-06,
"loss": 0.0171,
"step": 2172
},
{
"epoch": 0.8696,
"grad_norm": 1.2676173448562622,
"learning_rate": 1.0181891121302145e-06,
"loss": 1.594,
"step": 2174
},
{
"epoch": 0.8704,
"grad_norm": 0.604960560798645,
"learning_rate": 1.0059474843362893e-06,
"loss": 0.2283,
"step": 2176
},
{
"epoch": 0.8712,
"grad_norm": 0.16328264772891998,
"learning_rate": 9.93775993970597e-07,
"loss": 0.0237,
"step": 2178
},
{
"epoch": 0.872,
"grad_norm": 0.01555766724050045,
"learning_rate": 9.816747359488632e-07,
"loss": 0.1311,
"step": 2180
},
{
"epoch": 0.8728,
"grad_norm": 0.6382170915603638,
"learning_rate": 9.696438046391288e-07,
"loss": 0.0781,
"step": 2182
},
{
"epoch": 0.8736,
"grad_norm": 0.1925554871559143,
"learning_rate": 9.576832938610137e-07,
"loss": 0.057,
"step": 2184
},
{
"epoch": 0.8744,
"grad_norm": 0.08132878690958023,
"learning_rate": 9.457932968849826e-07,
"loss": 0.0189,
"step": 2186
},
{
"epoch": 0.8752,
"grad_norm": 0.036015480756759644,
"learning_rate": 9.339739064316233e-07,
"loss": 0.1586,
"step": 2188
},
{
"epoch": 0.876,
"grad_norm": 1.3472023010253906,
"learning_rate": 9.222252146709143e-07,
"loss": 0.4576,
"step": 2190
},
{
"epoch": 0.8768,
"grad_norm": 2.6704301834106445,
"learning_rate": 9.105473132215126e-07,
"loss": 0.1486,
"step": 2192
},
{
"epoch": 0.8776,
"grad_norm": 0.029902072623372078,
"learning_rate": 8.989402931500434e-07,
"loss": 0.032,
"step": 2194
},
{
"epoch": 0.8784,
"grad_norm": 0.036093611270189285,
"learning_rate": 8.874042449703779e-07,
"loss": 0.0153,
"step": 2196
},
{
"epoch": 0.8792,
"grad_norm": 0.6965168118476868,
"learning_rate": 8.759392586429394e-07,
"loss": 0.0854,
"step": 2198
},
{
"epoch": 0.88,
"grad_norm": 0.3597988188266754,
"learning_rate": 8.645454235739903e-07,
"loss": 0.0457,
"step": 2200
},
{
"epoch": 0.8808,
"grad_norm": 0.15428324043750763,
"learning_rate": 8.532228286149502e-07,
"loss": 0.0202,
"step": 2202
},
{
"epoch": 0.8816,
"grad_norm": 0.1308208853006363,
"learning_rate": 8.419715620616875e-07,
"loss": 0.0235,
"step": 2204
},
{
"epoch": 0.8824,
"grad_norm": 0.8089284300804138,
"learning_rate": 8.307917116538378e-07,
"loss": 0.0771,
"step": 2206
},
{
"epoch": 0.8832,
"grad_norm": 1.2626261711120605,
"learning_rate": 8.196833645741187e-07,
"loss": 0.6288,
"step": 2208
},
{
"epoch": 0.884,
"grad_norm": 0.0954088419675827,
"learning_rate": 8.086466074476562e-07,
"loss": 0.0133,
"step": 2210
},
{
"epoch": 0.8848,
"grad_norm": 0.13527199625968933,
"learning_rate": 7.976815263412963e-07,
"loss": 0.0211,
"step": 2212
},
{
"epoch": 0.8856,
"grad_norm": 0.5545380115509033,
"learning_rate": 7.867882067629473e-07,
"loss": 0.0581,
"step": 2214
},
{
"epoch": 0.8864,
"grad_norm": 0.06293027848005295,
"learning_rate": 7.759667336609011e-07,
"loss": 0.0264,
"step": 2216
},
{
"epoch": 0.8872,
"grad_norm": 0.3431554138660431,
"learning_rate": 7.652171914231777e-07,
"loss": 0.0329,
"step": 2218
},
{
"epoch": 0.888,
"grad_norm": 0.00832755770534277,
"learning_rate": 7.545396638768698e-07,
"loss": 0.0099,
"step": 2220
},
{
"epoch": 0.8888,
"grad_norm": 2.4031519889831543,
"learning_rate": 7.439342342874789e-07,
"loss": 0.1471,
"step": 2222
},
{
"epoch": 0.8896,
"grad_norm": 0.06904103606939316,
"learning_rate": 7.334009853582791e-07,
"loss": 0.0304,
"step": 2224
},
{
"epoch": 0.8904,
"grad_norm": 0.24774344265460968,
"learning_rate": 7.22939999229657e-07,
"loss": 0.0465,
"step": 2226
},
{
"epoch": 0.8912,
"grad_norm": 1.129273533821106,
"learning_rate": 7.125513574784904e-07,
"loss": 0.1176,
"step": 2228
},
{
"epoch": 0.892,
"grad_norm": 0.13521403074264526,
"learning_rate": 7.022351411174866e-07,
"loss": 0.0401,
"step": 2230
},
{
"epoch": 0.8928,
"grad_norm": 0.025177139788866043,
"learning_rate": 6.919914305945774e-07,
"loss": 0.0051,
"step": 2232
},
{
"epoch": 0.8936,
"grad_norm": 2.7186474800109863,
"learning_rate": 6.818203057922756e-07,
"loss": 0.6471,
"step": 2234
},
{
"epoch": 0.8944,
"grad_norm": 0.5871244072914124,
"learning_rate": 6.717218460270536e-07,
"loss": 0.0628,
"step": 2236
},
{
"epoch": 0.8952,
"grad_norm": 0.3472304344177246,
"learning_rate": 6.616961300487323e-07,
"loss": 0.0305,
"step": 2238
},
{
"epoch": 0.896,
"grad_norm": 0.04057146608829498,
"learning_rate": 6.517432360398556e-07,
"loss": 0.7828,
"step": 2240
},
{
"epoch": 0.8968,
"grad_norm": 0.03729462996125221,
"learning_rate": 6.418632416150927e-07,
"loss": 0.006,
"step": 2242
},
{
"epoch": 0.8976,
"grad_norm": 0.022776370868086815,
"learning_rate": 6.320562238206218e-07,
"loss": 0.0087,
"step": 2244
},
{
"epoch": 0.8984,
"grad_norm": 0.06814192980527878,
"learning_rate": 6.223222591335409e-07,
"loss": 0.0067,
"step": 2246
},
{
"epoch": 0.8992,
"grad_norm": 0.012981448322534561,
"learning_rate": 6.126614234612593e-07,
"loss": 0.0273,
"step": 2248
},
{
"epoch": 0.9,
"grad_norm": 0.7651333212852478,
"learning_rate": 6.030737921409169e-07,
"loss": 0.1298,
"step": 2250
},
{
"epoch": 0.9008,
"grad_norm": 0.5903016924858093,
"learning_rate": 5.935594399387856e-07,
"loss": 0.0611,
"step": 2252
},
{
"epoch": 0.9016,
"grad_norm": 0.06699836999177933,
"learning_rate": 5.841184410496992e-07,
"loss": 0.0083,
"step": 2254
},
{
"epoch": 0.9024,
"grad_norm": 0.124520443379879,
"learning_rate": 5.747508690964599e-07,
"loss": 0.3001,
"step": 2256
},
{
"epoch": 0.9032,
"grad_norm": 0.16008125245571136,
"learning_rate": 5.654567971292757e-07,
"loss": 0.0201,
"step": 2258
},
{
"epoch": 0.904,
"grad_norm": 0.07481832057237625,
"learning_rate": 5.562362976251901e-07,
"loss": 0.0851,
"step": 2260
},
{
"epoch": 0.9048,
"grad_norm": 0.6332103610038757,
"learning_rate": 5.470894424875062e-07,
"loss": 0.1097,
"step": 2262
},
{
"epoch": 0.9056,
"grad_norm": 0.014627913013100624,
"learning_rate": 5.380163030452412e-07,
"loss": 0.0236,
"step": 2264
},
{
"epoch": 0.9064,
"grad_norm": 0.6561143398284912,
"learning_rate": 5.290169500525577e-07,
"loss": 0.0778,
"step": 2266
},
{
"epoch": 0.9072,
"grad_norm": 0.017556993290781975,
"learning_rate": 5.200914536882184e-07,
"loss": 0.0088,
"step": 2268
},
{
"epoch": 0.908,
"grad_norm": 0.11898583173751831,
"learning_rate": 5.112398835550348e-07,
"loss": 0.0246,
"step": 2270
},
{
"epoch": 0.9088,
"grad_norm": 1.1768893003463745,
"learning_rate": 5.024623086793323e-07,
"loss": 0.0801,
"step": 2272
},
{
"epoch": 0.9096,
"grad_norm": 0.29487040638923645,
"learning_rate": 4.937587975103997e-07,
"loss": 0.0533,
"step": 2274
},
{
"epoch": 0.9104,
"grad_norm": 1.571694016456604,
"learning_rate": 4.851294179199673e-07,
"loss": 0.65,
"step": 2276
},
{
"epoch": 0.9112,
"grad_norm": 0.01853368431329727,
"learning_rate": 4.765742372016735e-07,
"loss": 0.0912,
"step": 2278
},
{
"epoch": 0.912,
"grad_norm": 1.2929413318634033,
"learning_rate": 4.6809332207053083e-07,
"loss": 0.0748,
"step": 2280
},
{
"epoch": 0.9128,
"grad_norm": 0.019635476171970367,
"learning_rate": 4.596867386624215e-07,
"loss": 0.0165,
"step": 2282
},
{
"epoch": 0.9136,
"grad_norm": 0.04413539171218872,
"learning_rate": 4.5135455253357053e-07,
"loss": 0.0407,
"step": 2284
},
{
"epoch": 0.9144,
"grad_norm": 0.022891348227858543,
"learning_rate": 4.4309682866004124e-07,
"loss": 0.0352,
"step": 2286
},
{
"epoch": 0.9152,
"grad_norm": 0.0767710953950882,
"learning_rate": 4.349136314372204e-07,
"loss": 0.0121,
"step": 2288
},
{
"epoch": 0.916,
"grad_norm": 0.13404303789138794,
"learning_rate": 4.268050246793276e-07,
"loss": 0.0556,
"step": 2290
},
{
"epoch": 0.9168,
"grad_norm": 0.4251805543899536,
"learning_rate": 4.1877107161890416e-07,
"loss": 0.1859,
"step": 2292
},
{
"epoch": 0.9176,
"grad_norm": 1.6532477140426636,
"learning_rate": 4.108118349063306e-07,
"loss": 0.6478,
"step": 2294
},
{
"epoch": 0.9184,
"grad_norm": 0.1750348061323166,
"learning_rate": 4.0292737660933335e-07,
"loss": 0.0384,
"step": 2296
},
{
"epoch": 0.9192,
"grad_norm": 1.260365605354309,
"learning_rate": 3.9511775821250206e-07,
"loss": 0.8857,
"step": 2298
},
{
"epoch": 0.92,
"grad_norm": 0.05385835841298103,
"learning_rate": 3.8738304061681107e-07,
"loss": 0.1621,
"step": 2300
},
{
"epoch": 0.9208,
"grad_norm": 0.4076821208000183,
"learning_rate": 3.7972328413914074e-07,
"loss": 0.1319,
"step": 2302
},
{
"epoch": 0.9216,
"grad_norm": 0.42220211029052734,
"learning_rate": 3.721385485118123e-07,
"loss": 0.2582,
"step": 2304
},
{
"epoch": 0.9224,
"grad_norm": 0.2544540762901306,
"learning_rate": 3.646288928821151e-07,
"loss": 0.0197,
"step": 2306
},
{
"epoch": 0.9232,
"grad_norm": 1.2406009435653687,
"learning_rate": 3.571943758118546e-07,
"loss": 0.8589,
"step": 2308
},
{
"epoch": 0.924,
"grad_norm": 0.11001749336719513,
"learning_rate": 3.498350552768859e-07,
"loss": 0.0317,
"step": 2310
},
{
"epoch": 0.9248,
"grad_norm": 0.5791686177253723,
"learning_rate": 3.4255098866667114e-07,
"loss": 0.0695,
"step": 2312
},
{
"epoch": 0.9256,
"grad_norm": 0.005229198839515448,
"learning_rate": 3.3534223278382405e-07,
"loss": 0.0093,
"step": 2314
},
{
"epoch": 0.9264,
"grad_norm": 0.09191671758890152,
"learning_rate": 3.282088438436715e-07,
"loss": 0.0201,
"step": 2316
},
{
"epoch": 0.9272,
"grad_norm": 0.32510700821876526,
"learning_rate": 3.211508774738137e-07,
"loss": 0.039,
"step": 2318
},
{
"epoch": 0.928,
"grad_norm": 0.027174528688192368,
"learning_rate": 3.1416838871368925e-07,
"loss": 0.0191,
"step": 2320
},
{
"epoch": 0.9288,
"grad_norm": 0.0871250182390213,
"learning_rate": 3.072614320141487e-07,
"loss": 0.0112,
"step": 2322
},
{
"epoch": 0.9296,
"grad_norm": 0.11660738289356232,
"learning_rate": 3.00430061237027e-07,
"loss": 0.0281,
"step": 2324
},
{
"epoch": 0.9304,
"grad_norm": 0.009191269055008888,
"learning_rate": 2.936743296547273e-07,
"loss": 0.0221,
"step": 2326
},
{
"epoch": 0.9312,
"grad_norm": 0.05838488042354584,
"learning_rate": 2.8699428994980017e-07,
"loss": 0.9408,
"step": 2328
},
{
"epoch": 0.932,
"grad_norm": 1.4220937490463257,
"learning_rate": 2.8038999421453827e-07,
"loss": 0.0656,
"step": 2330
},
{
"epoch": 0.9328,
"grad_norm": 0.08184941858053207,
"learning_rate": 2.7386149395056463e-07,
"loss": 0.4858,
"step": 2332
},
{
"epoch": 0.9336,
"grad_norm": 0.7450655102729797,
"learning_rate": 2.6740884006843826e-07,
"loss": 0.0379,
"step": 2334
},
{
"epoch": 0.9344,
"grad_norm": 1.6832029819488525,
"learning_rate": 2.6103208288724815e-07,
"loss": 0.6098,
"step": 2336
},
{
"epoch": 0.9352,
"grad_norm": 0.09961254149675369,
"learning_rate": 2.547312721342277e-07,
"loss": 0.0168,
"step": 2338
},
{
"epoch": 0.936,
"grad_norm": 0.10063952207565308,
"learning_rate": 2.4850645694436736e-07,
"loss": 0.0119,
"step": 2340
},
{
"epoch": 0.9368,
"grad_norm": 0.06440629810094833,
"learning_rate": 2.423576858600252e-07,
"loss": 0.0119,
"step": 2342
},
{
"epoch": 0.9376,
"grad_norm": 0.01774456538259983,
"learning_rate": 2.3628500683055222e-07,
"loss": 0.0081,
"step": 2344
},
{
"epoch": 0.9384,
"grad_norm": 1.3639814853668213,
"learning_rate": 2.3028846721191878e-07,
"loss": 0.1012,
"step": 2346
},
{
"epoch": 0.9392,
"grad_norm": 0.010448544286191463,
"learning_rate": 2.2436811376634893e-07,
"loss": 0.0061,
"step": 2348
},
{
"epoch": 0.94,
"grad_norm": 0.05712028965353966,
"learning_rate": 2.1852399266194312e-07,
"loss": 0.0325,
"step": 2350
},
{
"epoch": 0.9408,
"grad_norm": 0.027819665148854256,
"learning_rate": 2.1275614947233624e-07,
"loss": 0.0963,
"step": 2352
},
{
"epoch": 0.9416,
"grad_norm": 1.7772072553634644,
"learning_rate": 2.0706462917632676e-07,
"loss": 0.8716,
"step": 2354
},
{
"epoch": 0.9424,
"grad_norm": 0.47964930534362793,
"learning_rate": 2.014494761575314e-07,
"loss": 0.0422,
"step": 2356
},
{
"epoch": 0.9432,
"grad_norm": 0.0034853648394346237,
"learning_rate": 1.9591073420404338e-07,
"loss": 0.0195,
"step": 2358
},
{
"epoch": 0.944,
"grad_norm": 0.03955305367708206,
"learning_rate": 1.9044844650808468e-07,
"loss": 0.0275,
"step": 2360
},
{
"epoch": 0.9448,
"grad_norm": 0.5900484919548035,
"learning_rate": 1.8506265566567095e-07,
"loss": 0.032,
"step": 2362
},
{
"epoch": 0.9456,
"grad_norm": 0.006537347100675106,
"learning_rate": 1.7975340367628269e-07,
"loss": 0.0082,
"step": 2364
},
{
"epoch": 0.9464,
"grad_norm": 0.03216685727238655,
"learning_rate": 1.7452073194253237e-07,
"loss": 0.0099,
"step": 2366
},
{
"epoch": 0.9472,
"grad_norm": 0.06808136403560638,
"learning_rate": 1.6936468126984573e-07,
"loss": 0.0111,
"step": 2368
},
{
"epoch": 0.948,
"grad_norm": 0.19587433338165283,
"learning_rate": 1.6428529186614195e-07,
"loss": 0.0222,
"step": 2370
},
{
"epoch": 0.9488,
"grad_norm": 0.02673921547830105,
"learning_rate": 1.5928260334151847e-07,
"loss": 0.0362,
"step": 2372
},
{
"epoch": 0.9496,
"grad_norm": 0.8786045908927917,
"learning_rate": 1.543566547079467e-07,
"loss": 0.0973,
"step": 2374
},
{
"epoch": 0.9504,
"grad_norm": 0.16141577064990997,
"learning_rate": 1.4950748437896235e-07,
"loss": 0.0884,
"step": 2376
},
{
"epoch": 0.9512,
"grad_norm": 0.010312036611139774,
"learning_rate": 1.4473513016937223e-07,
"loss": 0.0103,
"step": 2378
},
{
"epoch": 0.952,
"grad_norm": 3.670624017715454,
"learning_rate": 1.400396292949513e-07,
"loss": 0.6124,
"step": 2380
},
{
"epoch": 0.9528,
"grad_norm": 3.3416731357574463,
"learning_rate": 1.3542101837215826e-07,
"loss": 1.0542,
"step": 2382
},
{
"epoch": 0.9536,
"grad_norm": 1.813645839691162,
"learning_rate": 1.308793334178493e-07,
"loss": 0.8749,
"step": 2384
},
{
"epoch": 0.9544,
"grad_norm": 0.02037736214697361,
"learning_rate": 1.26414609848996e-07,
"loss": 0.0147,
"step": 2386
},
{
"epoch": 0.9552,
"grad_norm": 0.29435980319976807,
"learning_rate": 1.2202688248241113e-07,
"loss": 0.0375,
"step": 2388
},
{
"epoch": 0.956,
"grad_norm": 0.02400418370962143,
"learning_rate": 1.1771618553447217e-07,
"loss": 0.0272,
"step": 2390
},
{
"epoch": 0.9568,
"grad_norm": 0.32467302680015564,
"learning_rate": 1.134825526208605e-07,
"loss": 0.039,
"step": 2392
},
{
"epoch": 0.9576,
"grad_norm": 0.04526238515973091,
"learning_rate": 1.0932601675629595e-07,
"loss": 0.0099,
"step": 2394
},
{
"epoch": 0.9584,
"grad_norm": 0.017281251028180122,
"learning_rate": 1.052466103542793e-07,
"loss": 0.0201,
"step": 2396
},
{
"epoch": 0.9592,
"grad_norm": 0.2014533132314682,
"learning_rate": 1.0124436522684244e-07,
"loss": 0.232,
"step": 2398
},
{
"epoch": 0.96,
"grad_norm": 0.08100484311580658,
"learning_rate": 9.731931258429638e-08,
"loss": 0.1188,
"step": 2400
},
{
"epoch": 0.9608,
"grad_norm": 0.03484244644641876,
"learning_rate": 9.347148303499143e-08,
"loss": 0.0144,
"step": 2402
},
{
"epoch": 0.9616,
"grad_norm": 0.1309373676776886,
"learning_rate": 8.970090658507291e-08,
"loss": 0.015,
"step": 2404
},
{
"epoch": 0.9624,
"grad_norm": 1.4030771255493164,
"learning_rate": 8.600761263825475e-08,
"loss": 0.5354,
"step": 2406
},
{
"epoch": 0.9632,
"grad_norm": 0.009812920354306698,
"learning_rate": 8.239162999558403e-08,
"loss": 0.0114,
"step": 2408
},
{
"epoch": 0.964,
"grad_norm": 1.2562764883041382,
"learning_rate": 7.885298685522235e-08,
"loss": 1.0972,
"step": 2410
},
{
"epoch": 0.9648,
"grad_norm": 0.012443533167243004,
"learning_rate": 7.539171081221597e-08,
"loss": 0.0077,
"step": 2412
},
{
"epoch": 0.9656,
"grad_norm": 0.011568997986614704,
"learning_rate": 7.200782885829482e-08,
"loss": 0.6151,
"step": 2414
},
{
"epoch": 0.9664,
"grad_norm": 0.09277435392141342,
"learning_rate": 6.870136738164612e-08,
"loss": 0.0186,
"step": 2416
},
{
"epoch": 0.9672,
"grad_norm": 0.1386393904685974,
"learning_rate": 6.547235216672443e-08,
"loss": 0.2091,
"step": 2418
},
{
"epoch": 0.968,
"grad_norm": 1.1379826068878174,
"learning_rate": 6.232080839403631e-08,
"loss": 0.2519,
"step": 2420
},
{
"epoch": 0.9688,
"grad_norm": 0.1122945249080658,
"learning_rate": 5.9246760639953824e-08,
"loss": 0.0173,
"step": 2422
},
{
"epoch": 0.9696,
"grad_norm": 0.12855499982833862,
"learning_rate": 5.625023287652021e-08,
"loss": 0.2261,
"step": 2424
},
{
"epoch": 0.9704,
"grad_norm": 0.007712547667324543,
"learning_rate": 5.3331248471258926e-08,
"loss": 0.0049,
"step": 2426
},
{
"epoch": 0.9712,
"grad_norm": 0.033568158745765686,
"learning_rate": 5.048983018699827e-08,
"loss": 0.054,
"step": 2428
},
{
"epoch": 0.972,
"grad_norm": 0.03975361958146095,
"learning_rate": 4.772600018168816e-08,
"loss": 0.0088,
"step": 2430
},
{
"epoch": 0.9728,
"grad_norm": 0.12293253093957901,
"learning_rate": 4.503978000823028e-08,
"loss": 0.0239,
"step": 2432
},
{
"epoch": 0.9736,
"grad_norm": 1.9904245138168335,
"learning_rate": 4.2431190614309334e-08,
"loss": 0.7805,
"step": 2434
},
{
"epoch": 0.9744,
"grad_norm": 0.08196962624788284,
"learning_rate": 3.990025234222872e-08,
"loss": 0.0215,
"step": 2436
},
{
"epoch": 0.9752,
"grad_norm": 0.015654845163226128,
"learning_rate": 3.7446984928753984e-08,
"loss": 0.0054,
"step": 2438
},
{
"epoch": 0.976,
"grad_norm": 0.4907855987548828,
"learning_rate": 3.50714075049563e-08,
"loss": 0.0643,
"step": 2440
},
{
"epoch": 0.9768,
"grad_norm": 0.11044461280107498,
"learning_rate": 3.2773538596068134e-08,
"loss": 0.017,
"step": 2442
},
{
"epoch": 0.9776,
"grad_norm": 0.12533003091812134,
"learning_rate": 3.0553396121330015e-08,
"loss": 0.8734,
"step": 2444
},
{
"epoch": 0.9784,
"grad_norm": 0.010446742177009583,
"learning_rate": 2.8410997393860663e-08,
"loss": 0.0066,
"step": 2446
},
{
"epoch": 0.9792,
"grad_norm": 2.9740476608276367,
"learning_rate": 2.6346359120514863e-08,
"loss": 0.56,
"step": 2448
},
{
"epoch": 0.98,
"grad_norm": 0.15533113479614258,
"learning_rate": 2.4359497401758026e-08,
"loss": 0.0152,
"step": 2450
},
{
"epoch": 0.9808,
"grad_norm": 0.42947232723236084,
"learning_rate": 2.2450427731534052e-08,
"loss": 0.1126,
"step": 2452
},
{
"epoch": 0.9816,
"grad_norm": 0.31456366181373596,
"learning_rate": 2.061916499715544e-08,
"loss": 0.0455,
"step": 2454
},
{
"epoch": 0.9824,
"grad_norm": 1.7186102867126465,
"learning_rate": 1.886572347917337e-08,
"loss": 0.1411,
"step": 2456
},
{
"epoch": 0.9832,
"grad_norm": 0.3335157632827759,
"learning_rate": 1.7190116851280024e-08,
"loss": 0.0393,
"step": 2458
},
{
"epoch": 0.984,
"grad_norm": 0.01097691897302866,
"learning_rate": 1.5592358180189782e-08,
"loss": 0.0299,
"step": 2460
},
{
"epoch": 0.9848,
"grad_norm": 0.1632501780986786,
"learning_rate": 1.4072459925548176e-08,
"loss": 0.0287,
"step": 2462
},
{
"epoch": 0.9856,
"grad_norm": 0.02019256353378296,
"learning_rate": 1.2630433939825326e-08,
"loss": 0.0658,
"step": 2464
},
{
"epoch": 0.9864,
"grad_norm": 2.6901440620422363,
"learning_rate": 1.126629146822933e-08,
"loss": 0.2091,
"step": 2466
},
{
"epoch": 0.9872,
"grad_norm": 0.05359053239226341,
"learning_rate": 9.980043148619668e-09,
"loss": 0.0161,
"step": 2468
},
{
"epoch": 0.988,
"grad_norm": 1.5825371742248535,
"learning_rate": 8.771699011416169e-09,
"loss": 1.7133,
"step": 2470
},
{
"epoch": 0.9888,
"grad_norm": 0.02139631099998951,
"learning_rate": 7.641268479531283e-09,
"loss": 0.0111,
"step": 2472
},
{
"epoch": 0.9896,
"grad_norm": 0.06577183306217194,
"learning_rate": 6.588760368287928e-09,
"loss": 0.0126,
"step": 2474
},
{
"epoch": 0.9904,
"grad_norm": 0.04376514256000519,
"learning_rate": 5.614182885357311e-09,
"loss": 0.0159,
"step": 2476
},
{
"epoch": 0.9912,
"grad_norm": 0.02068435214459896,
"learning_rate": 4.717543630688992e-09,
"loss": 0.0087,
"step": 2478
},
{
"epoch": 0.992,
"grad_norm": 2.170161724090576,
"learning_rate": 3.898849596456477e-09,
"loss": 0.2086,
"step": 2480
},
{
"epoch": 0.9928,
"grad_norm": 1.0947741270065308,
"learning_rate": 3.1581071670006013e-09,
"loss": 0.0886,
"step": 2482
},
{
"epoch": 0.9936,
"grad_norm": 0.08858716487884521,
"learning_rate": 2.495322118778454e-09,
"loss": 0.02,
"step": 2484
},
{
"epoch": 0.9944,
"grad_norm": 0.011349241249263287,
"learning_rate": 1.910499620322304e-09,
"loss": 0.0094,
"step": 2486
},
{
"epoch": 0.9952,
"grad_norm": 1.9726616144180298,
"learning_rate": 1.4036442321962995e-09,
"loss": 0.7641,
"step": 2488
},
{
"epoch": 0.996,
"grad_norm": 0.020813768729567528,
"learning_rate": 9.74759906957612e-10,
"loss": 0.0056,
"step": 2490
},
{
"epoch": 0.9968,
"grad_norm": 0.07518152892589569,
"learning_rate": 6.238499891353389e-10,
"loss": 0.0572,
"step": 2492
},
{
"epoch": 0.9976,
"grad_norm": 0.12918703258037567,
"learning_rate": 3.509172151938689e-10,
"loss": 0.0209,
"step": 2494
},
{
"epoch": 0.9984,
"grad_norm": 2.08217453956604,
"learning_rate": 1.559637135173375e-10,
"loss": 1.673,
"step": 2496
},
{
"epoch": 0.9992,
"grad_norm": 0.01897619105875492,
"learning_rate": 3.899100439408443e-11,
"loss": 0.0482,
"step": 2498
},
{
"epoch": 1.0,
"grad_norm": 0.07024825364351273,
"learning_rate": 0.0,
"loss": 0.1147,
"step": 2500
},
{
"epoch": 1.0,
"step": 2500,
"total_flos": 5.2791220503996006e+17,
"train_loss": 0.34746392381768676,
"train_runtime": 41909.1346,
"train_samples_per_second": 1.909,
"train_steps_per_second": 0.06
}
],
"logging_steps": 2,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {},
"total_flos": 5.2791220503996006e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}