{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 515,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009708737864077669,
"grad_norm": 6.002007543902212,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.0598,
"step": 1
},
{
"epoch": 0.019417475728155338,
"grad_norm": 5.991740133700607,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.0579,
"step": 2
},
{
"epoch": 0.02912621359223301,
"grad_norm": 5.825979646199088,
"learning_rate": 4.615384615384616e-06,
"loss": 1.0544,
"step": 3
},
{
"epoch": 0.038834951456310676,
"grad_norm": 4.364941447544986,
"learning_rate": 6.153846153846155e-06,
"loss": 1.0106,
"step": 4
},
{
"epoch": 0.04854368932038835,
"grad_norm": 2.7678687055700784,
"learning_rate": 7.692307692307694e-06,
"loss": 0.9739,
"step": 5
},
{
"epoch": 0.05825242718446602,
"grad_norm": 2.5422026912598827,
"learning_rate": 9.230769230769232e-06,
"loss": 0.948,
"step": 6
},
{
"epoch": 0.06796116504854369,
"grad_norm": 3.961951944224408,
"learning_rate": 1.076923076923077e-05,
"loss": 0.9572,
"step": 7
},
{
"epoch": 0.07766990291262135,
"grad_norm": 3.5883110693606577,
"learning_rate": 1.230769230769231e-05,
"loss": 0.9112,
"step": 8
},
{
"epoch": 0.08737864077669903,
"grad_norm": 3.860767785745949,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.9007,
"step": 9
},
{
"epoch": 0.0970873786407767,
"grad_norm": 2.7156118144668633,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.8734,
"step": 10
},
{
"epoch": 0.10679611650485436,
"grad_norm": 1.9289111077886563,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.8391,
"step": 11
},
{
"epoch": 0.11650485436893204,
"grad_norm": 1.8123247995331513,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.8171,
"step": 12
},
{
"epoch": 0.1262135922330097,
"grad_norm": 1.428428989967321,
"learning_rate": 2e-05,
"loss": 0.7956,
"step": 13
},
{
"epoch": 0.13592233009708737,
"grad_norm": 1.3680143343907647,
"learning_rate": 2.153846153846154e-05,
"loss": 0.7813,
"step": 14
},
{
"epoch": 0.14563106796116504,
"grad_norm": 1.3879441764342295,
"learning_rate": 2.3076923076923076e-05,
"loss": 0.7681,
"step": 15
},
{
"epoch": 0.1553398058252427,
"grad_norm": 1.0796522703025953,
"learning_rate": 2.461538461538462e-05,
"loss": 0.7574,
"step": 16
},
{
"epoch": 0.1650485436893204,
"grad_norm": 1.371942931930126,
"learning_rate": 2.6153846153846157e-05,
"loss": 0.748,
"step": 17
},
{
"epoch": 0.17475728155339806,
"grad_norm": 1.410433870930946,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.7432,
"step": 18
},
{
"epoch": 0.18446601941747573,
"grad_norm": 1.0096190041268163,
"learning_rate": 2.923076923076923e-05,
"loss": 0.7329,
"step": 19
},
{
"epoch": 0.1941747572815534,
"grad_norm": 1.7265065580167445,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.7295,
"step": 20
},
{
"epoch": 0.20388349514563106,
"grad_norm": 1.1032179992161404,
"learning_rate": 3.230769230769231e-05,
"loss": 0.7245,
"step": 21
},
{
"epoch": 0.21359223300970873,
"grad_norm": 1.0901184558146035,
"learning_rate": 3.384615384615385e-05,
"loss": 0.7183,
"step": 22
},
{
"epoch": 0.22330097087378642,
"grad_norm": 1.7732487418217808,
"learning_rate": 3.538461538461539e-05,
"loss": 0.7136,
"step": 23
},
{
"epoch": 0.23300970873786409,
"grad_norm": 1.430366935608713,
"learning_rate": 3.692307692307693e-05,
"loss": 0.7105,
"step": 24
},
{
"epoch": 0.24271844660194175,
"grad_norm": 1.682581513255261,
"learning_rate": 3.846153846153846e-05,
"loss": 0.6988,
"step": 25
},
{
"epoch": 0.2524271844660194,
"grad_norm": 1.706823703289166,
"learning_rate": 4e-05,
"loss": 0.6991,
"step": 26
},
{
"epoch": 0.2621359223300971,
"grad_norm": 2.014798562695819,
"learning_rate": 4.1538461538461544e-05,
"loss": 0.6955,
"step": 27
},
{
"epoch": 0.27184466019417475,
"grad_norm": 1.4584117807771175,
"learning_rate": 4.307692307692308e-05,
"loss": 0.6901,
"step": 28
},
{
"epoch": 0.2815533980582524,
"grad_norm": 2.5240991865834883,
"learning_rate": 4.461538461538462e-05,
"loss": 0.6901,
"step": 29
},
{
"epoch": 0.2912621359223301,
"grad_norm": 1.7349683843216974,
"learning_rate": 4.615384615384615e-05,
"loss": 0.6912,
"step": 30
},
{
"epoch": 0.30097087378640774,
"grad_norm": 2.453207741393019,
"learning_rate": 4.76923076923077e-05,
"loss": 0.6837,
"step": 31
},
{
"epoch": 0.3106796116504854,
"grad_norm": 2.1891597642473517,
"learning_rate": 4.923076923076924e-05,
"loss": 0.6834,
"step": 32
},
{
"epoch": 0.32038834951456313,
"grad_norm": 2.0789856106085867,
"learning_rate": 5.076923076923077e-05,
"loss": 0.6854,
"step": 33
},
{
"epoch": 0.3300970873786408,
"grad_norm": 1.7347914028228881,
"learning_rate": 5.230769230769231e-05,
"loss": 0.6734,
"step": 34
},
{
"epoch": 0.33980582524271846,
"grad_norm": 2.1284302056196744,
"learning_rate": 5.3846153846153853e-05,
"loss": 0.6741,
"step": 35
},
{
"epoch": 0.34951456310679613,
"grad_norm": 1.9299657090664841,
"learning_rate": 5.538461538461539e-05,
"loss": 0.6737,
"step": 36
},
{
"epoch": 0.3592233009708738,
"grad_norm": 2.1216828159935135,
"learning_rate": 5.692307692307693e-05,
"loss": 0.6701,
"step": 37
},
{
"epoch": 0.36893203883495146,
"grad_norm": 1.2362157926807915,
"learning_rate": 5.846153846153846e-05,
"loss": 0.6639,
"step": 38
},
{
"epoch": 0.3786407766990291,
"grad_norm": 2.2344300721881525,
"learning_rate": 6.000000000000001e-05,
"loss": 0.666,
"step": 39
},
{
"epoch": 0.3883495145631068,
"grad_norm": 1.544248418606736,
"learning_rate": 6.153846153846155e-05,
"loss": 0.6656,
"step": 40
},
{
"epoch": 0.39805825242718446,
"grad_norm": 2.774120239479565,
"learning_rate": 6.307692307692308e-05,
"loss": 0.6683,
"step": 41
},
{
"epoch": 0.4077669902912621,
"grad_norm": 2.1993216366589663,
"learning_rate": 6.461538461538462e-05,
"loss": 0.6649,
"step": 42
},
{
"epoch": 0.4174757281553398,
"grad_norm": 1.8997195417294006,
"learning_rate": 6.615384615384616e-05,
"loss": 0.6501,
"step": 43
},
{
"epoch": 0.42718446601941745,
"grad_norm": 1.9017422216012896,
"learning_rate": 6.76923076923077e-05,
"loss": 0.6556,
"step": 44
},
{
"epoch": 0.4368932038834951,
"grad_norm": 2.2115485405167603,
"learning_rate": 6.923076923076924e-05,
"loss": 0.6582,
"step": 45
},
{
"epoch": 0.44660194174757284,
"grad_norm": 2.2558473718778282,
"learning_rate": 7.076923076923078e-05,
"loss": 0.6599,
"step": 46
},
{
"epoch": 0.4563106796116505,
"grad_norm": 1.8236441414000624,
"learning_rate": 7.230769230769232e-05,
"loss": 0.6502,
"step": 47
},
{
"epoch": 0.46601941747572817,
"grad_norm": 1.668233835252892,
"learning_rate": 7.384615384615386e-05,
"loss": 0.6525,
"step": 48
},
{
"epoch": 0.47572815533980584,
"grad_norm": 3.1753217004320975,
"learning_rate": 7.538461538461539e-05,
"loss": 0.6627,
"step": 49
},
{
"epoch": 0.4854368932038835,
"grad_norm": 1.621218570570416,
"learning_rate": 7.692307692307693e-05,
"loss": 0.6484,
"step": 50
},
{
"epoch": 0.49514563106796117,
"grad_norm": 4.339926226583821,
"learning_rate": 7.846153846153847e-05,
"loss": 0.6752,
"step": 51
},
{
"epoch": 0.5048543689320388,
"grad_norm": 3.113087384499717,
"learning_rate": 8e-05,
"loss": 0.6722,
"step": 52
},
{
"epoch": 0.5145631067961165,
"grad_norm": 2.714482442820161,
"learning_rate": 7.999907919834168e-05,
"loss": 0.6628,
"step": 53
},
{
"epoch": 0.5242718446601942,
"grad_norm": 2.914244429607087,
"learning_rate": 7.999631683576055e-05,
"loss": 0.6581,
"step": 54
},
{
"epoch": 0.5339805825242718,
"grad_norm": 1.884493343669805,
"learning_rate": 7.999171303943594e-05,
"loss": 0.6497,
"step": 55
},
{
"epoch": 0.5436893203883495,
"grad_norm": 3.1324813256148873,
"learning_rate": 7.998526802132707e-05,
"loss": 0.6539,
"step": 56
},
{
"epoch": 0.5533980582524272,
"grad_norm": 1.9123959562537083,
"learning_rate": 7.997698207816309e-05,
"loss": 0.6491,
"step": 57
},
{
"epoch": 0.5631067961165048,
"grad_norm": 2.223063366245145,
"learning_rate": 7.99668555914295e-05,
"loss": 0.6515,
"step": 58
},
{
"epoch": 0.5728155339805825,
"grad_norm": 2.268558088911201,
"learning_rate": 7.995488902735063e-05,
"loss": 0.6573,
"step": 59
},
{
"epoch": 0.5825242718446602,
"grad_norm": 1.5722495721856204,
"learning_rate": 7.994108293686804e-05,
"loss": 0.6441,
"step": 60
},
{
"epoch": 0.5922330097087378,
"grad_norm": 1.666894724940502,
"learning_rate": 7.992543795561527e-05,
"loss": 0.6408,
"step": 61
},
{
"epoch": 0.6019417475728155,
"grad_norm": 1.0031568832099003,
"learning_rate": 7.990795480388861e-05,
"loss": 0.6342,
"step": 62
},
{
"epoch": 0.6116504854368932,
"grad_norm": 2.2162360491704067,
"learning_rate": 7.988863428661377e-05,
"loss": 0.6589,
"step": 63
},
{
"epoch": 0.6213592233009708,
"grad_norm": 1.7496212177283834,
"learning_rate": 7.9867477293309e-05,
"loss": 0.6484,
"step": 64
},
{
"epoch": 0.6310679611650486,
"grad_norm": 1.5169651851287749,
"learning_rate": 7.984448479804398e-05,
"loss": 0.6394,
"step": 65
},
{
"epoch": 0.6407766990291263,
"grad_norm": 1.9120046573933336,
"learning_rate": 7.981965785939515e-05,
"loss": 0.635,
"step": 66
},
{
"epoch": 0.6504854368932039,
"grad_norm": 1.7557422251639143,
"learning_rate": 7.97929976203968e-05,
"loss": 0.647,
"step": 67
},
{
"epoch": 0.6601941747572816,
"grad_norm": 1.3556049828157026,
"learning_rate": 7.976450530848851e-05,
"loss": 0.6374,
"step": 68
},
{
"epoch": 0.6699029126213593,
"grad_norm": 2.244626845581171,
"learning_rate": 7.973418223545874e-05,
"loss": 0.6334,
"step": 69
},
{
"epoch": 0.6796116504854369,
"grad_norm": 1.3591820052552963,
"learning_rate": 7.970202979738426e-05,
"loss": 0.6376,
"step": 70
},
{
"epoch": 0.6893203883495146,
"grad_norm": 1.5805931826326196,
"learning_rate": 7.966804947456599e-05,
"loss": 0.632,
"step": 71
},
{
"epoch": 0.6990291262135923,
"grad_norm": 1.334661947485613,
"learning_rate": 7.96322428314608e-05,
"loss": 0.6269,
"step": 72
},
{
"epoch": 0.7087378640776699,
"grad_norm": 1.8903308561526113,
"learning_rate": 7.959461151660952e-05,
"loss": 0.6342,
"step": 73
},
{
"epoch": 0.7184466019417476,
"grad_norm": 1.1038829068258371,
"learning_rate": 7.955515726256101e-05,
"loss": 0.6275,
"step": 74
},
{
"epoch": 0.7281553398058253,
"grad_norm": 1.5525916361050445,
"learning_rate": 7.951388188579237e-05,
"loss": 0.6343,
"step": 75
},
{
"epoch": 0.7378640776699029,
"grad_norm": 1.5390664333547033,
"learning_rate": 7.94707872866254e-05,
"loss": 0.6315,
"step": 76
},
{
"epoch": 0.7475728155339806,
"grad_norm": 1.769589829850563,
"learning_rate": 7.942587544913901e-05,
"loss": 0.6329,
"step": 77
},
{
"epoch": 0.7572815533980582,
"grad_norm": 1.4148162374660378,
"learning_rate": 7.937914844107791e-05,
"loss": 0.6297,
"step": 78
},
{
"epoch": 0.7669902912621359,
"grad_norm": 1.7973463567105696,
"learning_rate": 7.933060841375745e-05,
"loss": 0.627,
"step": 79
},
{
"epoch": 0.7766990291262136,
"grad_norm": 1.184326171503996,
"learning_rate": 7.928025760196447e-05,
"loss": 0.6234,
"step": 80
},
{
"epoch": 0.7864077669902912,
"grad_norm": 1.6486867376699348,
"learning_rate": 7.922809832385456e-05,
"loss": 0.6224,
"step": 81
},
{
"epoch": 0.7961165048543689,
"grad_norm": 1.2312763534767475,
"learning_rate": 7.917413298084519e-05,
"loss": 0.6207,
"step": 82
},
{
"epoch": 0.8058252427184466,
"grad_norm": 1.319948768871293,
"learning_rate": 7.911836405750525e-05,
"loss": 0.618,
"step": 83
},
{
"epoch": 0.8155339805825242,
"grad_norm": 1.2459374631380746,
"learning_rate": 7.906079412144055e-05,
"loss": 0.6215,
"step": 84
},
{
"epoch": 0.8252427184466019,
"grad_norm": 1.2798874980653692,
"learning_rate": 7.900142582317576e-05,
"loss": 0.6172,
"step": 85
},
{
"epoch": 0.8349514563106796,
"grad_norm": 2.093389549775017,
"learning_rate": 7.894026189603225e-05,
"loss": 0.618,
"step": 86
},
{
"epoch": 0.8446601941747572,
"grad_norm": 1.1300093559740532,
"learning_rate": 7.887730515600227e-05,
"loss": 0.6173,
"step": 87
},
{
"epoch": 0.8543689320388349,
"grad_norm": 2.3541947389099094,
"learning_rate": 7.881255850161939e-05,
"loss": 0.6243,
"step": 88
},
{
"epoch": 0.8640776699029126,
"grad_norm": 1.5613220102186438,
"learning_rate": 7.87460249138249e-05,
"loss": 0.623,
"step": 89
},
{
"epoch": 0.8737864077669902,
"grad_norm": 1.9104766159006328,
"learning_rate": 7.867770745583074e-05,
"loss": 0.6241,
"step": 90
},
{
"epoch": 0.883495145631068,
"grad_norm": 1.419392875826999,
"learning_rate": 7.860760927297833e-05,
"loss": 0.62,
"step": 91
},
{
"epoch": 0.8932038834951457,
"grad_norm": 1.3282640676407322,
"learning_rate": 7.853573359259381e-05,
"loss": 0.6166,
"step": 92
},
{
"epoch": 0.9029126213592233,
"grad_norm": 1.3250464130522686,
"learning_rate": 7.846208372383947e-05,
"loss": 0.6179,
"step": 93
},
{
"epoch": 0.912621359223301,
"grad_norm": 1.1377495249178353,
"learning_rate": 7.838666305756138e-05,
"loss": 0.6122,
"step": 94
},
{
"epoch": 0.9223300970873787,
"grad_norm": 1.4646244528960923,
"learning_rate": 7.830947506613324e-05,
"loss": 0.6105,
"step": 95
},
{
"epoch": 0.9320388349514563,
"grad_norm": 1.142509028160319,
"learning_rate": 7.823052330329663e-05,
"loss": 0.611,
"step": 96
},
{
"epoch": 0.941747572815534,
"grad_norm": 2.0274152032009494,
"learning_rate": 7.81498114039972e-05,
"loss": 0.616,
"step": 97
},
{
"epoch": 0.9514563106796117,
"grad_norm": 1.0686288368525192,
"learning_rate": 7.806734308421753e-05,
"loss": 0.6101,
"step": 98
},
{
"epoch": 0.9611650485436893,
"grad_norm": 2.0849531670929626,
"learning_rate": 7.798312214080588e-05,
"loss": 0.6128,
"step": 99
},
{
"epoch": 0.970873786407767,
"grad_norm": 1.723568607059033,
"learning_rate": 7.789715245130148e-05,
"loss": 0.6156,
"step": 100
},
{
"epoch": 0.9805825242718447,
"grad_norm": 1.0991520002736015,
"learning_rate": 7.780943797375594e-05,
"loss": 0.6028,
"step": 101
},
{
"epoch": 0.9902912621359223,
"grad_norm": 1.81475046862089,
"learning_rate": 7.77199827465511e-05,
"loss": 0.6176,
"step": 102
},
{
"epoch": 1.0,
"grad_norm": 1.0995121126222103,
"learning_rate": 7.762879088821302e-05,
"loss": 0.612,
"step": 103
},
{
"epoch": 1.0097087378640777,
"grad_norm": 1.1739346884276352,
"learning_rate": 7.753586659722243e-05,
"loss": 0.5928,
"step": 104
},
{
"epoch": 1.0194174757281553,
"grad_norm": 1.475480407854494,
"learning_rate": 7.74412141518214e-05,
"loss": 0.5989,
"step": 105
},
{
"epoch": 1.029126213592233,
"grad_norm": 1.3719226999649305,
"learning_rate": 7.734483790981636e-05,
"loss": 0.5969,
"step": 106
},
{
"epoch": 1.0388349514563107,
"grad_norm": 1.0726737236370019,
"learning_rate": 7.724674230837747e-05,
"loss": 0.5825,
"step": 107
},
{
"epoch": 1.0485436893203883,
"grad_norm": 1.3362590355075374,
"learning_rate": 7.714693186383437e-05,
"loss": 0.594,
"step": 108
},
{
"epoch": 1.058252427184466,
"grad_norm": 0.820555469002108,
"learning_rate": 7.704541117146819e-05,
"loss": 0.5874,
"step": 109
},
{
"epoch": 1.0679611650485437,
"grad_norm": 1.3323408029024517,
"learning_rate": 7.694218490530004e-05,
"loss": 0.5886,
"step": 110
},
{
"epoch": 1.0776699029126213,
"grad_norm": 1.066797450314409,
"learning_rate": 7.683725781787574e-05,
"loss": 0.5876,
"step": 111
},
{
"epoch": 1.087378640776699,
"grad_norm": 1.245072674676843,
"learning_rate": 7.673063474004715e-05,
"loss": 0.5879,
"step": 112
},
{
"epoch": 1.0970873786407767,
"grad_norm": 1.2819802358790717,
"learning_rate": 7.662232058074957e-05,
"loss": 0.5864,
"step": 113
},
{
"epoch": 1.1067961165048543,
"grad_norm": 1.436129834544921,
"learning_rate": 7.651232032677588e-05,
"loss": 0.5919,
"step": 114
},
{
"epoch": 1.116504854368932,
"grad_norm": 0.8175594211587869,
"learning_rate": 7.640063904254691e-05,
"loss": 0.5817,
"step": 115
},
{
"epoch": 1.1262135922330097,
"grad_norm": 1.687072132305568,
"learning_rate": 7.628728186987824e-05,
"loss": 0.5848,
"step": 116
},
{
"epoch": 1.1359223300970873,
"grad_norm": 1.1326446741047016,
"learning_rate": 7.617225402774348e-05,
"loss": 0.5858,
"step": 117
},
{
"epoch": 1.145631067961165,
"grad_norm": 0.82496802522063,
"learning_rate": 7.605556081203405e-05,
"loss": 0.5812,
"step": 118
},
{
"epoch": 1.1553398058252426,
"grad_norm": 1.250641283128446,
"learning_rate": 7.593720759531526e-05,
"loss": 0.5876,
"step": 119
},
{
"epoch": 1.1650485436893203,
"grad_norm": 1.1921582278913663,
"learning_rate": 7.581719982657903e-05,
"loss": 0.5842,
"step": 120
},
{
"epoch": 1.174757281553398,
"grad_norm": 1.2734225443412055,
"learning_rate": 7.569554303099296e-05,
"loss": 0.579,
"step": 121
},
{
"epoch": 1.1844660194174756,
"grad_norm": 1.1853318091968652,
"learning_rate": 7.557224280964603e-05,
"loss": 0.582,
"step": 122
},
{
"epoch": 1.1941747572815533,
"grad_norm": 1.152376249051554,
"learning_rate": 7.544730483929065e-05,
"loss": 0.5836,
"step": 123
},
{
"epoch": 1.203883495145631,
"grad_norm": 1.3041878863702414,
"learning_rate": 7.532073487208132e-05,
"loss": 0.5761,
"step": 124
},
{
"epoch": 1.2135922330097086,
"grad_norm": 1.3112880078778733,
"learning_rate": 7.519253873530986e-05,
"loss": 0.5836,
"step": 125
},
{
"epoch": 1.2233009708737863,
"grad_norm": 1.3474986040772934,
"learning_rate": 7.5062722331137e-05,
"loss": 0.5801,
"step": 126
},
{
"epoch": 1.233009708737864,
"grad_norm": 0.8129513286807843,
"learning_rate": 7.493129163632076e-05,
"loss": 0.5748,
"step": 127
},
{
"epoch": 1.2427184466019416,
"grad_norm": 1.1010576487307613,
"learning_rate": 7.479825270194124e-05,
"loss": 0.5842,
"step": 128
},
{
"epoch": 1.2524271844660193,
"grad_norm": 1.398672981702769,
"learning_rate": 7.466361165312199e-05,
"loss": 0.5877,
"step": 129
},
{
"epoch": 1.262135922330097,
"grad_norm": 1.2164255288325172,
"learning_rate": 7.452737468874809e-05,
"loss": 0.5826,
"step": 130
},
{
"epoch": 1.2718446601941746,
"grad_norm": 0.7866460429939296,
"learning_rate": 7.438954808118064e-05,
"loss": 0.5778,
"step": 131
},
{
"epoch": 1.2815533980582523,
"grad_norm": 1.2501923242149588,
"learning_rate": 7.425013817596812e-05,
"loss": 0.5797,
"step": 132
},
{
"epoch": 1.29126213592233,
"grad_norm": 1.3225455051065684,
"learning_rate": 7.41091513915541e-05,
"loss": 0.578,
"step": 133
},
{
"epoch": 1.3009708737864076,
"grad_norm": 1.0105911545481185,
"learning_rate": 7.396659421898183e-05,
"loss": 0.5754,
"step": 134
},
{
"epoch": 1.3106796116504853,
"grad_norm": 1.2271731398121772,
"learning_rate": 7.382247322159534e-05,
"loss": 0.5808,
"step": 135
},
{
"epoch": 1.3203883495145632,
"grad_norm": 1.1215866709600937,
"learning_rate": 7.367679503473732e-05,
"loss": 0.5738,
"step": 136
},
{
"epoch": 1.3300970873786409,
"grad_norm": 1.0715712124069652,
"learning_rate": 7.352956636544358e-05,
"loss": 0.5754,
"step": 137
},
{
"epoch": 1.3398058252427185,
"grad_norm": 0.9077358541837017,
"learning_rate": 7.338079399213424e-05,
"loss": 0.5711,
"step": 138
},
{
"epoch": 1.3495145631067962,
"grad_norm": 1.064355488383383,
"learning_rate": 7.32304847643017e-05,
"loss": 0.5698,
"step": 139
},
{
"epoch": 1.3592233009708738,
"grad_norm": 0.9384952345701946,
"learning_rate": 7.30786456021953e-05,
"loss": 0.5739,
"step": 140
},
{
"epoch": 1.3689320388349515,
"grad_norm": 1.1821144606047003,
"learning_rate": 7.292528349650262e-05,
"loss": 0.5701,
"step": 141
},
{
"epoch": 1.3786407766990292,
"grad_norm": 0.97575327357092,
"learning_rate": 7.277040550802776e-05,
"loss": 0.573,
"step": 142
},
{
"epoch": 1.3883495145631068,
"grad_norm": 1.1580935526594842,
"learning_rate": 7.261401876736611e-05,
"loss": 0.5751,
"step": 143
},
{
"epoch": 1.3980582524271845,
"grad_norm": 0.9292779139246362,
"learning_rate": 7.245613047457621e-05,
"loss": 0.5718,
"step": 144
},
{
"epoch": 1.4077669902912622,
"grad_norm": 1.2215840715546447,
"learning_rate": 7.229674789884813e-05,
"loss": 0.5749,
"step": 145
},
{
"epoch": 1.4174757281553398,
"grad_norm": 0.9138289618139581,
"learning_rate": 7.213587837816889e-05,
"loss": 0.5696,
"step": 146
},
{
"epoch": 1.4271844660194175,
"grad_norm": 1.0533170144837214,
"learning_rate": 7.197352931898454e-05,
"loss": 0.5726,
"step": 147
},
{
"epoch": 1.4368932038834952,
"grad_norm": 1.1454322229077762,
"learning_rate": 7.180970819585923e-05,
"loss": 0.5712,
"step": 148
},
{
"epoch": 1.4466019417475728,
"grad_norm": 1.0300828181718542,
"learning_rate": 7.164442255113107e-05,
"loss": 0.5717,
"step": 149
},
{
"epoch": 1.4563106796116505,
"grad_norm": 1.077162592235673,
"learning_rate": 7.147767999456484e-05,
"loss": 0.5785,
"step": 150
},
{
"epoch": 1.4660194174757282,
"grad_norm": 1.2410959167210567,
"learning_rate": 7.130948820300166e-05,
"loss": 0.5771,
"step": 151
},
{
"epoch": 1.4757281553398058,
"grad_norm": 1.2329275408989433,
"learning_rate": 7.113985492000558e-05,
"loss": 0.5683,
"step": 152
},
{
"epoch": 1.4854368932038835,
"grad_norm": 1.0776714897294795,
"learning_rate": 7.0968787955507e-05,
"loss": 0.5716,
"step": 153
},
{
"epoch": 1.4951456310679612,
"grad_norm": 0.8984266405483924,
"learning_rate": 7.079629518544312e-05,
"loss": 0.5726,
"step": 154
},
{
"epoch": 1.5048543689320388,
"grad_norm": 1.1047143317546186,
"learning_rate": 7.062238455139544e-05,
"loss": 0.5754,
"step": 155
},
{
"epoch": 1.5145631067961165,
"grad_norm": 1.3200995574966732,
"learning_rate": 7.044706406022393e-05,
"loss": 0.565,
"step": 156
},
{
"epoch": 1.5242718446601942,
"grad_norm": 0.795738070159204,
"learning_rate": 7.027034178369853e-05,
"loss": 0.57,
"step": 157
},
{
"epoch": 1.5339805825242718,
"grad_norm": 0.9946136430312941,
"learning_rate": 7.009222585812755e-05,
"loss": 0.5733,
"step": 158
},
{
"epoch": 1.5436893203883495,
"grad_norm": 1.376070028063213,
"learning_rate": 6.991272448398291e-05,
"loss": 0.5756,
"step": 159
},
{
"epoch": 1.5533980582524272,
"grad_norm": 0.7388882390316432,
"learning_rate": 6.973184592552283e-05,
"loss": 0.5654,
"step": 160
},
{
"epoch": 1.5631067961165048,
"grad_norm": 1.1905180684222885,
"learning_rate": 6.95495985104111e-05,
"loss": 0.5711,
"step": 161
},
{
"epoch": 1.5728155339805825,
"grad_norm": 0.8020799108353651,
"learning_rate": 6.93659906293338e-05,
"loss": 0.5623,
"step": 162
},
{
"epoch": 1.5825242718446602,
"grad_norm": 1.0231881194650578,
"learning_rate": 6.918103073561304e-05,
"loss": 0.57,
"step": 163
},
{
"epoch": 1.5922330097087378,
"grad_norm": 0.7935997277801292,
"learning_rate": 6.899472734481765e-05,
"loss": 0.5697,
"step": 164
},
{
"epoch": 1.6019417475728155,
"grad_norm": 1.0863918666939771,
"learning_rate": 6.880708903437116e-05,
"loss": 0.5649,
"step": 165
},
{
"epoch": 1.6116504854368932,
"grad_norm": 0.7946977064700003,
"learning_rate": 6.8618124443157e-05,
"loss": 0.5629,
"step": 166
},
{
"epoch": 1.6213592233009708,
"grad_norm": 0.7600587207085567,
"learning_rate": 6.842784227112057e-05,
"loss": 0.5652,
"step": 167
},
{
"epoch": 1.6310679611650487,
"grad_norm": 0.9222151118870435,
"learning_rate": 6.823625127886888e-05,
"loss": 0.5602,
"step": 168
},
{
"epoch": 1.6407766990291264,
"grad_norm": 0.8500423865673382,
"learning_rate": 6.804336028726706e-05,
"loss": 0.5638,
"step": 169
},
{
"epoch": 1.650485436893204,
"grad_norm": 1.009548584428779,
"learning_rate": 6.78491781770324e-05,
"loss": 0.5609,
"step": 170
},
{
"epoch": 1.6601941747572817,
"grad_norm": 0.8596245784399692,
"learning_rate": 6.765371388832531e-05,
"loss": 0.5606,
"step": 171
},
{
"epoch": 1.6699029126213594,
"grad_norm": 1.0991682854870917,
"learning_rate": 6.745697642033791e-05,
"loss": 0.5645,
"step": 172
},
{
"epoch": 1.679611650485437,
"grad_norm": 0.9305228321364903,
"learning_rate": 6.725897483087948e-05,
"loss": 0.5572,
"step": 173
},
{
"epoch": 1.6893203883495147,
"grad_norm": 0.9614358976051515,
"learning_rate": 6.705971823595964e-05,
"loss": 0.5616,
"step": 174
},
{
"epoch": 1.6990291262135924,
"grad_norm": 1.0715923701630468,
"learning_rate": 6.685921580936855e-05,
"loss": 0.5629,
"step": 175
},
{
"epoch": 1.70873786407767,
"grad_norm": 1.1630310321701869,
"learning_rate": 6.665747678225454e-05,
"loss": 0.565,
"step": 176
},
{
"epoch": 1.7184466019417477,
"grad_norm": 0.8590469522925508,
"learning_rate": 6.645451044269916e-05,
"loss": 0.5585,
"step": 177
},
{
"epoch": 1.7281553398058254,
"grad_norm": 0.7340387980261763,
"learning_rate": 6.62503261352895e-05,
"loss": 0.5613,
"step": 178
},
{
"epoch": 1.737864077669903,
"grad_norm": 0.710845727118427,
"learning_rate": 6.6044933260688e-05,
"loss": 0.5562,
"step": 179
},
{
"epoch": 1.7475728155339807,
"grad_norm": 0.8027866048243354,
"learning_rate": 6.583834127519966e-05,
"loss": 0.5616,
"step": 180
},
{
"epoch": 1.7572815533980584,
"grad_norm": 1.0941839867674634,
"learning_rate": 6.563055969033659e-05,
"loss": 0.5591,
"step": 181
},
{
"epoch": 1.766990291262136,
"grad_norm": 1.066378946036087,
"learning_rate": 6.54215980723802e-05,
"loss": 0.5612,
"step": 182
},
{
"epoch": 1.7766990291262137,
"grad_norm": 0.7626588888095535,
"learning_rate": 6.521146604194073e-05,
"loss": 0.5528,
"step": 183
},
{
"epoch": 1.7864077669902914,
"grad_norm": 0.8152238302302431,
"learning_rate": 6.500017327351425e-05,
"loss": 0.559,
"step": 184
},
{
"epoch": 1.796116504854369,
"grad_norm": 1.0106535990003993,
"learning_rate": 6.478772949503735e-05,
"loss": 0.5586,
"step": 185
},
{
"epoch": 1.8058252427184467,
"grad_norm": 1.3373530495717711,
"learning_rate": 6.457414448743922e-05,
"loss": 0.5602,
"step": 186
},
{
"epoch": 1.8155339805825244,
"grad_norm": 0.6183996673247697,
"learning_rate": 6.435942808419129e-05,
"loss": 0.5563,
"step": 187
},
{
"epoch": 1.825242718446602,
"grad_norm": 0.4513128772050125,
"learning_rate": 6.41435901708546e-05,
"loss": 0.5558,
"step": 188
},
{
"epoch": 1.8349514563106797,
"grad_norm": 0.8068976523956712,
"learning_rate": 6.392664068462455e-05,
"loss": 0.5535,
"step": 189
},
{
"epoch": 1.8446601941747574,
"grad_norm": 1.1435206608085113,
"learning_rate": 6.370858961387348e-05,
"loss": 0.5589,
"step": 190
},
{
"epoch": 1.854368932038835,
"grad_norm": 0.9238122145618127,
"learning_rate": 6.348944699769078e-05,
"loss": 0.5521,
"step": 191
},
{
"epoch": 1.8640776699029127,
"grad_norm": 0.7049348477803952,
"learning_rate": 6.326922292542067e-05,
"loss": 0.5533,
"step": 192
},
{
"epoch": 1.8737864077669903,
"grad_norm": 0.4705716067710833,
"learning_rate": 6.304792753619768e-05,
"loss": 0.555,
"step": 193
},
{
"epoch": 1.883495145631068,
"grad_norm": 0.5562958157730256,
"learning_rate": 6.282557101847989e-05,
"loss": 0.554,
"step": 194
},
{
"epoch": 1.8932038834951457,
"grad_norm": 0.7776677254914264,
"learning_rate": 6.260216360957982e-05,
"loss": 0.5554,
"step": 195
},
{
"epoch": 1.9029126213592233,
"grad_norm": 0.9626928256285646,
"learning_rate": 6.237771559519309e-05,
"loss": 0.557,
"step": 196
},
{
"epoch": 1.912621359223301,
"grad_norm": 1.0125011253597305,
"learning_rate": 6.215223730892488e-05,
"loss": 0.5576,
"step": 197
},
{
"epoch": 1.9223300970873787,
"grad_norm": 0.8791857222410507,
"learning_rate": 6.192573913181423e-05,
"loss": 0.5601,
"step": 198
},
{
"epoch": 1.9320388349514563,
"grad_norm": 0.7995737202735174,
"learning_rate": 6.169823149185594e-05,
"loss": 0.558,
"step": 199
},
{
"epoch": 1.941747572815534,
"grad_norm": 1.1194920590127924,
"learning_rate": 6.146972486352062e-05,
"loss": 0.5547,
"step": 200
},
{
"epoch": 1.9514563106796117,
"grad_norm": 0.8287113779997599,
"learning_rate": 6.124022976727246e-05,
"loss": 0.5562,
"step": 201
},
{
"epoch": 1.9611650485436893,
"grad_norm": 0.5720384230610551,
"learning_rate": 6.1009756769084625e-05,
"loss": 0.5549,
"step": 202
},
{
"epoch": 1.970873786407767,
"grad_norm": 0.9412252549729389,
"learning_rate": 6.077831647995312e-05,
"loss": 0.5561,
"step": 203
},
{
"epoch": 1.9805825242718447,
"grad_norm": 1.1651868503900755,
"learning_rate": 6.0545919555408026e-05,
"loss": 0.5549,
"step": 204
},
{
"epoch": 1.9902912621359223,
"grad_norm": 0.6059833274896135,
"learning_rate": 6.0312576695023015e-05,
"loss": 0.5532,
"step": 205
},
{
"epoch": 2.0,
"grad_norm": 0.7944875296787923,
"learning_rate": 6.007829864192274e-05,
"loss": 0.5471,
"step": 206
},
{
"epoch": 2.0097087378640777,
"grad_norm": 0.9861185591611118,
"learning_rate": 5.9843096182288184e-05,
"loss": 0.5254,
"step": 207
},
{
"epoch": 2.0194174757281553,
"grad_norm": 0.9193682360720784,
"learning_rate": 5.960698014486009e-05,
"loss": 0.5228,
"step": 208
},
{
"epoch": 2.029126213592233,
"grad_norm": 0.8942942482878242,
"learning_rate": 5.936996140044041e-05,
"loss": 0.5213,
"step": 209
},
{
"epoch": 2.0388349514563107,
"grad_norm": 0.9820673732498754,
"learning_rate": 5.9132050861391774e-05,
"loss": 0.5281,
"step": 210
},
{
"epoch": 2.0485436893203883,
"grad_norm": 1.124245573357021,
"learning_rate": 5.889325948113513e-05,
"loss": 0.5252,
"step": 211
},
{
"epoch": 2.058252427184466,
"grad_norm": 0.9569857392646174,
"learning_rate": 5.865359825364543e-05,
"loss": 0.5295,
"step": 212
},
{
"epoch": 2.0679611650485437,
"grad_norm": 0.6979418510578903,
"learning_rate": 5.841307821294546e-05,
"loss": 0.5212,
"step": 213
},
{
"epoch": 2.0776699029126213,
"grad_norm": 0.8030912411204089,
"learning_rate": 5.8171710432597824e-05,
"loss": 0.525,
"step": 214
},
{
"epoch": 2.087378640776699,
"grad_norm": 1.0921718953136728,
"learning_rate": 5.792950602519516e-05,
"loss": 0.5249,
"step": 215
},
{
"epoch": 2.0970873786407767,
"grad_norm": 0.6815361141499702,
"learning_rate": 5.768647614184846e-05,
"loss": 0.5256,
"step": 216
},
{
"epoch": 2.1067961165048543,
"grad_norm": 0.5765577764382596,
"learning_rate": 5.744263197167369e-05,
"loss": 0.5196,
"step": 217
},
{
"epoch": 2.116504854368932,
"grad_norm": 0.8497374889157474,
"learning_rate": 5.719798474127668e-05,
"loss": 0.5245,
"step": 218
},
{
"epoch": 2.1262135922330097,
"grad_norm": 0.8473330942862705,
"learning_rate": 5.69525457142362e-05,
"loss": 0.5266,
"step": 219
},
{
"epoch": 2.1359223300970873,
"grad_norm": 0.7383851235625609,
"learning_rate": 5.6706326190585416e-05,
"loss": 0.5241,
"step": 220
},
{
"epoch": 2.145631067961165,
"grad_norm": 0.731625229205727,
"learning_rate": 5.6459337506291594e-05,
"loss": 0.5214,
"step": 221
},
{
"epoch": 2.1553398058252426,
"grad_norm": 0.6257327662890664,
"learning_rate": 5.621159103273424e-05,
"loss": 0.521,
"step": 222
},
{
"epoch": 2.1650485436893203,
"grad_norm": 0.5394134102082522,
"learning_rate": 5.596309817618156e-05,
"loss": 0.5202,
"step": 223
},
{
"epoch": 2.174757281553398,
"grad_norm": 0.5108441929525305,
"learning_rate": 5.571387037726524e-05,
"loss": 0.5243,
"step": 224
},
{
"epoch": 2.1844660194174756,
"grad_norm": 0.6078187548666663,
"learning_rate": 5.5463919110453836e-05,
"loss": 0.5196,
"step": 225
},
{
"epoch": 2.1941747572815533,
"grad_norm": 0.663473200149725,
"learning_rate": 5.521325588352437e-05,
"loss": 0.5194,
"step": 226
},
{
"epoch": 2.203883495145631,
"grad_norm": 0.6429831831672274,
"learning_rate": 5.496189223703262e-05,
"loss": 0.5173,
"step": 227
},
{
"epoch": 2.2135922330097086,
"grad_norm": 0.6220750410182839,
"learning_rate": 5.47098397437817e-05,
"loss": 0.5209,
"step": 228
},
{
"epoch": 2.2233009708737863,
"grad_norm": 0.5886484774766658,
"learning_rate": 5.4457110008289306e-05,
"loss": 0.5155,
"step": 229
},
{
"epoch": 2.233009708737864,
"grad_norm": 0.4437653008171185,
"learning_rate": 5.420371466625339e-05,
"loss": 0.5185,
"step": 230
},
{
"epoch": 2.2427184466019416,
"grad_norm": 0.39870906238532716,
"learning_rate": 5.3949665384016556e-05,
"loss": 0.5217,
"step": 231
},
{
"epoch": 2.2524271844660193,
"grad_norm": 0.5717388200227635,
"learning_rate": 5.369497385802877e-05,
"loss": 0.5196,
"step": 232
},
{
"epoch": 2.262135922330097,
"grad_norm": 0.6577624762016366,
"learning_rate": 5.3439651814309044e-05,
"loss": 0.5182,
"step": 233
},
{
"epoch": 2.2718446601941746,
"grad_norm": 0.6946628514135419,
"learning_rate": 5.3183711007905434e-05,
"loss": 0.5188,
"step": 234
},
{
"epoch": 2.2815533980582523,
"grad_norm": 0.6759552411316988,
"learning_rate": 5.2927163222353876e-05,
"loss": 0.5171,
"step": 235
},
{
"epoch": 2.29126213592233,
"grad_norm": 0.6300902669161529,
"learning_rate": 5.2670020269135703e-05,
"loss": 0.5224,
"step": 236
},
{
"epoch": 2.3009708737864076,
"grad_norm": 0.5185045096628276,
"learning_rate": 5.241229398713379e-05,
"loss": 0.5268,
"step": 237
},
{
"epoch": 2.3106796116504853,
"grad_norm": 0.4449400609212431,
"learning_rate": 5.2153996242087544e-05,
"loss": 0.5207,
"step": 238
},
{
"epoch": 2.320388349514563,
"grad_norm": 0.4056127362971999,
"learning_rate": 5.1895138926046553e-05,
"loss": 0.5207,
"step": 239
},
{
"epoch": 2.3300970873786406,
"grad_norm": 0.31876595932444957,
"learning_rate": 5.16357339568231e-05,
"loss": 0.5233,
"step": 240
},
{
"epoch": 2.3398058252427183,
"grad_norm": 0.3826214088490694,
"learning_rate": 5.13757932774435e-05,
"loss": 0.5229,
"step": 241
},
{
"epoch": 2.349514563106796,
"grad_norm": 0.39293509694649387,
"learning_rate": 5.111532885559816e-05,
"loss": 0.517,
"step": 242
},
{
"epoch": 2.3592233009708736,
"grad_norm": 0.3322451238923787,
"learning_rate": 5.08543526830907e-05,
"loss": 0.5183,
"step": 243
},
{
"epoch": 2.3689320388349513,
"grad_norm": 0.35774381456905935,
"learning_rate": 5.05928767752857e-05,
"loss": 0.5207,
"step": 244
},
{
"epoch": 2.378640776699029,
"grad_norm": 0.3490143073371343,
"learning_rate": 5.033091317055565e-05,
"loss": 0.5185,
"step": 245
},
{
"epoch": 2.3883495145631066,
"grad_norm": 0.3488449914720745,
"learning_rate": 5.006847392972664e-05,
"loss": 0.5233,
"step": 246
},
{
"epoch": 2.3980582524271843,
"grad_norm": 0.4080875571104722,
"learning_rate": 4.9805571135523066e-05,
"loss": 0.5196,
"step": 247
},
{
"epoch": 2.407766990291262,
"grad_norm": 0.3959917551246165,
"learning_rate": 4.954221689201138e-05,
"loss": 0.5194,
"step": 248
},
{
"epoch": 2.4174757281553396,
"grad_norm": 0.331330529722134,
"learning_rate": 4.9278423324042776e-05,
"loss": 0.5175,
"step": 249
},
{
"epoch": 2.4271844660194173,
"grad_norm": 0.3809362865456529,
"learning_rate": 4.901420257669501e-05,
"loss": 0.5195,
"step": 250
},
{
"epoch": 2.436893203883495,
"grad_norm": 0.3961884838888781,
"learning_rate": 4.8749566814713204e-05,
"loss": 0.5203,
"step": 251
},
{
"epoch": 2.4466019417475726,
"grad_norm": 0.28927135799547965,
"learning_rate": 4.848452822194977e-05,
"loss": 0.522,
"step": 252
},
{
"epoch": 2.4563106796116507,
"grad_norm": 0.32321223873962246,
"learning_rate": 4.821909900080348e-05,
"loss": 0.5192,
"step": 253
},
{
"epoch": 2.466019417475728,
"grad_norm": 0.3034931174843224,
"learning_rate": 4.7953291371657724e-05,
"loss": 0.5214,
"step": 254
},
{
"epoch": 2.475728155339806,
"grad_norm": 0.2955075395432886,
"learning_rate": 4.768711757231775e-05,
"loss": 0.5197,
"step": 255
},
{
"epoch": 2.4854368932038833,
"grad_norm": 0.33319864095909213,
"learning_rate": 4.742058985744738e-05,
"loss": 0.5199,
"step": 256
},
{
"epoch": 2.4951456310679614,
"grad_norm": 0.3018636044546277,
"learning_rate": 4.715372049800467e-05,
"loss": 0.5234,
"step": 257
},
{
"epoch": 2.5048543689320386,
"grad_norm": 0.28068625121120355,
"learning_rate": 4.688652178067708e-05,
"loss": 0.5223,
"step": 258
},
{
"epoch": 2.5145631067961167,
"grad_norm": 0.3229351457016303,
"learning_rate": 4.661900600731571e-05,
"loss": 0.5233,
"step": 259
},
{
"epoch": 2.524271844660194,
"grad_norm": 0.3762838763816941,
"learning_rate": 4.635118549436895e-05,
"loss": 0.5238,
"step": 260
},
{
"epoch": 2.533980582524272,
"grad_norm": 0.36897391891079456,
"learning_rate": 4.608307257231541e-05,
"loss": 0.5184,
"step": 261
},
{
"epoch": 2.5436893203883493,
"grad_norm": 0.3195793897829173,
"learning_rate": 4.5814679585096265e-05,
"loss": 0.5169,
"step": 262
},
{
"epoch": 2.5533980582524274,
"grad_norm": 0.2773810615279961,
"learning_rate": 4.5546018889546876e-05,
"loss": 0.5168,
"step": 263
},
{
"epoch": 2.5631067961165046,
"grad_norm": 0.3441559401937381,
"learning_rate": 4.527710285482799e-05,
"loss": 0.517,
"step": 264
},
{
"epoch": 2.5728155339805827,
"grad_norm": 0.37273218540100866,
"learning_rate": 4.500794386185609e-05,
"loss": 0.5185,
"step": 265
},
{
"epoch": 2.58252427184466,
"grad_norm": 0.26503820958744123,
"learning_rate": 4.473855430273355e-05,
"loss": 0.5164,
"step": 266
},
{
"epoch": 2.592233009708738,
"grad_norm": 0.2934088823294493,
"learning_rate": 4.4468946580178026e-05,
"loss": 0.5127,
"step": 267
},
{
"epoch": 2.6019417475728153,
"grad_norm": 0.3186295434544236,
"learning_rate": 4.4199133106951407e-05,
"loss": 0.5173,
"step": 268
},
{
"epoch": 2.6116504854368934,
"grad_norm": 0.3309244613515348,
"learning_rate": 4.3929126305288364e-05,
"loss": 0.5229,
"step": 269
},
{
"epoch": 2.6213592233009706,
"grad_norm": 0.26814510063287106,
"learning_rate": 4.365893860632444e-05,
"loss": 0.5167,
"step": 270
},
{
"epoch": 2.6310679611650487,
"grad_norm": 0.3074091286659034,
"learning_rate": 4.338858244952369e-05,
"loss": 0.5156,
"step": 271
},
{
"epoch": 2.6407766990291264,
"grad_norm": 0.3823340679989687,
"learning_rate": 4.3118070282106e-05,
"loss": 0.5168,
"step": 272
},
{
"epoch": 2.650485436893204,
"grad_norm": 0.47904147679754805,
"learning_rate": 4.2847414558473987e-05,
"loss": 0.5184,
"step": 273
},
{
"epoch": 2.6601941747572817,
"grad_norm": 0.4269268816899063,
"learning_rate": 4.257662773963961e-05,
"loss": 0.5173,
"step": 274
},
{
"epoch": 2.6699029126213594,
"grad_norm": 0.3094464875254195,
"learning_rate": 4.230572229265045e-05,
"loss": 0.5142,
"step": 275
},
{
"epoch": 2.679611650485437,
"grad_norm": 0.31791889845655724,
"learning_rate": 4.2034710690015766e-05,
"loss": 0.517,
"step": 276
},
{
"epoch": 2.6893203883495147,
"grad_norm": 0.4064644387432894,
"learning_rate": 4.17636054091322e-05,
"loss": 0.516,
"step": 277
},
{
"epoch": 2.6990291262135924,
"grad_norm": 0.3053310406953197,
"learning_rate": 4.1492418931709366e-05,
"loss": 0.5175,
"step": 278
},
{
"epoch": 2.70873786407767,
"grad_norm": 0.2765928946593284,
"learning_rate": 4.1221163743195175e-05,
"loss": 0.5185,
"step": 279
},
{
"epoch": 2.7184466019417477,
"grad_norm": 0.3378318482897848,
"learning_rate": 4.094985233220098e-05,
"loss": 0.5211,
"step": 280
},
{
"epoch": 2.7281553398058254,
"grad_norm": 0.3143734127353884,
"learning_rate": 4.067849718992665e-05,
"loss": 0.5197,
"step": 281
},
{
"epoch": 2.737864077669903,
"grad_norm": 0.259820524121846,
"learning_rate": 4.040711080958547e-05,
"loss": 0.5259,
"step": 282
},
{
"epoch": 2.7475728155339807,
"grad_norm": 0.32611605738060934,
"learning_rate": 4.013570568582883e-05,
"loss": 0.5174,
"step": 283
},
{
"epoch": 2.7572815533980584,
"grad_norm": 0.29760137014246574,
"learning_rate": 3.986429431417118e-05,
"loss": 0.5124,
"step": 284
},
{
"epoch": 2.766990291262136,
"grad_norm": 0.26849482707048117,
"learning_rate": 3.959288919041455e-05,
"loss": 0.5116,
"step": 285
},
{
"epoch": 2.7766990291262137,
"grad_norm": 0.28358089368587186,
"learning_rate": 3.9321502810073354e-05,
"loss": 0.5179,
"step": 286
},
{
"epoch": 2.7864077669902914,
"grad_norm": 0.3098687865760963,
"learning_rate": 3.905014766779904e-05,
"loss": 0.5148,
"step": 287
},
{
"epoch": 2.796116504854369,
"grad_norm": 0.3018433729775123,
"learning_rate": 3.8778836256804845e-05,
"loss": 0.5165,
"step": 288
},
{
"epoch": 2.8058252427184467,
"grad_norm": 0.23893414660568268,
"learning_rate": 3.850758106829065e-05,
"loss": 0.5161,
"step": 289
},
{
"epoch": 2.8155339805825244,
"grad_norm": 0.2524448827230724,
"learning_rate": 3.823639459086781e-05,
"loss": 0.5152,
"step": 290
},
{
"epoch": 2.825242718446602,
"grad_norm": 0.2770033187618851,
"learning_rate": 3.796528930998425e-05,
"loss": 0.5145,
"step": 291
},
{
"epoch": 2.8349514563106797,
"grad_norm": 0.25224488768181713,
"learning_rate": 3.769427770734955e-05,
"loss": 0.5195,
"step": 292
},
{
"epoch": 2.8446601941747574,
"grad_norm": 0.27486298363736394,
"learning_rate": 3.742337226036041e-05,
"loss": 0.5157,
"step": 293
},
{
"epoch": 2.854368932038835,
"grad_norm": 0.23960082819762807,
"learning_rate": 3.715258544152603e-05,
"loss": 0.5154,
"step": 294
},
{
"epoch": 2.8640776699029127,
"grad_norm": 0.2710779329439077,
"learning_rate": 3.688192971789401e-05,
"loss": 0.5199,
"step": 295
},
{
"epoch": 2.8737864077669903,
"grad_norm": 0.256469645800299,
"learning_rate": 3.6611417550476324e-05,
"loss": 0.5149,
"step": 296
},
{
"epoch": 2.883495145631068,
"grad_norm": 0.3010842522468919,
"learning_rate": 3.6341061393675574e-05,
"loss": 0.5188,
"step": 297
},
{
"epoch": 2.8932038834951457,
"grad_norm": 0.24615248139015927,
"learning_rate": 3.607087369471164e-05,
"loss": 0.5142,
"step": 298
},
{
"epoch": 2.9029126213592233,
"grad_norm": 0.3025647213003904,
"learning_rate": 3.580086689304861e-05,
"loss": 0.5188,
"step": 299
},
{
"epoch": 2.912621359223301,
"grad_norm": 0.2506413395712758,
"learning_rate": 3.553105341982198e-05,
"loss": 0.5127,
"step": 300
},
{
"epoch": 2.9223300970873787,
"grad_norm": 0.26488744144075266,
"learning_rate": 3.526144569726647e-05,
"loss": 0.5148,
"step": 301
},
{
"epoch": 2.9320388349514563,
"grad_norm": 0.23180961021643326,
"learning_rate": 3.499205613814393e-05,
"loss": 0.518,
"step": 302
},
{
"epoch": 2.941747572815534,
"grad_norm": 0.24614944764274665,
"learning_rate": 3.472289714517203e-05,
"loss": 0.5119,
"step": 303
},
{
"epoch": 2.9514563106796117,
"grad_norm": 0.22977263011082621,
"learning_rate": 3.445398111045313e-05,
"loss": 0.5184,
"step": 304
},
{
"epoch": 2.9611650485436893,
"grad_norm": 0.21286238167178875,
"learning_rate": 3.418532041490375e-05,
"loss": 0.517,
"step": 305
},
{
"epoch": 2.970873786407767,
"grad_norm": 0.2548130433665424,
"learning_rate": 3.3916927427684595e-05,
"loss": 0.5186,
"step": 306
},
{
"epoch": 2.9805825242718447,
"grad_norm": 0.31989149824666413,
"learning_rate": 3.364881450563106e-05,
"loss": 0.5153,
"step": 307
},
{
"epoch": 2.9902912621359223,
"grad_norm": 0.2762238599236643,
"learning_rate": 3.338099399268429e-05,
"loss": 0.5167,
"step": 308
},
{
"epoch": 3.0,
"grad_norm": 0.20481446931191682,
"learning_rate": 3.311347821932292e-05,
"loss": 0.5092,
"step": 309
},
{
"epoch": 3.0097087378640777,
"grad_norm": 0.22631177358316332,
"learning_rate": 3.284627950199535e-05,
"loss": 0.4909,
"step": 310
},
{
"epoch": 3.0194174757281553,
"grad_norm": 0.21905855651481868,
"learning_rate": 3.2579410142552646e-05,
"loss": 0.4889,
"step": 311
},
{
"epoch": 3.029126213592233,
"grad_norm": 0.268856232171971,
"learning_rate": 3.231288242768226e-05,
"loss": 0.4882,
"step": 312
},
{
"epoch": 3.0388349514563107,
"grad_norm": 0.2631480748788351,
"learning_rate": 3.204670862834228e-05,
"loss": 0.4822,
"step": 313
},
{
"epoch": 3.0485436893203883,
"grad_norm": 0.28121213579534965,
"learning_rate": 3.178090099919653e-05,
"loss": 0.4848,
"step": 314
},
{
"epoch": 3.058252427184466,
"grad_norm": 0.2553829402200111,
"learning_rate": 3.1515471778050246e-05,
"loss": 0.4853,
"step": 315
},
{
"epoch": 3.0679611650485437,
"grad_norm": 0.29584988541778207,
"learning_rate": 3.12504331852868e-05,
"loss": 0.4835,
"step": 316
},
{
"epoch": 3.0776699029126213,
"grad_norm": 0.21912251520340542,
"learning_rate": 3.098579742330499e-05,
"loss": 0.4866,
"step": 317
},
{
"epoch": 3.087378640776699,
"grad_norm": 0.2816186665047795,
"learning_rate": 3.0721576675957224e-05,
"loss": 0.4856,
"step": 318
},
{
"epoch": 3.0970873786407767,
"grad_norm": 0.24175134230414194,
"learning_rate": 3.0457783107988642e-05,
"loss": 0.4886,
"step": 319
},
{
"epoch": 3.1067961165048543,
"grad_norm": 0.22350452741221052,
"learning_rate": 3.0194428864476947e-05,
"loss": 0.4836,
"step": 320
},
{
"epoch": 3.116504854368932,
"grad_norm": 0.25073180546811125,
"learning_rate": 2.9931526070273374e-05,
"loss": 0.4884,
"step": 321
},
{
"epoch": 3.1262135922330097,
"grad_norm": 0.19079613465758094,
"learning_rate": 2.9669086829444364e-05,
"loss": 0.485,
"step": 322
},
{
"epoch": 3.1359223300970873,
"grad_norm": 0.23960288437553956,
"learning_rate": 2.9407123224714312e-05,
"loss": 0.4856,
"step": 323
},
{
"epoch": 3.145631067961165,
"grad_norm": 0.22627618578886288,
"learning_rate": 2.9145647316909306e-05,
"loss": 0.4862,
"step": 324
},
{
"epoch": 3.1553398058252426,
"grad_norm": 0.1960335183955326,
"learning_rate": 2.8884671144401833e-05,
"loss": 0.4869,
"step": 325
},
{
"epoch": 3.1650485436893203,
"grad_norm": 0.2079951634137142,
"learning_rate": 2.8624206722556508e-05,
"loss": 0.491,
"step": 326
},
{
"epoch": 3.174757281553398,
"grad_norm": 0.18132268629371445,
"learning_rate": 2.8364266043176897e-05,
"loss": 0.4876,
"step": 327
},
{
"epoch": 3.1844660194174756,
"grad_norm": 0.18068511996455502,
"learning_rate": 2.810486107395347e-05,
"loss": 0.4858,
"step": 328
},
{
"epoch": 3.1941747572815533,
"grad_norm": 0.19264695463261375,
"learning_rate": 2.7846003757912473e-05,
"loss": 0.4891,
"step": 329
},
{
"epoch": 3.203883495145631,
"grad_norm": 0.16897146599425877,
"learning_rate": 2.7587706012866227e-05,
"loss": 0.4868,
"step": 330
},
{
"epoch": 3.2135922330097086,
"grad_norm": 0.2264234031229311,
"learning_rate": 2.7329979730864313e-05,
"loss": 0.4844,
"step": 331
},
{
"epoch": 3.2233009708737863,
"grad_norm": 0.2433703396011944,
"learning_rate": 2.707283677764613e-05,
"loss": 0.4855,
"step": 332
},
{
"epoch": 3.233009708737864,
"grad_norm": 0.1780724623798919,
"learning_rate": 2.6816288992094573e-05,
"loss": 0.4853,
"step": 333
},
{
"epoch": 3.2427184466019416,
"grad_norm": 0.2550763857085153,
"learning_rate": 2.6560348185690956e-05,
"loss": 0.4865,
"step": 334
},
{
"epoch": 3.2524271844660193,
"grad_norm": 0.18943111426329542,
"learning_rate": 2.6305026141971227e-05,
"loss": 0.4869,
"step": 335
},
{
"epoch": 3.262135922330097,
"grad_norm": 0.21284295474198855,
"learning_rate": 2.6050334615983467e-05,
"loss": 0.4872,
"step": 336
},
{
"epoch": 3.2718446601941746,
"grad_norm": 0.22131826031463572,
"learning_rate": 2.5796285333746615e-05,
"loss": 0.4816,
"step": 337
},
{
"epoch": 3.2815533980582523,
"grad_norm": 0.182135484368646,
"learning_rate": 2.554288999171072e-05,
"loss": 0.4915,
"step": 338
},
{
"epoch": 3.29126213592233,
"grad_norm": 0.23761469952772257,
"learning_rate": 2.5290160256218313e-05,
"loss": 0.4853,
"step": 339
},
{
"epoch": 3.3009708737864076,
"grad_norm": 0.1840898568254873,
"learning_rate": 2.5038107762967393e-05,
"loss": 0.4883,
"step": 340
},
{
"epoch": 3.3106796116504853,
"grad_norm": 0.21359795567551282,
"learning_rate": 2.4786744116475638e-05,
"loss": 0.4871,
"step": 341
},
{
"epoch": 3.320388349514563,
"grad_norm": 0.24890926664546134,
"learning_rate": 2.4536080889546177e-05,
"loss": 0.489,
"step": 342
},
{
"epoch": 3.3300970873786406,
"grad_norm": 0.20782018056580667,
"learning_rate": 2.4286129622734764e-05,
"loss": 0.4844,
"step": 343
},
{
"epoch": 3.3398058252427183,
"grad_norm": 0.21924373313456091,
"learning_rate": 2.4036901823818454e-05,
"loss": 0.4863,
"step": 344
},
{
"epoch": 3.349514563106796,
"grad_norm": 0.19867052689304365,
"learning_rate": 2.378840896726577e-05,
"loss": 0.4908,
"step": 345
},
{
"epoch": 3.3592233009708736,
"grad_norm": 0.2327076797489768,
"learning_rate": 2.3540662493708423e-05,
"loss": 0.4869,
"step": 346
},
{
"epoch": 3.3689320388349513,
"grad_norm": 0.17819481518798666,
"learning_rate": 2.3293673809414598e-05,
"loss": 0.4816,
"step": 347
},
{
"epoch": 3.378640776699029,
"grad_norm": 0.17940622231016476,
"learning_rate": 2.3047454285763793e-05,
"loss": 0.4874,
"step": 348
},
{
"epoch": 3.3883495145631066,
"grad_norm": 0.16039118497959268,
"learning_rate": 2.2802015258723324e-05,
"loss": 0.4869,
"step": 349
},
{
"epoch": 3.3980582524271843,
"grad_norm": 0.17049865419413107,
"learning_rate": 2.2557368028326324e-05,
"loss": 0.4845,
"step": 350
},
{
"epoch": 3.407766990291262,
"grad_norm": 0.18902552117069968,
"learning_rate": 2.2313523858151554e-05,
"loss": 0.4872,
"step": 351
},
{
"epoch": 3.4174757281553396,
"grad_norm": 0.15937546411881842,
"learning_rate": 2.207049397480485e-05,
"loss": 0.4845,
"step": 352
},
{
"epoch": 3.4271844660194173,
"grad_norm": 0.19899647930250075,
"learning_rate": 2.1828289567402173e-05,
"loss": 0.4874,
"step": 353
},
{
"epoch": 3.436893203883495,
"grad_norm": 0.17339320655418355,
"learning_rate": 2.1586921787054564e-05,
"loss": 0.4871,
"step": 354
},
{
"epoch": 3.4466019417475726,
"grad_norm": 0.17543747218166306,
"learning_rate": 2.1346401746354576e-05,
"loss": 0.4828,
"step": 355
},
{
"epoch": 3.4563106796116507,
"grad_norm": 0.17798441146417476,
"learning_rate": 2.110674051886488e-05,
"loss": 0.4838,
"step": 356
},
{
"epoch": 3.466019417475728,
"grad_norm": 0.16029275511201987,
"learning_rate": 2.0867949138608242e-05,
"loss": 0.4841,
"step": 357
},
{
"epoch": 3.475728155339806,
"grad_norm": 0.1663439154413797,
"learning_rate": 2.06300385995596e-05,
"loss": 0.4865,
"step": 358
},
{
"epoch": 3.4854368932038833,
"grad_norm": 0.16712634012536365,
"learning_rate": 2.0393019855139915e-05,
"loss": 0.4833,
"step": 359
},
{
"epoch": 3.4951456310679614,
"grad_norm": 0.17251097817233577,
"learning_rate": 2.0156903817711812e-05,
"loss": 0.485,
"step": 360
},
{
"epoch": 3.5048543689320386,
"grad_norm": 0.17262950471482985,
"learning_rate": 1.9921701358077265e-05,
"loss": 0.4846,
"step": 361
},
{
"epoch": 3.5145631067961167,
"grad_norm": 0.17329043714528009,
"learning_rate": 1.9687423304976994e-05,
"loss": 0.4835,
"step": 362
},
{
"epoch": 3.524271844660194,
"grad_norm": 0.16219535331159693,
"learning_rate": 1.9454080444591998e-05,
"loss": 0.4849,
"step": 363
},
{
"epoch": 3.533980582524272,
"grad_norm": 0.16515501782595504,
"learning_rate": 1.9221683520046892e-05,
"loss": 0.4857,
"step": 364
},
{
"epoch": 3.5436893203883493,
"grad_norm": 0.19734035971441835,
"learning_rate": 1.899024323091539e-05,
"loss": 0.4836,
"step": 365
},
{
"epoch": 3.5533980582524274,
"grad_norm": 0.16333328213007223,
"learning_rate": 1.875977023272757e-05,
"loss": 0.485,
"step": 366
},
{
"epoch": 3.5631067961165046,
"grad_norm": 0.18228849361538985,
"learning_rate": 1.853027513647937e-05,
"loss": 0.4869,
"step": 367
},
{
"epoch": 3.5728155339805827,
"grad_norm": 0.14382298808613353,
"learning_rate": 1.8301768508144078e-05,
"loss": 0.4837,
"step": 368
},
{
"epoch": 3.58252427184466,
"grad_norm": 0.1436393142572651,
"learning_rate": 1.8074260868185784e-05,
"loss": 0.4828,
"step": 369
},
{
"epoch": 3.592233009708738,
"grad_norm": 0.16780032690913915,
"learning_rate": 1.7847762691075115e-05,
"loss": 0.487,
"step": 370
},
{
"epoch": 3.6019417475728153,
"grad_norm": 0.14947935740806928,
"learning_rate": 1.762228440480692e-05,
"loss": 0.4845,
"step": 371
},
{
"epoch": 3.6116504854368934,
"grad_norm": 0.14559731703552367,
"learning_rate": 1.7397836390420192e-05,
"loss": 0.4836,
"step": 372
},
{
"epoch": 3.6213592233009706,
"grad_norm": 0.1506762657862038,
"learning_rate": 1.717442898152012e-05,
"loss": 0.4855,
"step": 373
},
{
"epoch": 3.6310679611650487,
"grad_norm": 0.13248764059764206,
"learning_rate": 1.6952072463802326e-05,
"loss": 0.4848,
"step": 374
},
{
"epoch": 3.6407766990291264,
"grad_norm": 0.14524262090031537,
"learning_rate": 1.6730777074579346e-05,
"loss": 0.4841,
"step": 375
},
{
"epoch": 3.650485436893204,
"grad_norm": 0.13279946211767724,
"learning_rate": 1.651055300230922e-05,
"loss": 0.4847,
"step": 376
},
{
"epoch": 3.6601941747572817,
"grad_norm": 0.14766932876767538,
"learning_rate": 1.6291410386126524e-05,
"loss": 0.4854,
"step": 377
},
{
"epoch": 3.6699029126213594,
"grad_norm": 0.14421279285412036,
"learning_rate": 1.607335931537547e-05,
"loss": 0.4834,
"step": 378
},
{
"epoch": 3.679611650485437,
"grad_norm": 0.1332433537811279,
"learning_rate": 1.585640982914541e-05,
"loss": 0.4881,
"step": 379
},
{
"epoch": 3.6893203883495147,
"grad_norm": 0.13964291558360487,
"learning_rate": 1.564057191580873e-05,
"loss": 0.4812,
"step": 380
},
{
"epoch": 3.6990291262135924,
"grad_norm": 0.13379924755179787,
"learning_rate": 1.54258555125608e-05,
"loss": 0.4865,
"step": 381
},
{
"epoch": 3.70873786407767,
"grad_norm": 0.13942795631282168,
"learning_rate": 1.521227050496266e-05,
"loss": 0.4862,
"step": 382
},
{
"epoch": 3.7184466019417477,
"grad_norm": 0.14829905133246946,
"learning_rate": 1.4999826726485754e-05,
"loss": 0.4841,
"step": 383
},
{
"epoch": 3.7281553398058254,
"grad_norm": 0.13774937243212151,
"learning_rate": 1.4788533958059281e-05,
"loss": 0.4873,
"step": 384
},
{
"epoch": 3.737864077669903,
"grad_norm": 0.14632219044552267,
"learning_rate": 1.457840192761979e-05,
"loss": 0.4854,
"step": 385
},
{
"epoch": 3.7475728155339807,
"grad_norm": 0.15302510684664683,
"learning_rate": 1.4369440309663412e-05,
"loss": 0.4833,
"step": 386
},
{
"epoch": 3.7572815533980584,
"grad_norm": 0.14352691900463502,
"learning_rate": 1.4161658724800357e-05,
"loss": 0.4846,
"step": 387
},
{
"epoch": 3.766990291262136,
"grad_norm": 0.1517023916720968,
"learning_rate": 1.3955066739312e-05,
"loss": 0.4867,
"step": 388
},
{
"epoch": 3.7766990291262137,
"grad_norm": 0.15194309330245784,
"learning_rate": 1.3749673864710524e-05,
"loss": 0.4865,
"step": 389
},
{
"epoch": 3.7864077669902914,
"grad_norm": 0.1468613357237534,
"learning_rate": 1.3545489557300853e-05,
"loss": 0.4846,
"step": 390
},
{
"epoch": 3.796116504854369,
"grad_norm": 0.16395511168934657,
"learning_rate": 1.3342523217745473e-05,
"loss": 0.4869,
"step": 391
},
{
"epoch": 3.8058252427184467,
"grad_norm": 0.15078193434327586,
"learning_rate": 1.3140784190631459e-05,
"loss": 0.4825,
"step": 392
},
{
"epoch": 3.8155339805825244,
"grad_norm": 0.14510817920359698,
"learning_rate": 1.2940281764040368e-05,
"loss": 0.4825,
"step": 393
},
{
"epoch": 3.825242718446602,
"grad_norm": 0.17265976668387825,
"learning_rate": 1.2741025169120539e-05,
"loss": 0.4872,
"step": 394
},
{
"epoch": 3.8349514563106797,
"grad_norm": 0.14620030187493488,
"learning_rate": 1.2543023579662106e-05,
"loss": 0.4845,
"step": 395
},
{
"epoch": 3.8446601941747574,
"grad_norm": 0.1418045201790065,
"learning_rate": 1.234628611167469e-05,
"loss": 0.4845,
"step": 396
},
{
"epoch": 3.854368932038835,
"grad_norm": 0.14833568549734966,
"learning_rate": 1.2150821822967611e-05,
"loss": 0.4882,
"step": 397
},
{
"epoch": 3.8640776699029127,
"grad_norm": 0.147355117137231,
"learning_rate": 1.1956639712732958e-05,
"loss": 0.4845,
"step": 398
},
{
"epoch": 3.8737864077669903,
"grad_norm": 0.13630141790437844,
"learning_rate": 1.1763748721131142e-05,
"loss": 0.4819,
"step": 399
},
{
"epoch": 3.883495145631068,
"grad_norm": 0.13485086713221084,
"learning_rate": 1.1572157728879444e-05,
"loss": 0.485,
"step": 400
},
{
"epoch": 3.8932038834951457,
"grad_norm": 0.14115351641144186,
"learning_rate": 1.1381875556843007e-05,
"loss": 0.4874,
"step": 401
},
{
"epoch": 3.9029126213592233,
"grad_norm": 0.13251154708545737,
"learning_rate": 1.119291096562884e-05,
"loss": 0.4879,
"step": 402
},
{
"epoch": 3.912621359223301,
"grad_norm": 0.14538526251672798,
"learning_rate": 1.1005272655182378e-05,
"loss": 0.487,
"step": 403
},
{
"epoch": 3.9223300970873787,
"grad_norm": 0.14640871835491487,
"learning_rate": 1.0818969264386973e-05,
"loss": 0.4843,
"step": 404
},
{
"epoch": 3.9320388349514563,
"grad_norm": 0.13392456360817495,
"learning_rate": 1.0634009370666214e-05,
"loss": 0.4826,
"step": 405
},
{
"epoch": 3.941747572815534,
"grad_norm": 0.15291675444361674,
"learning_rate": 1.045040148958893e-05,
"loss": 0.488,
"step": 406
},
{
"epoch": 3.9514563106796117,
"grad_norm": 0.13901859144613157,
"learning_rate": 1.0268154074477188e-05,
"loss": 0.4853,
"step": 407
},
{
"epoch": 3.9611650485436893,
"grad_norm": 0.13797907262373485,
"learning_rate": 1.0087275516017083e-05,
"loss": 0.4834,
"step": 408
},
{
"epoch": 3.970873786407767,
"grad_norm": 0.13762327771479307,
"learning_rate": 9.907774141872468e-06,
"loss": 0.488,
"step": 409
},
{
"epoch": 3.9805825242718447,
"grad_norm": 0.14986633803246888,
"learning_rate": 9.729658216301479e-06,
"loss": 0.4828,
"step": 410
},
{
"epoch": 3.9902912621359223,
"grad_norm": 0.13435313597563736,
"learning_rate": 9.552935939776083e-06,
"loss": 0.4871,
"step": 411
},
{
"epoch": 4.0,
"grad_norm": 0.14759467236298449,
"learning_rate": 9.377615448604574e-06,
"loss": 0.4831,
"step": 412
},
{
"epoch": 4.009708737864078,
"grad_norm": 0.17306287750975244,
"learning_rate": 9.203704814556871e-06,
"loss": 0.4669,
"step": 413
},
{
"epoch": 4.019417475728155,
"grad_norm": 0.14023047375285927,
"learning_rate": 9.031212044493016e-06,
"loss": 0.4659,
"step": 414
},
{
"epoch": 4.029126213592233,
"grad_norm": 0.14153603129051684,
"learning_rate": 8.860145079994433e-06,
"loss": 0.4656,
"step": 415
},
{
"epoch": 4.038834951456311,
"grad_norm": 0.14948734290428067,
"learning_rate": 8.690511796998344e-06,
"loss": 0.4653,
"step": 416
},
{
"epoch": 4.048543689320389,
"grad_norm": 0.16197544725648444,
"learning_rate": 8.522320005435162e-06,
"loss": 0.4686,
"step": 417
},
{
"epoch": 4.058252427184466,
"grad_norm": 0.16013381839658847,
"learning_rate": 8.355577448868933e-06,
"loss": 0.4709,
"step": 418
},
{
"epoch": 4.067961165048544,
"grad_norm": 0.13999136843500865,
"learning_rate": 8.190291804140775e-06,
"loss": 0.4648,
"step": 419
},
{
"epoch": 4.077669902912621,
"grad_norm": 0.1470240140496139,
"learning_rate": 8.02647068101547e-06,
"loss": 0.4681,
"step": 420
},
{
"epoch": 4.087378640776699,
"grad_norm": 0.1355700665658678,
"learning_rate": 7.864121621831126e-06,
"loss": 0.467,
"step": 421
},
{
"epoch": 4.097087378640777,
"grad_norm": 0.14144514644659487,
"learning_rate": 7.703252101151873e-06,
"loss": 0.4613,
"step": 422
},
{
"epoch": 4.106796116504855,
"grad_norm": 0.129930341121314,
"learning_rate": 7.5438695254238e-06,
"loss": 0.4638,
"step": 423
},
{
"epoch": 4.116504854368932,
"grad_norm": 0.14218321035952994,
"learning_rate": 7.385981232633894e-06,
"loss": 0.465,
"step": 424
},
{
"epoch": 4.12621359223301,
"grad_norm": 0.1420896044955223,
"learning_rate": 7.229594491972256e-06,
"loss": 0.4659,
"step": 425
},
{
"epoch": 4.135922330097087,
"grad_norm": 0.13076263084557904,
"learning_rate": 7.07471650349739e-06,
"loss": 0.4648,
"step": 426
},
{
"epoch": 4.145631067961165,
"grad_norm": 0.13079701173247169,
"learning_rate": 6.921354397804712e-06,
"loss": 0.4651,
"step": 427
},
{
"epoch": 4.155339805825243,
"grad_norm": 0.1299754901865967,
"learning_rate": 6.7695152356983054e-06,
"loss": 0.4647,
"step": 428
},
{
"epoch": 4.165048543689321,
"grad_norm": 0.12764332296015807,
"learning_rate": 6.619206007865768e-06,
"loss": 0.4607,
"step": 429
},
{
"epoch": 4.174757281553398,
"grad_norm": 0.12376699505437842,
"learning_rate": 6.47043363455643e-06,
"loss": 0.4674,
"step": 430
},
{
"epoch": 4.184466019417476,
"grad_norm": 0.11650682736889516,
"learning_rate": 6.323204965262686e-06,
"loss": 0.463,
"step": 431
},
{
"epoch": 4.194174757281553,
"grad_norm": 0.1217502825230698,
"learning_rate": 6.177526778404663e-06,
"loss": 0.4679,
"step": 432
},
{
"epoch": 4.203883495145631,
"grad_norm": 0.11887732797821776,
"learning_rate": 6.033405781018195e-06,
"loss": 0.4655,
"step": 433
},
{
"epoch": 4.213592233009709,
"grad_norm": 0.12496115598562435,
"learning_rate": 5.8908486084459134e-06,
"loss": 0.4635,
"step": 434
},
{
"epoch": 4.223300970873787,
"grad_norm": 0.12470991178018197,
"learning_rate": 5.74986182403189e-06,
"loss": 0.4646,
"step": 435
},
{
"epoch": 4.233009708737864,
"grad_norm": 0.1292310303978258,
"learning_rate": 5.610451918819357e-06,
"loss": 0.4651,
"step": 436
},
{
"epoch": 4.242718446601942,
"grad_norm": 0.13694613972829348,
"learning_rate": 5.472625311251918e-06,
"loss": 0.4667,
"step": 437
},
{
"epoch": 4.252427184466019,
"grad_norm": 0.12260186181424092,
"learning_rate": 5.336388346878006e-06,
"loss": 0.4654,
"step": 438
},
{
"epoch": 4.262135922330097,
"grad_norm": 0.12523666490053748,
"learning_rate": 5.201747298058765e-06,
"loss": 0.4604,
"step": 439
},
{
"epoch": 4.271844660194175,
"grad_norm": 0.12451383039630047,
"learning_rate": 5.068708363679249e-06,
"loss": 0.4645,
"step": 440
},
{
"epoch": 4.281553398058253,
"grad_norm": 0.10990810773913537,
"learning_rate": 4.937277668863014e-06,
"loss": 0.4647,
"step": 441
},
{
"epoch": 4.29126213592233,
"grad_norm": 0.10982607372347657,
"learning_rate": 4.807461264690157e-06,
"loss": 0.465,
"step": 442
},
{
"epoch": 4.300970873786408,
"grad_norm": 0.12022614154318102,
"learning_rate": 4.67926512791868e-06,
"loss": 0.4654,
"step": 443
},
{
"epoch": 4.310679611650485,
"grad_norm": 0.10923349467562726,
"learning_rate": 4.552695160709362e-06,
"loss": 0.466,
"step": 444
},
{
"epoch": 4.320388349514563,
"grad_norm": 0.10653612984033727,
"learning_rate": 4.427757190353976e-06,
"loss": 0.4684,
"step": 445
},
{
"epoch": 4.330097087378641,
"grad_norm": 0.10730397458255132,
"learning_rate": 4.304456969007049e-06,
"loss": 0.4657,
"step": 446
},
{
"epoch": 4.339805825242719,
"grad_norm": 0.1003612678568525,
"learning_rate": 4.182800173420991e-06,
"loss": 0.4649,
"step": 447
},
{
"epoch": 4.349514563106796,
"grad_norm": 0.11090894330673261,
"learning_rate": 4.06279240468475e-06,
"loss": 0.4631,
"step": 448
},
{
"epoch": 4.359223300970874,
"grad_norm": 0.1167058523870307,
"learning_rate": 3.9444391879659604e-06,
"loss": 0.4665,
"step": 449
},
{
"epoch": 4.368932038834951,
"grad_norm": 0.10433976260435962,
"learning_rate": 3.827745972256529e-06,
"loss": 0.4659,
"step": 450
},
{
"epoch": 4.378640776699029,
"grad_norm": 0.10011905909384386,
"learning_rate": 3.7127181301217817e-06,
"loss": 0.4641,
"step": 451
},
{
"epoch": 4.388349514563107,
"grad_norm": 0.10655509962726566,
"learning_rate": 3.599360957453102e-06,
"loss": 0.467,
"step": 452
},
{
"epoch": 4.398058252427185,
"grad_norm": 0.10938664062060237,
"learning_rate": 3.487679673224129e-06,
"loss": 0.4607,
"step": 453
},
{
"epoch": 4.407766990291262,
"grad_norm": 0.10191086413678936,
"learning_rate": 3.3776794192504412e-06,
"loss": 0.4623,
"step": 454
},
{
"epoch": 4.41747572815534,
"grad_norm": 0.09548255692426758,
"learning_rate": 3.269365259952859e-06,
"loss": 0.4626,
"step": 455
},
{
"epoch": 4.427184466019417,
"grad_norm": 0.10109872749133958,
"learning_rate": 3.1627421821242586e-06,
"loss": 0.4627,
"step": 456
},
{
"epoch": 4.436893203883495,
"grad_norm": 0.1003445240191667,
"learning_rate": 3.0578150946999695e-06,
"loss": 0.4637,
"step": 457
},
{
"epoch": 4.446601941747573,
"grad_norm": 0.10094079472060287,
"learning_rate": 2.954588828531817e-06,
"loss": 0.4671,
"step": 458
},
{
"epoch": 4.456310679611651,
"grad_norm": 0.09877743367390623,
"learning_rate": 2.8530681361656422e-06,
"loss": 0.4649,
"step": 459
},
{
"epoch": 4.466019417475728,
"grad_norm": 0.09360681988866616,
"learning_rate": 2.7532576916225395e-06,
"loss": 0.4678,
"step": 460
},
{
"epoch": 4.475728155339806,
"grad_norm": 0.09976388673239875,
"learning_rate": 2.6551620901836515e-06,
"loss": 0.4645,
"step": 461
},
{
"epoch": 4.485436893203883,
"grad_norm": 0.0965607126962889,
"learning_rate": 2.5587858481786086e-06,
"loss": 0.4664,
"step": 462
},
{
"epoch": 4.495145631067961,
"grad_norm": 0.09897406466321508,
"learning_rate": 2.4641334027775755e-06,
"loss": 0.467,
"step": 463
},
{
"epoch": 4.504854368932039,
"grad_norm": 0.10396092442681994,
"learning_rate": 2.371209111786987e-06,
"loss": 0.468,
"step": 464
},
{
"epoch": 4.514563106796117,
"grad_norm": 0.0948373657066504,
"learning_rate": 2.280017253448916e-06,
"loss": 0.4642,
"step": 465
},
{
"epoch": 4.524271844660194,
"grad_norm": 0.09507655846542853,
"learning_rate": 2.190562026244072e-06,
"loss": 0.4669,
"step": 466
},
{
"epoch": 4.533980582524272,
"grad_norm": 0.09659880423736872,
"learning_rate": 2.102847548698539e-06,
"loss": 0.4642,
"step": 467
},
{
"epoch": 4.543689320388349,
"grad_norm": 0.09162921938645756,
"learning_rate": 2.0168778591941242e-06,
"loss": 0.4694,
"step": 468
},
{
"epoch": 4.553398058252427,
"grad_norm": 0.08943623447949108,
"learning_rate": 1.9326569157824736e-06,
"loss": 0.4654,
"step": 469
},
{
"epoch": 4.563106796116505,
"grad_norm": 0.0958809139961805,
"learning_rate": 1.850188596002802e-06,
"loss": 0.4638,
"step": 470
},
{
"epoch": 4.572815533980583,
"grad_norm": 0.0928325035695772,
"learning_rate": 1.7694766967033805e-06,
"loss": 0.4699,
"step": 471
},
{
"epoch": 4.58252427184466,
"grad_norm": 0.09326809079774372,
"learning_rate": 1.6905249338667617e-06,
"loss": 0.4654,
"step": 472
},
{
"epoch": 4.592233009708738,
"grad_norm": 0.08894714059682778,
"learning_rate": 1.613336942438637e-06,
"loss": 0.4678,
"step": 473
},
{
"epoch": 4.601941747572815,
"grad_norm": 0.09702666150761258,
"learning_rate": 1.5379162761605427e-06,
"loss": 0.4642,
"step": 474
},
{
"epoch": 4.611650485436893,
"grad_norm": 0.0925613181306804,
"learning_rate": 1.4642664074061962e-06,
"loss": 0.463,
"step": 475
},
{
"epoch": 4.621359223300971,
"grad_norm": 0.0934935785749229,
"learning_rate": 1.3923907270216819e-06,
"loss": 0.4636,
"step": 476
},
{
"epoch": 4.631067961165049,
"grad_norm": 0.08692636915664595,
"learning_rate": 1.3222925441692635e-06,
"loss": 0.4656,
"step": 477
},
{
"epoch": 4.640776699029126,
"grad_norm": 0.08592627332074977,
"learning_rate": 1.2539750861751031e-06,
"loss": 0.464,
"step": 478
},
{
"epoch": 4.650485436893204,
"grad_norm": 0.08826705670035949,
"learning_rate": 1.1874414983806283e-06,
"loss": 0.4669,
"step": 479
},
{
"epoch": 4.660194174757281,
"grad_norm": 0.08820160855473339,
"learning_rate": 1.1226948439977314e-06,
"loss": 0.4631,
"step": 480
},
{
"epoch": 4.669902912621359,
"grad_norm": 0.09622872398963238,
"learning_rate": 1.0597381039677646e-06,
"loss": 0.4686,
"step": 481
},
{
"epoch": 4.679611650485437,
"grad_norm": 0.08913032971687475,
"learning_rate": 9.985741768242429e-07,
"loss": 0.4647,
"step": 482
},
{
"epoch": 4.689320388349515,
"grad_norm": 0.09028397610583812,
"learning_rate": 9.392058785594504e-07,
"loss": 0.4623,
"step": 483
},
{
"epoch": 4.699029126213592,
"grad_norm": 0.09375615377365946,
"learning_rate": 8.816359424947652e-07,
"loss": 0.4629,
"step": 484
},
{
"epoch": 4.70873786407767,
"grad_norm": 0.09058552987790736,
"learning_rate": 8.258670191548135e-07,
"loss": 0.4633,
"step": 485
},
{
"epoch": 4.718446601941747,
"grad_norm": 0.08822401104082032,
"learning_rate": 7.719016761454479e-07,
"loss": 0.4668,
"step": 486
},
{
"epoch": 4.728155339805825,
"grad_norm": 0.0922329791045878,
"learning_rate": 7.197423980355344e-07,
"loss": 0.4665,
"step": 487
},
{
"epoch": 4.737864077669903,
"grad_norm": 0.08744767491607641,
"learning_rate": 6.693915862425692e-07,
"loss": 0.4657,
"step": 488
},
{
"epoch": 4.747572815533981,
"grad_norm": 0.09683409497038403,
"learning_rate": 6.20851558922091e-07,
"loss": 0.4615,
"step": 489
},
{
"epoch": 4.757281553398058,
"grad_norm": 0.08456750174183916,
"learning_rate": 5.741245508609972e-07,
"loss": 0.4649,
"step": 490
},
{
"epoch": 4.766990291262136,
"grad_norm": 0.08773860548864587,
"learning_rate": 5.292127133746005e-07,
"loss": 0.4653,
"step": 491
},
{
"epoch": 4.776699029126213,
"grad_norm": 0.0917555007099176,
"learning_rate": 4.861181142076276e-07,
"loss": 0.4656,
"step": 492
},
{
"epoch": 4.786407766990291,
"grad_norm": 0.08845762170540458,
"learning_rate": 4.448427374389974e-07,
"loss": 0.4692,
"step": 493
},
{
"epoch": 4.796116504854369,
"grad_norm": 0.08827155953592115,
"learning_rate": 4.053884833904809e-07,
"loss": 0.4662,
"step": 494
},
{
"epoch": 4.805825242718447,
"grad_norm": 0.08804630863083254,
"learning_rate": 3.677571685392023e-07,
"loss": 0.4653,
"step": 495
},
{
"epoch": 4.815533980582524,
"grad_norm": 0.09273872362298237,
"learning_rate": 3.319505254340172e-07,
"loss": 0.4674,
"step": 496
},
{
"epoch": 4.825242718446602,
"grad_norm": 0.08937217299785047,
"learning_rate": 2.9797020261574494e-07,
"loss": 0.4675,
"step": 497
},
{
"epoch": 4.834951456310679,
"grad_norm": 0.08269997531774376,
"learning_rate": 2.6581776454126075e-07,
"loss": 0.4683,
"step": 498
},
{
"epoch": 4.844660194174757,
"grad_norm": 0.08810498069517812,
"learning_rate": 2.3549469151149085e-07,
"loss": 0.4658,
"step": 499
},
{
"epoch": 4.854368932038835,
"grad_norm": 0.08729625064317513,
"learning_rate": 2.0700237960322279e-07,
"loss": 0.4696,
"step": 500
},
{
"epoch": 4.864077669902913,
"grad_norm": 0.08416226889517374,
"learning_rate": 1.803421406048589e-07,
"loss": 0.4636,
"step": 501
},
{
"epoch": 4.87378640776699,
"grad_norm": 0.08684248344197969,
"learning_rate": 1.5551520195601577e-07,
"loss": 0.4647,
"step": 502
},
{
"epoch": 4.883495145631068,
"grad_norm": 0.08935580567045806,
"learning_rate": 1.3252270669100953e-07,
"loss": 0.4642,
"step": 503
},
{
"epoch": 4.893203883495145,
"grad_norm": 0.0846095709740149,
"learning_rate": 1.113657133862267e-07,
"loss": 0.4626,
"step": 504
},
{
"epoch": 4.902912621359223,
"grad_norm": 0.08317972288259207,
"learning_rate": 9.204519611138995e-08,
"loss": 0.4628,
"step": 505
},
{
"epoch": 4.9126213592233015,
"grad_norm": 0.0845289066820202,
"learning_rate": 7.45620443847228e-08,
"loss": 0.4657,
"step": 506
},
{
"epoch": 4.922330097087379,
"grad_norm": 0.08611674149409979,
"learning_rate": 5.891706313197354e-08,
"loss": 0.4643,
"step": 507
},
{
"epoch": 4.932038834951456,
"grad_norm": 0.08707702652246953,
"learning_rate": 4.511097264938258e-08,
"loss": 0.4653,
"step": 508
},
{
"epoch": 4.941747572815534,
"grad_norm": 0.08653800248757759,
"learning_rate": 3.314440857049572e-08,
"loss": 0.4679,
"step": 509
},
{
"epoch": 4.951456310679612,
"grad_norm": 0.08635221643773852,
"learning_rate": 2.3017921836916425e-08,
"loss": 0.4627,
"step": 510
},
{
"epoch": 4.961165048543689,
"grad_norm": 0.08744848791920268,
"learning_rate": 1.4731978672939407e-08,
"loss": 0.4664,
"step": 511
},
{
"epoch": 4.970873786407767,
"grad_norm": 0.08802432305414154,
"learning_rate": 8.286960564065639e-09,
"loss": 0.4628,
"step": 512
},
{
"epoch": 4.980582524271845,
"grad_norm": 0.08116167073862661,
"learning_rate": 3.683164239469683e-09,
"loss": 0.4637,
"step": 513
},
{
"epoch": 4.990291262135923,
"grad_norm": 0.08686945147014129,
"learning_rate": 9.208016583128754e-10,
"loss": 0.4606,
"step": 514
},
{
"epoch": 5.0,
"grad_norm": 0.08703184321831914,
"learning_rate": 0.0,
"loss": 0.4639,
"step": 515
},
{
"epoch": 5.0,
"step": 515,
"total_flos": 8639713262960640.0,
"train_loss": 0.0,
"train_runtime": 13.1123,
"train_samples_per_second": 20070.162,
"train_steps_per_second": 39.276
}
],
"logging_steps": 1,
"max_steps": 515,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8639713262960640.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}