{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.991735537190083,
"eval_steps": 500,
"global_step": 715,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006959547629404089,
"grad_norm": 6.299945556626471,
"learning_rate": 1.111111111111111e-06,
"loss": 0.8825,
"step": 1
},
{
"epoch": 0.013919095258808177,
"grad_norm": 6.257876699073014,
"learning_rate": 2.222222222222222e-06,
"loss": 0.8704,
"step": 2
},
{
"epoch": 0.020878642888212267,
"grad_norm": 6.160373581422307,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8713,
"step": 3
},
{
"epoch": 0.027838190517616355,
"grad_norm": 5.78775826291232,
"learning_rate": 4.444444444444444e-06,
"loss": 0.8598,
"step": 4
},
{
"epoch": 0.034797738147020446,
"grad_norm": 4.434647119841161,
"learning_rate": 5.555555555555557e-06,
"loss": 0.8172,
"step": 5
},
{
"epoch": 0.041757285776424534,
"grad_norm": 2.3263779022698095,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7532,
"step": 6
},
{
"epoch": 0.04871683340582862,
"grad_norm": 4.051177439739189,
"learning_rate": 7.77777777777778e-06,
"loss": 0.7557,
"step": 7
},
{
"epoch": 0.05567638103523271,
"grad_norm": 4.175202353129295,
"learning_rate": 8.888888888888888e-06,
"loss": 0.7655,
"step": 8
},
{
"epoch": 0.0626359286646368,
"grad_norm": 3.8871866859374617,
"learning_rate": 1e-05,
"loss": 0.7274,
"step": 9
},
{
"epoch": 0.06959547629404089,
"grad_norm": 4.009248099328964,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.6947,
"step": 10
},
{
"epoch": 0.07655502392344497,
"grad_norm": 3.2381642347145796,
"learning_rate": 1.2222222222222224e-05,
"loss": 0.6795,
"step": 11
},
{
"epoch": 0.08351457155284907,
"grad_norm": 2.0504476986085827,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.6602,
"step": 12
},
{
"epoch": 0.09047411918225315,
"grad_norm": 2.384280645275452,
"learning_rate": 1.4444444444444446e-05,
"loss": 0.6394,
"step": 13
},
{
"epoch": 0.09743366681165724,
"grad_norm": 2.419324966834746,
"learning_rate": 1.555555555555556e-05,
"loss": 0.6282,
"step": 14
},
{
"epoch": 0.10439321444106132,
"grad_norm": 1.4468314839239673,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6092,
"step": 15
},
{
"epoch": 0.11135276207046542,
"grad_norm": 1.1678600409520985,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.5894,
"step": 16
},
{
"epoch": 0.11831230969986951,
"grad_norm": 1.264682985827519,
"learning_rate": 1.888888888888889e-05,
"loss": 0.5968,
"step": 17
},
{
"epoch": 0.1252718573292736,
"grad_norm": 0.6754725868857698,
"learning_rate": 2e-05,
"loss": 0.576,
"step": 18
},
{
"epoch": 0.1322314049586777,
"grad_norm": 0.9074270406226251,
"learning_rate": 2.1111111111111114e-05,
"loss": 0.5698,
"step": 19
},
{
"epoch": 0.13919095258808178,
"grad_norm": 0.7979533790293932,
"learning_rate": 2.2222222222222227e-05,
"loss": 0.5588,
"step": 20
},
{
"epoch": 0.14615050021748585,
"grad_norm": 0.6581363594356142,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.5606,
"step": 21
},
{
"epoch": 0.15311004784688995,
"grad_norm": 0.6691967855765695,
"learning_rate": 2.444444444444445e-05,
"loss": 0.5481,
"step": 22
},
{
"epoch": 0.16006959547629404,
"grad_norm": 0.542444666355929,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.5539,
"step": 23
},
{
"epoch": 0.16702914310569814,
"grad_norm": 0.6416430096126567,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.5501,
"step": 24
},
{
"epoch": 0.17398869073510223,
"grad_norm": 0.5329567057613817,
"learning_rate": 2.777777777777778e-05,
"loss": 0.5342,
"step": 25
},
{
"epoch": 0.1809482383645063,
"grad_norm": 0.6011450434974139,
"learning_rate": 2.888888888888889e-05,
"loss": 0.5348,
"step": 26
},
{
"epoch": 0.1879077859939104,
"grad_norm": 0.4976703306853586,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.5322,
"step": 27
},
{
"epoch": 0.1948673336233145,
"grad_norm": 0.5730627506660203,
"learning_rate": 3.111111111111112e-05,
"loss": 0.5213,
"step": 28
},
{
"epoch": 0.20182688125271858,
"grad_norm": 0.7301409032698557,
"learning_rate": 3.222222222222223e-05,
"loss": 0.5206,
"step": 29
},
{
"epoch": 0.20878642888212265,
"grad_norm": 1.4025503659857947,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.5322,
"step": 30
},
{
"epoch": 0.21574597651152674,
"grad_norm": 0.8305463760946818,
"learning_rate": 3.444444444444445e-05,
"loss": 0.5176,
"step": 31
},
{
"epoch": 0.22270552414093084,
"grad_norm": 0.8468215550610021,
"learning_rate": 3.555555555555555e-05,
"loss": 0.5187,
"step": 32
},
{
"epoch": 0.22966507177033493,
"grad_norm": 0.8897899781711042,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.5206,
"step": 33
},
{
"epoch": 0.23662461939973903,
"grad_norm": 1.1213311013190945,
"learning_rate": 3.777777777777778e-05,
"loss": 0.5035,
"step": 34
},
{
"epoch": 0.2435841670291431,
"grad_norm": 1.1010420447489897,
"learning_rate": 3.888888888888889e-05,
"loss": 0.5072,
"step": 35
},
{
"epoch": 0.2505437146585472,
"grad_norm": 0.773476718518657,
"learning_rate": 4e-05,
"loss": 0.5077,
"step": 36
},
{
"epoch": 0.2575032622879513,
"grad_norm": 1.2400452716206256,
"learning_rate": 4.111111111111111e-05,
"loss": 0.5033,
"step": 37
},
{
"epoch": 0.2644628099173554,
"grad_norm": 0.9153450607625541,
"learning_rate": 4.222222222222223e-05,
"loss": 0.5,
"step": 38
},
{
"epoch": 0.2714223575467595,
"grad_norm": 0.6514251195810624,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.4947,
"step": 39
},
{
"epoch": 0.27838190517616357,
"grad_norm": 1.0042919223967974,
"learning_rate": 4.444444444444445e-05,
"loss": 0.5042,
"step": 40
},
{
"epoch": 0.28534145280556766,
"grad_norm": 1.0797096325295303,
"learning_rate": 4.555555555555556e-05,
"loss": 0.4916,
"step": 41
},
{
"epoch": 0.2923010004349717,
"grad_norm": 0.9905878602993525,
"learning_rate": 4.666666666666667e-05,
"loss": 0.496,
"step": 42
},
{
"epoch": 0.2992605480643758,
"grad_norm": 0.8676969083941743,
"learning_rate": 4.777777777777778e-05,
"loss": 0.4885,
"step": 43
},
{
"epoch": 0.3062200956937799,
"grad_norm": 0.7260235825278305,
"learning_rate": 4.88888888888889e-05,
"loss": 0.5005,
"step": 44
},
{
"epoch": 0.313179643323184,
"grad_norm": 0.9223906328687149,
"learning_rate": 5e-05,
"loss": 0.4892,
"step": 45
},
{
"epoch": 0.3201391909525881,
"grad_norm": 1.3832086370219072,
"learning_rate": 5.111111111111111e-05,
"loss": 0.4997,
"step": 46
},
{
"epoch": 0.3270987385819922,
"grad_norm": 0.8419161235221501,
"learning_rate": 5.222222222222223e-05,
"loss": 0.4938,
"step": 47
},
{
"epoch": 0.33405828621139627,
"grad_norm": 1.6195340381200483,
"learning_rate": 5.333333333333333e-05,
"loss": 0.4968,
"step": 48
},
{
"epoch": 0.34101783384080037,
"grad_norm": 0.9275367275665349,
"learning_rate": 5.444444444444445e-05,
"loss": 0.5006,
"step": 49
},
{
"epoch": 0.34797738147020446,
"grad_norm": 1.7543919560909402,
"learning_rate": 5.555555555555556e-05,
"loss": 0.5007,
"step": 50
},
{
"epoch": 0.3549369290996085,
"grad_norm": 1.26313567421099,
"learning_rate": 5.666666666666668e-05,
"loss": 0.499,
"step": 51
},
{
"epoch": 0.3618964767290126,
"grad_norm": 1.8899821239654098,
"learning_rate": 5.777777777777778e-05,
"loss": 0.4868,
"step": 52
},
{
"epoch": 0.3688560243584167,
"grad_norm": 1.4836204192299145,
"learning_rate": 5.8888888888888896e-05,
"loss": 0.4924,
"step": 53
},
{
"epoch": 0.3758155719878208,
"grad_norm": 1.5371932375940351,
"learning_rate": 6.000000000000001e-05,
"loss": 0.4853,
"step": 54
},
{
"epoch": 0.3827751196172249,
"grad_norm": 1.066878304885815,
"learning_rate": 6.111111111111111e-05,
"loss": 0.4823,
"step": 55
},
{
"epoch": 0.389734667246629,
"grad_norm": 1.234430905173555,
"learning_rate": 6.222222222222223e-05,
"loss": 0.4848,
"step": 56
},
{
"epoch": 0.39669421487603307,
"grad_norm": 1.0923409666404706,
"learning_rate": 6.333333333333333e-05,
"loss": 0.494,
"step": 57
},
{
"epoch": 0.40365376250543716,
"grad_norm": 0.9800617899091041,
"learning_rate": 6.444444444444446e-05,
"loss": 0.4825,
"step": 58
},
{
"epoch": 0.41061331013484126,
"grad_norm": 0.9212766482645198,
"learning_rate": 6.555555555555556e-05,
"loss": 0.4691,
"step": 59
},
{
"epoch": 0.4175728577642453,
"grad_norm": 1.167155227826628,
"learning_rate": 6.666666666666667e-05,
"loss": 0.4815,
"step": 60
},
{
"epoch": 0.4245324053936494,
"grad_norm": 1.5791917226157102,
"learning_rate": 6.777777777777778e-05,
"loss": 0.4943,
"step": 61
},
{
"epoch": 0.4314919530230535,
"grad_norm": 1.0997180103791135,
"learning_rate": 6.88888888888889e-05,
"loss": 0.4871,
"step": 62
},
{
"epoch": 0.4384515006524576,
"grad_norm": 1.2130023175807059,
"learning_rate": 7.000000000000001e-05,
"loss": 0.4855,
"step": 63
},
{
"epoch": 0.4454110482818617,
"grad_norm": 1.6270954136094906,
"learning_rate": 7.11111111111111e-05,
"loss": 0.4877,
"step": 64
},
{
"epoch": 0.45237059591126577,
"grad_norm": 1.1304632174516827,
"learning_rate": 7.222222222222223e-05,
"loss": 0.4795,
"step": 65
},
{
"epoch": 0.45933014354066987,
"grad_norm": 1.32786525260077,
"learning_rate": 7.333333333333333e-05,
"loss": 0.4815,
"step": 66
},
{
"epoch": 0.46628969117007396,
"grad_norm": 0.6938586247846547,
"learning_rate": 7.444444444444446e-05,
"loss": 0.4711,
"step": 67
},
{
"epoch": 0.47324923879947806,
"grad_norm": 1.3238232457797845,
"learning_rate": 7.555555555555556e-05,
"loss": 0.4823,
"step": 68
},
{
"epoch": 0.4802087864288821,
"grad_norm": 0.7731792383221893,
"learning_rate": 7.666666666666668e-05,
"loss": 0.4769,
"step": 69
},
{
"epoch": 0.4871683340582862,
"grad_norm": 0.714435674612326,
"learning_rate": 7.777777777777778e-05,
"loss": 0.47,
"step": 70
},
{
"epoch": 0.4941278816876903,
"grad_norm": 0.727161798048739,
"learning_rate": 7.88888888888889e-05,
"loss": 0.4748,
"step": 71
},
{
"epoch": 0.5010874293170944,
"grad_norm": 0.7822239107856425,
"learning_rate": 8e-05,
"loss": 0.4735,
"step": 72
},
{
"epoch": 0.5080469769464985,
"grad_norm": 0.9159063781364695,
"learning_rate": 7.999952257304926e-05,
"loss": 0.4585,
"step": 73
},
{
"epoch": 0.5150065245759026,
"grad_norm": 1.4014617788300159,
"learning_rate": 7.99980903035939e-05,
"loss": 0.4817,
"step": 74
},
{
"epoch": 0.5219660722053067,
"grad_norm": 0.9697910698942601,
"learning_rate": 7.999570322582408e-05,
"loss": 0.4719,
"step": 75
},
{
"epoch": 0.5289256198347108,
"grad_norm": 1.2780959714818068,
"learning_rate": 7.99923613967226e-05,
"loss": 0.4744,
"step": 76
},
{
"epoch": 0.5358851674641149,
"grad_norm": 0.9675381526583897,
"learning_rate": 7.99880648960634e-05,
"loss": 0.4704,
"step": 77
},
{
"epoch": 0.542844715093519,
"grad_norm": 1.047833737067459,
"learning_rate": 7.998281382640975e-05,
"loss": 0.4654,
"step": 78
},
{
"epoch": 0.549804262722923,
"grad_norm": 1.2845937442452149,
"learning_rate": 7.997660831311176e-05,
"loss": 0.475,
"step": 79
},
{
"epoch": 0.5567638103523271,
"grad_norm": 0.8772171829670746,
"learning_rate": 7.996944850430339e-05,
"loss": 0.4656,
"step": 80
},
{
"epoch": 0.5637233579817312,
"grad_norm": 0.741967780268622,
"learning_rate": 7.996133457089894e-05,
"loss": 0.4575,
"step": 81
},
{
"epoch": 0.5706829056111353,
"grad_norm": 0.8708734610216243,
"learning_rate": 7.99522667065889e-05,
"loss": 0.4673,
"step": 82
},
{
"epoch": 0.5776424532405393,
"grad_norm": 0.9611160209126256,
"learning_rate": 7.994224512783544e-05,
"loss": 0.4644,
"step": 83
},
{
"epoch": 0.5846020008699434,
"grad_norm": 1.2059285045807202,
"learning_rate": 7.993127007386715e-05,
"loss": 0.4782,
"step": 84
},
{
"epoch": 0.5915615484993475,
"grad_norm": 1.0796995800628297,
"learning_rate": 7.991934180667333e-05,
"loss": 0.4642,
"step": 85
},
{
"epoch": 0.5985210961287516,
"grad_norm": 1.0316521924490913,
"learning_rate": 7.990646061099782e-05,
"loss": 0.4646,
"step": 86
},
{
"epoch": 0.6054806437581557,
"grad_norm": 0.8832150277973638,
"learning_rate": 7.989262679433211e-05,
"loss": 0.4626,
"step": 87
},
{
"epoch": 0.6124401913875598,
"grad_norm": 0.7634910217249218,
"learning_rate": 7.987784068690804e-05,
"loss": 0.4626,
"step": 88
},
{
"epoch": 0.6193997390169639,
"grad_norm": 1.1086418661133017,
"learning_rate": 7.986210264168991e-05,
"loss": 0.4521,
"step": 89
},
{
"epoch": 0.626359286646368,
"grad_norm": 0.6778528235443292,
"learning_rate": 7.98454130343661e-05,
"loss": 0.4606,
"step": 90
},
{
"epoch": 0.6333188342757721,
"grad_norm": 0.7098255147206154,
"learning_rate": 7.982777226334e-05,
"loss": 0.4546,
"step": 91
},
{
"epoch": 0.6402783819051762,
"grad_norm": 0.7512375219693761,
"learning_rate": 7.980918074972059e-05,
"loss": 0.4526,
"step": 92
},
{
"epoch": 0.6472379295345803,
"grad_norm": 0.4955536043933238,
"learning_rate": 7.978963893731235e-05,
"loss": 0.4514,
"step": 93
},
{
"epoch": 0.6541974771639844,
"grad_norm": 0.6854584128464718,
"learning_rate": 7.976914729260468e-05,
"loss": 0.4656,
"step": 94
},
{
"epoch": 0.6611570247933884,
"grad_norm": 0.6020857806767794,
"learning_rate": 7.974770630476077e-05,
"loss": 0.4539,
"step": 95
},
{
"epoch": 0.6681165724227925,
"grad_norm": 0.5198959190719997,
"learning_rate": 7.972531648560587e-05,
"loss": 0.4522,
"step": 96
},
{
"epoch": 0.6750761200521966,
"grad_norm": 0.8318026218834386,
"learning_rate": 7.970197836961513e-05,
"loss": 0.4623,
"step": 97
},
{
"epoch": 0.6820356676816007,
"grad_norm": 0.9109802442285713,
"learning_rate": 7.967769251390083e-05,
"loss": 0.4559,
"step": 98
},
{
"epoch": 0.6889952153110048,
"grad_norm": 1.25243965937425,
"learning_rate": 7.96524594981991e-05,
"loss": 0.4626,
"step": 99
},
{
"epoch": 0.6959547629404089,
"grad_norm": 0.868516955234305,
"learning_rate": 7.9626279924856e-05,
"loss": 0.4569,
"step": 100
},
{
"epoch": 0.702914310569813,
"grad_norm": 0.48520097270425405,
"learning_rate": 7.959915441881322e-05,
"loss": 0.4515,
"step": 101
},
{
"epoch": 0.709873858199217,
"grad_norm": 0.5480517510335293,
"learning_rate": 7.957108362759316e-05,
"loss": 0.4544,
"step": 102
},
{
"epoch": 0.7168334058286211,
"grad_norm": 0.8911184240263139,
"learning_rate": 7.954206822128343e-05,
"loss": 0.4635,
"step": 103
},
{
"epoch": 0.7237929534580252,
"grad_norm": 0.8227526938281489,
"learning_rate": 7.951210889252088e-05,
"loss": 0.465,
"step": 104
},
{
"epoch": 0.7307525010874293,
"grad_norm": 0.5558210684070918,
"learning_rate": 7.948120635647503e-05,
"loss": 0.4487,
"step": 105
},
{
"epoch": 0.7377120487168334,
"grad_norm": 0.6355162909760532,
"learning_rate": 7.944936135083108e-05,
"loss": 0.4523,
"step": 106
},
{
"epoch": 0.7446715963462375,
"grad_norm": 0.6105345680130448,
"learning_rate": 7.941657463577225e-05,
"loss": 0.4575,
"step": 107
},
{
"epoch": 0.7516311439756416,
"grad_norm": 0.5678069745935661,
"learning_rate": 7.938284699396157e-05,
"loss": 0.4498,
"step": 108
},
{
"epoch": 0.7585906916050457,
"grad_norm": 0.5483024912339128,
"learning_rate": 7.934817923052331e-05,
"loss": 0.4549,
"step": 109
},
{
"epoch": 0.7655502392344498,
"grad_norm": 0.3929806004224007,
"learning_rate": 7.931257217302371e-05,
"loss": 0.4504,
"step": 110
},
{
"epoch": 0.7725097868638539,
"grad_norm": 0.5681787692060095,
"learning_rate": 7.927602667145121e-05,
"loss": 0.4477,
"step": 111
},
{
"epoch": 0.779469334493258,
"grad_norm": 0.556711524840673,
"learning_rate": 7.923854359819619e-05,
"loss": 0.4484,
"step": 112
},
{
"epoch": 0.786428882122662,
"grad_norm": 0.4138699309021785,
"learning_rate": 7.92001238480301e-05,
"loss": 0.447,
"step": 113
},
{
"epoch": 0.7933884297520661,
"grad_norm": 0.6357342110964699,
"learning_rate": 7.916076833808414e-05,
"loss": 0.4513,
"step": 114
},
{
"epoch": 0.8003479773814702,
"grad_norm": 0.8584704922958183,
"learning_rate": 7.91204780078274e-05,
"loss": 0.4427,
"step": 115
},
{
"epoch": 0.8073075250108743,
"grad_norm": 0.9871565259991888,
"learning_rate": 7.907925381904432e-05,
"loss": 0.4554,
"step": 116
},
{
"epoch": 0.8142670726402784,
"grad_norm": 1.0217097625637481,
"learning_rate": 7.903709675581185e-05,
"loss": 0.453,
"step": 117
},
{
"epoch": 0.8212266202696825,
"grad_norm": 0.7895770598500398,
"learning_rate": 7.899400782447591e-05,
"loss": 0.4541,
"step": 118
},
{
"epoch": 0.8281861678990866,
"grad_norm": 0.5874040536712771,
"learning_rate": 7.894998805362737e-05,
"loss": 0.4423,
"step": 119
},
{
"epoch": 0.8351457155284906,
"grad_norm": 0.6690541560849889,
"learning_rate": 7.890503849407742e-05,
"loss": 0.4519,
"step": 120
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.6865319922768905,
"learning_rate": 7.885916021883268e-05,
"loss": 0.4455,
"step": 121
},
{
"epoch": 0.8490648107872988,
"grad_norm": 0.5200700598023363,
"learning_rate": 7.881235432306936e-05,
"loss": 0.4407,
"step": 122
},
{
"epoch": 0.8560243584167029,
"grad_norm": 0.4344010301700919,
"learning_rate": 7.876462192410727e-05,
"loss": 0.4465,
"step": 123
},
{
"epoch": 0.862983906046107,
"grad_norm": 0.6677648559236202,
"learning_rate": 7.871596416138312e-05,
"loss": 0.4497,
"step": 124
},
{
"epoch": 0.8699434536755111,
"grad_norm": 0.4959133192072294,
"learning_rate": 7.866638219642324e-05,
"loss": 0.4412,
"step": 125
},
{
"epoch": 0.8769030013049152,
"grad_norm": 0.3302086455708142,
"learning_rate": 7.861587721281598e-05,
"loss": 0.4341,
"step": 126
},
{
"epoch": 0.8838625489343193,
"grad_norm": 0.4127575574993749,
"learning_rate": 7.856445041618333e-05,
"loss": 0.4403,
"step": 127
},
{
"epoch": 0.8908220965637234,
"grad_norm": 0.4079306482235732,
"learning_rate": 7.851210303415225e-05,
"loss": 0.45,
"step": 128
},
{
"epoch": 0.8977816441931274,
"grad_norm": 0.3629235803836879,
"learning_rate": 7.845883631632527e-05,
"loss": 0.4371,
"step": 129
},
{
"epoch": 0.9047411918225315,
"grad_norm": 0.3281071553183999,
"learning_rate": 7.840465153425074e-05,
"loss": 0.4342,
"step": 130
},
{
"epoch": 0.9117007394519356,
"grad_norm": 0.3939672640729078,
"learning_rate": 7.83495499813924e-05,
"loss": 0.4393,
"step": 131
},
{
"epoch": 0.9186602870813397,
"grad_norm": 0.4669777553758287,
"learning_rate": 7.829353297309857e-05,
"loss": 0.4378,
"step": 132
},
{
"epoch": 0.9256198347107438,
"grad_norm": 0.5191309992385434,
"learning_rate": 7.823660184657075e-05,
"loss": 0.4419,
"step": 133
},
{
"epoch": 0.9325793823401479,
"grad_norm": 0.4371861252468821,
"learning_rate": 7.817875796083164e-05,
"loss": 0.4442,
"step": 134
},
{
"epoch": 0.939538929969552,
"grad_norm": 0.6968595662983466,
"learning_rate": 7.812000269669271e-05,
"loss": 0.4448,
"step": 135
},
{
"epoch": 0.9464984775989561,
"grad_norm": 0.9915151230783144,
"learning_rate": 7.806033745672132e-05,
"loss": 0.4459,
"step": 136
},
{
"epoch": 0.9534580252283602,
"grad_norm": 1.1614516148599872,
"learning_rate": 7.799976366520714e-05,
"loss": 0.4458,
"step": 137
},
{
"epoch": 0.9604175728577642,
"grad_norm": 0.7103915477953243,
"learning_rate": 7.793828276812819e-05,
"loss": 0.4413,
"step": 138
},
{
"epoch": 0.9673771204871683,
"grad_norm": 0.6607811748107033,
"learning_rate": 7.787589623311635e-05,
"loss": 0.4374,
"step": 139
},
{
"epoch": 0.9743366681165724,
"grad_norm": 0.669847981333871,
"learning_rate": 7.781260554942226e-05,
"loss": 0.4452,
"step": 140
},
{
"epoch": 0.9812962157459765,
"grad_norm": 0.47755885581992924,
"learning_rate": 7.774841222787983e-05,
"loss": 0.4439,
"step": 141
},
{
"epoch": 0.9882557633753806,
"grad_norm": 0.4723668370103968,
"learning_rate": 7.768331780087017e-05,
"loss": 0.4462,
"step": 142
},
{
"epoch": 0.9952153110047847,
"grad_norm": 0.5073519260421897,
"learning_rate": 7.761732382228494e-05,
"loss": 0.4406,
"step": 143
},
{
"epoch": 1.0060896041757286,
"grad_norm": 0.4347621708894711,
"learning_rate": 7.755043186748936e-05,
"loss": 0.4218,
"step": 144
},
{
"epoch": 1.0130491518051328,
"grad_norm": 0.4797475573338107,
"learning_rate": 7.748264353328451e-05,
"loss": 0.4078,
"step": 145
},
{
"epoch": 1.0200086994345368,
"grad_norm": 0.5279297958588075,
"learning_rate": 7.741396043786929e-05,
"loss": 0.4191,
"step": 146
},
{
"epoch": 1.0269682470639407,
"grad_norm": 0.5909819052892469,
"learning_rate": 7.734438422080174e-05,
"loss": 0.4168,
"step": 147
},
{
"epoch": 1.033927794693345,
"grad_norm": 0.6495659004492463,
"learning_rate": 7.727391654295991e-05,
"loss": 0.4194,
"step": 148
},
{
"epoch": 1.040887342322749,
"grad_norm": 0.587326465562309,
"learning_rate": 7.720255908650222e-05,
"loss": 0.4212,
"step": 149
},
{
"epoch": 1.0478468899521531,
"grad_norm": 0.45607673329416626,
"learning_rate": 7.713031355482734e-05,
"loss": 0.4074,
"step": 150
},
{
"epoch": 1.0548064375815571,
"grad_norm": 0.3531037620398402,
"learning_rate": 7.705718167253345e-05,
"loss": 0.4136,
"step": 151
},
{
"epoch": 1.0617659852109613,
"grad_norm": 0.4297476754808921,
"learning_rate": 7.698316518537713e-05,
"loss": 0.417,
"step": 152
},
{
"epoch": 1.0687255328403653,
"grad_norm": 0.6072156953171759,
"learning_rate": 7.690826586023165e-05,
"loss": 0.4163,
"step": 153
},
{
"epoch": 1.0756850804697695,
"grad_norm": 0.7760172331652697,
"learning_rate": 7.683248548504486e-05,
"loss": 0.4159,
"step": 154
},
{
"epoch": 1.0826446280991735,
"grad_norm": 0.876708332241165,
"learning_rate": 7.675582586879641e-05,
"loss": 0.4192,
"step": 155
},
{
"epoch": 1.0896041757285777,
"grad_norm": 0.9107175938318952,
"learning_rate": 7.667828884145465e-05,
"loss": 0.4145,
"step": 156
},
{
"epoch": 1.0965637233579817,
"grad_norm": 0.8653660027489845,
"learning_rate": 7.65998762539329e-05,
"loss": 0.4218,
"step": 157
},
{
"epoch": 1.103523270987386,
"grad_norm": 0.7850041910149821,
"learning_rate": 7.652058997804532e-05,
"loss": 0.4192,
"step": 158
},
{
"epoch": 1.1104828186167899,
"grad_norm": 0.5313734478672961,
"learning_rate": 7.644043190646211e-05,
"loss": 0.4118,
"step": 159
},
{
"epoch": 1.117442366246194,
"grad_norm": 0.5056894621317538,
"learning_rate": 7.63594039526645e-05,
"loss": 0.4143,
"step": 160
},
{
"epoch": 1.124401913875598,
"grad_norm": 0.766156288599041,
"learning_rate": 7.627750805089888e-05,
"loss": 0.4202,
"step": 161
},
{
"epoch": 1.1313614615050023,
"grad_norm": 0.7173262253497903,
"learning_rate": 7.619474615613083e-05,
"loss": 0.4085,
"step": 162
},
{
"epoch": 1.1383210091344063,
"grad_norm": 0.3801147393208917,
"learning_rate": 7.611112024399829e-05,
"loss": 0.4098,
"step": 163
},
{
"epoch": 1.1452805567638102,
"grad_norm": 0.459213719396466,
"learning_rate": 7.602663231076445e-05,
"loss": 0.4215,
"step": 164
},
{
"epoch": 1.1522401043932144,
"grad_norm": 0.5419991575798397,
"learning_rate": 7.594128437327017e-05,
"loss": 0.4154,
"step": 165
},
{
"epoch": 1.1591996520226187,
"grad_norm": 0.4710423907727317,
"learning_rate": 7.58550784688857e-05,
"loss": 0.4102,
"step": 166
},
{
"epoch": 1.1661591996520226,
"grad_norm": 0.3269293538096053,
"learning_rate": 7.576801665546214e-05,
"loss": 0.4183,
"step": 167
},
{
"epoch": 1.1731187472814266,
"grad_norm": 0.354719558769914,
"learning_rate": 7.568010101128229e-05,
"loss": 0.4083,
"step": 168
},
{
"epoch": 1.1800782949108308,
"grad_norm": 0.41454295254538065,
"learning_rate": 7.559133363501107e-05,
"loss": 0.4073,
"step": 169
},
{
"epoch": 1.1870378425402348,
"grad_norm": 0.41584140556124005,
"learning_rate": 7.550171664564537e-05,
"loss": 0.4184,
"step": 170
},
{
"epoch": 1.193997390169639,
"grad_norm": 0.42301367871950313,
"learning_rate": 7.541125218246346e-05,
"loss": 0.4129,
"step": 171
},
{
"epoch": 1.200956937799043,
"grad_norm": 0.28966866299551014,
"learning_rate": 7.531994240497399e-05,
"loss": 0.4078,
"step": 172
},
{
"epoch": 1.2079164854284472,
"grad_norm": 0.27574788938129696,
"learning_rate": 7.52277894928644e-05,
"loss": 0.4122,
"step": 173
},
{
"epoch": 1.2148760330578512,
"grad_norm": 0.3134277334731802,
"learning_rate": 7.513479564594888e-05,
"loss": 0.4105,
"step": 174
},
{
"epoch": 1.2218355806872554,
"grad_norm": 0.31141216371808733,
"learning_rate": 7.504096308411587e-05,
"loss": 0.4101,
"step": 175
},
{
"epoch": 1.2287951283166594,
"grad_norm": 0.309174676739755,
"learning_rate": 7.494629404727506e-05,
"loss": 0.4099,
"step": 176
},
{
"epoch": 1.2357546759460636,
"grad_norm": 0.3804013842457314,
"learning_rate": 7.485079079530393e-05,
"loss": 0.4065,
"step": 177
},
{
"epoch": 1.2427142235754676,
"grad_norm": 0.4490200434000277,
"learning_rate": 7.47544556079938e-05,
"loss": 0.4178,
"step": 178
},
{
"epoch": 1.2496737712048718,
"grad_norm": 0.5056109077740427,
"learning_rate": 7.465729078499541e-05,
"loss": 0.4175,
"step": 179
},
{
"epoch": 1.2566333188342758,
"grad_norm": 0.5728814822805163,
"learning_rate": 7.455929864576402e-05,
"loss": 0.4003,
"step": 180
},
{
"epoch": 1.26359286646368,
"grad_norm": 0.5364301922656747,
"learning_rate": 7.4460481529504e-05,
"loss": 0.4126,
"step": 181
},
{
"epoch": 1.270552414093084,
"grad_norm": 0.48400372508161793,
"learning_rate": 7.436084179511315e-05,
"loss": 0.4111,
"step": 182
},
{
"epoch": 1.277511961722488,
"grad_norm": 0.48691704971428823,
"learning_rate": 7.426038182112613e-05,
"loss": 0.4192,
"step": 183
},
{
"epoch": 1.2844715093518921,
"grad_norm": 0.4326376870735781,
"learning_rate": 7.415910400565795e-05,
"loss": 0.4071,
"step": 184
},
{
"epoch": 1.2914310569812963,
"grad_norm": 0.3854589354965239,
"learning_rate": 7.405701076634649e-05,
"loss": 0.4132,
"step": 185
},
{
"epoch": 1.2983906046107003,
"grad_norm": 0.3589751690112129,
"learning_rate": 7.395410454029498e-05,
"loss": 0.4141,
"step": 186
},
{
"epoch": 1.3053501522401043,
"grad_norm": 0.3798549380531171,
"learning_rate": 7.385038778401367e-05,
"loss": 0.4109,
"step": 187
},
{
"epoch": 1.3123096998695085,
"grad_norm": 0.5063041641106548,
"learning_rate": 7.374586297336134e-05,
"loss": 0.4121,
"step": 188
},
{
"epoch": 1.3192692474989125,
"grad_norm": 0.5499801181881762,
"learning_rate": 7.364053260348603e-05,
"loss": 0.4131,
"step": 189
},
{
"epoch": 1.3262287951283167,
"grad_norm": 0.538473535319422,
"learning_rate": 7.353439918876565e-05,
"loss": 0.4146,
"step": 190
},
{
"epoch": 1.3331883427577207,
"grad_norm": 0.5133206548316543,
"learning_rate": 7.342746526274779e-05,
"loss": 0.41,
"step": 191
},
{
"epoch": 1.340147890387125,
"grad_norm": 0.4474460367465246,
"learning_rate": 7.331973337808937e-05,
"loss": 0.4122,
"step": 192
},
{
"epoch": 1.3471074380165289,
"grad_norm": 0.44569022437909495,
"learning_rate": 7.321120610649567e-05,
"loss": 0.408,
"step": 193
},
{
"epoch": 1.354066985645933,
"grad_norm": 0.4721240876904583,
"learning_rate": 7.310188603865888e-05,
"loss": 0.4091,
"step": 194
},
{
"epoch": 1.361026533275337,
"grad_norm": 0.4670666827700711,
"learning_rate": 7.299177578419634e-05,
"loss": 0.4092,
"step": 195
},
{
"epoch": 1.3679860809047413,
"grad_norm": 0.3702037932288681,
"learning_rate": 7.288087797158822e-05,
"loss": 0.4097,
"step": 196
},
{
"epoch": 1.3749456285341453,
"grad_norm": 0.330387714721557,
"learning_rate": 7.276919524811472e-05,
"loss": 0.4104,
"step": 197
},
{
"epoch": 1.3819051761635492,
"grad_norm": 0.3875230383248599,
"learning_rate": 7.265673027979295e-05,
"loss": 0.4129,
"step": 198
},
{
"epoch": 1.3888647237929534,
"grad_norm": 0.3612694249655593,
"learning_rate": 7.254348575131328e-05,
"loss": 0.4106,
"step": 199
},
{
"epoch": 1.3958242714223577,
"grad_norm": 0.2763726751232723,
"learning_rate": 7.242946436597518e-05,
"loss": 0.4116,
"step": 200
},
{
"epoch": 1.4027838190517616,
"grad_norm": 0.37611767937513824,
"learning_rate": 7.231466884562275e-05,
"loss": 0.4086,
"step": 201
},
{
"epoch": 1.4097433666811656,
"grad_norm": 0.5071472574190108,
"learning_rate": 7.21991019305798e-05,
"loss": 0.411,
"step": 202
},
{
"epoch": 1.4167029143105698,
"grad_norm": 0.4340052816154503,
"learning_rate": 7.20827663795843e-05,
"loss": 0.4079,
"step": 203
},
{
"epoch": 1.423662461939974,
"grad_norm": 0.3712380468999662,
"learning_rate": 7.19656649697226e-05,
"loss": 0.4056,
"step": 204
},
{
"epoch": 1.430622009569378,
"grad_norm": 0.3225866140544311,
"learning_rate": 7.184780049636318e-05,
"loss": 0.4062,
"step": 205
},
{
"epoch": 1.437581557198782,
"grad_norm": 0.3106099934478539,
"learning_rate": 7.172917577308984e-05,
"loss": 0.4062,
"step": 206
},
{
"epoch": 1.4445411048281862,
"grad_norm": 0.4355158685775427,
"learning_rate": 7.160979363163456e-05,
"loss": 0.4142,
"step": 207
},
{
"epoch": 1.4515006524575902,
"grad_norm": 0.4793783049280065,
"learning_rate": 7.148965692180994e-05,
"loss": 0.399,
"step": 208
},
{
"epoch": 1.4584602000869944,
"grad_norm": 0.3726750358936686,
"learning_rate": 7.136876851144113e-05,
"loss": 0.4132,
"step": 209
},
{
"epoch": 1.4654197477163984,
"grad_norm": 0.31513705424244864,
"learning_rate": 7.124713128629739e-05,
"loss": 0.4058,
"step": 210
},
{
"epoch": 1.4723792953458026,
"grad_norm": 0.3159413572790412,
"learning_rate": 7.11247481500232e-05,
"loss": 0.4041,
"step": 211
},
{
"epoch": 1.4793388429752066,
"grad_norm": 0.33778666639216565,
"learning_rate": 7.100162202406891e-05,
"loss": 0.4147,
"step": 212
},
{
"epoch": 1.4862983906046108,
"grad_norm": 0.3268790661878625,
"learning_rate": 7.08777558476211e-05,
"loss": 0.4086,
"step": 213
},
{
"epoch": 1.4932579382340148,
"grad_norm": 0.32242776651704286,
"learning_rate": 7.075315257753229e-05,
"loss": 0.4148,
"step": 214
},
{
"epoch": 1.500217485863419,
"grad_norm": 0.31567118569033087,
"learning_rate": 7.062781518825047e-05,
"loss": 0.4137,
"step": 215
},
{
"epoch": 1.507177033492823,
"grad_norm": 0.3612184397000591,
"learning_rate": 7.050174667174799e-05,
"loss": 0.4097,
"step": 216
},
{
"epoch": 1.514136581122227,
"grad_norm": 0.4218662687852862,
"learning_rate": 7.037495003745024e-05,
"loss": 0.4084,
"step": 217
},
{
"epoch": 1.5210961287516311,
"grad_norm": 0.45144908695802316,
"learning_rate": 7.024742831216374e-05,
"loss": 0.4123,
"step": 218
},
{
"epoch": 1.5280556763810353,
"grad_norm": 0.4338699229225814,
"learning_rate": 7.011918454000391e-05,
"loss": 0.41,
"step": 219
},
{
"epoch": 1.5350152240104393,
"grad_norm": 0.4218475334427419,
"learning_rate": 6.99902217823224e-05,
"loss": 0.4099,
"step": 220
},
{
"epoch": 1.5419747716398433,
"grad_norm": 0.5384078834636578,
"learning_rate": 6.986054311763402e-05,
"loss": 0.4115,
"step": 221
},
{
"epoch": 1.5489343192692475,
"grad_norm": 0.5695524389564858,
"learning_rate": 6.973015164154326e-05,
"loss": 0.4057,
"step": 222
},
{
"epoch": 1.5558938668986517,
"grad_norm": 0.5296732996163314,
"learning_rate": 6.959905046667035e-05,
"loss": 0.4163,
"step": 223
},
{
"epoch": 1.5628534145280557,
"grad_norm": 0.5381554961428682,
"learning_rate": 6.946724272257699e-05,
"loss": 0.4125,
"step": 224
},
{
"epoch": 1.5698129621574597,
"grad_norm": 0.5541446269357019,
"learning_rate": 6.933473155569165e-05,
"loss": 0.4166,
"step": 225
},
{
"epoch": 1.576772509786864,
"grad_norm": 0.5249105733906559,
"learning_rate": 6.920152012923446e-05,
"loss": 0.4159,
"step": 226
},
{
"epoch": 1.583732057416268,
"grad_norm": 0.4702525223746907,
"learning_rate": 6.906761162314165e-05,
"loss": 0.4081,
"step": 227
},
{
"epoch": 1.590691605045672,
"grad_norm": 0.4018848611353379,
"learning_rate": 6.893300923398974e-05,
"loss": 0.4095,
"step": 228
},
{
"epoch": 1.597651152675076,
"grad_norm": 0.4387137463620228,
"learning_rate": 6.879771617491912e-05,
"loss": 0.4038,
"step": 229
},
{
"epoch": 1.6046107003044803,
"grad_norm": 0.5018137502153527,
"learning_rate": 6.866173567555743e-05,
"loss": 0.4007,
"step": 230
},
{
"epoch": 1.6115702479338843,
"grad_norm": 0.4243706106137811,
"learning_rate": 6.852507098194242e-05,
"loss": 0.4087,
"step": 231
},
{
"epoch": 1.6185297955632882,
"grad_norm": 0.3061005646641186,
"learning_rate": 6.838772535644451e-05,
"loss": 0.4062,
"step": 232
},
{
"epoch": 1.6254893431926924,
"grad_norm": 0.3094567455663344,
"learning_rate": 6.824970207768882e-05,
"loss": 0.4056,
"step": 233
},
{
"epoch": 1.6324488908220967,
"grad_norm": 0.27807602474077575,
"learning_rate": 6.811100444047704e-05,
"loss": 0.4026,
"step": 234
},
{
"epoch": 1.6394084384515006,
"grad_norm": 0.23900746560867425,
"learning_rate": 6.797163575570866e-05,
"loss": 0.4087,
"step": 235
},
{
"epoch": 1.6463679860809046,
"grad_norm": 0.29199115797045505,
"learning_rate": 6.783159935030197e-05,
"loss": 0.4027,
"step": 236
},
{
"epoch": 1.6533275337103088,
"grad_norm": 0.3184667675259614,
"learning_rate": 6.76908985671147e-05,
"loss": 0.4041,
"step": 237
},
{
"epoch": 1.660287081339713,
"grad_norm": 0.29291241249140837,
"learning_rate": 6.754953676486415e-05,
"loss": 0.4079,
"step": 238
},
{
"epoch": 1.667246628969117,
"grad_norm": 0.2840221245598121,
"learning_rate": 6.740751731804699e-05,
"loss": 0.4011,
"step": 239
},
{
"epoch": 1.674206176598521,
"grad_norm": 0.27939828055160454,
"learning_rate": 6.726484361685882e-05,
"loss": 0.4019,
"step": 240
},
{
"epoch": 1.6811657242279252,
"grad_norm": 0.256609914729704,
"learning_rate": 6.712151906711314e-05,
"loss": 0.4048,
"step": 241
},
{
"epoch": 1.6881252718573294,
"grad_norm": 0.23745107974586638,
"learning_rate": 6.697754709016009e-05,
"loss": 0.4058,
"step": 242
},
{
"epoch": 1.6950848194867334,
"grad_norm": 0.2482380115074257,
"learning_rate": 6.683293112280475e-05,
"loss": 0.3967,
"step": 243
},
{
"epoch": 1.7020443671161374,
"grad_norm": 0.28622361734552443,
"learning_rate": 6.668767461722518e-05,
"loss": 0.4061,
"step": 244
},
{
"epoch": 1.7090039147455416,
"grad_norm": 0.32770363568621996,
"learning_rate": 6.654178104088987e-05,
"loss": 0.4033,
"step": 245
},
{
"epoch": 1.7159634623749458,
"grad_norm": 0.3279702359191176,
"learning_rate": 6.639525387647508e-05,
"loss": 0.4059,
"step": 246
},
{
"epoch": 1.7229230100043496,
"grad_norm": 0.3773747339194689,
"learning_rate": 6.62480966217817e-05,
"loss": 0.407,
"step": 247
},
{
"epoch": 1.7298825576337538,
"grad_norm": 0.2998244869603879,
"learning_rate": 6.610031278965168e-05,
"loss": 0.4064,
"step": 248
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.28510557083266236,
"learning_rate": 6.595190590788424e-05,
"loss": 0.4005,
"step": 249
},
{
"epoch": 1.743801652892562,
"grad_norm": 0.2932273577434916,
"learning_rate": 6.580287951915163e-05,
"loss": 0.4021,
"step": 250
},
{
"epoch": 1.750761200521966,
"grad_norm": 0.36298879202671214,
"learning_rate": 6.565323718091459e-05,
"loss": 0.4004,
"step": 251
},
{
"epoch": 1.7577207481513701,
"grad_norm": 0.3557701177490779,
"learning_rate": 6.550298246533735e-05,
"loss": 0.4071,
"step": 252
},
{
"epoch": 1.7646802957807743,
"grad_norm": 0.28338303699075845,
"learning_rate": 6.535211895920247e-05,
"loss": 0.4066,
"step": 253
},
{
"epoch": 1.7716398434101783,
"grad_norm": 0.24886657367644255,
"learning_rate": 6.520065026382511e-05,
"loss": 0.3955,
"step": 254
},
{
"epoch": 1.7785993910395823,
"grad_norm": 0.24062443339702383,
"learning_rate": 6.504857999496718e-05,
"loss": 0.406,
"step": 255
},
{
"epoch": 1.7855589386689865,
"grad_norm": 0.26443097823193684,
"learning_rate": 6.489591178275087e-05,
"loss": 0.4044,
"step": 256
},
{
"epoch": 1.7925184862983907,
"grad_norm": 0.3060220331852048,
"learning_rate": 6.474264927157216e-05,
"loss": 0.404,
"step": 257
},
{
"epoch": 1.7994780339277947,
"grad_norm": 0.32526179480394124,
"learning_rate": 6.45887961200137e-05,
"loss": 0.4009,
"step": 258
},
{
"epoch": 1.8064375815571987,
"grad_norm": 0.321679972746841,
"learning_rate": 6.443435600075757e-05,
"loss": 0.4056,
"step": 259
},
{
"epoch": 1.813397129186603,
"grad_norm": 0.3488328000864295,
"learning_rate": 6.42793326004975e-05,
"loss": 0.4049,
"step": 260
},
{
"epoch": 1.820356676816007,
"grad_norm": 0.34796885320723775,
"learning_rate": 6.412372961985097e-05,
"loss": 0.4048,
"step": 261
},
{
"epoch": 1.827316224445411,
"grad_norm": 0.36888125998311183,
"learning_rate": 6.396755077327081e-05,
"loss": 0.4132,
"step": 262
},
{
"epoch": 1.834275772074815,
"grad_norm": 0.42541614404670686,
"learning_rate": 6.381079978895654e-05,
"loss": 0.4026,
"step": 263
},
{
"epoch": 1.8412353197042193,
"grad_norm": 0.47952057555190714,
"learning_rate": 6.365348040876538e-05,
"loss": 0.4081,
"step": 264
},
{
"epoch": 1.8481948673336233,
"grad_norm": 0.456755628226278,
"learning_rate": 6.349559638812292e-05,
"loss": 0.4002,
"step": 265
},
{
"epoch": 1.8551544149630272,
"grad_norm": 0.3949978072972249,
"learning_rate": 6.333715149593351e-05,
"loss": 0.4048,
"step": 266
},
{
"epoch": 1.8621139625924314,
"grad_norm": 0.3684718285439735,
"learning_rate": 6.317814951449022e-05,
"loss": 0.4044,
"step": 267
},
{
"epoch": 1.8690735102218357,
"grad_norm": 0.28515408422430655,
"learning_rate": 6.301859423938463e-05,
"loss": 0.4021,
"step": 268
},
{
"epoch": 1.8760330578512396,
"grad_norm": 0.31596820921849417,
"learning_rate": 6.285848947941612e-05,
"loss": 0.3998,
"step": 269
},
{
"epoch": 1.8829926054806436,
"grad_norm": 0.3692222807296291,
"learning_rate": 6.26978390565011e-05,
"loss": 0.4061,
"step": 270
},
{
"epoch": 1.8899521531100478,
"grad_norm": 0.36126506035909883,
"learning_rate": 6.253664680558164e-05,
"loss": 0.4081,
"step": 271
},
{
"epoch": 1.896911700739452,
"grad_norm": 0.2273722979571688,
"learning_rate": 6.237491657453396e-05,
"loss": 0.4013,
"step": 272
},
{
"epoch": 1.903871248368856,
"grad_norm": 0.2249012460180903,
"learning_rate": 6.221265222407663e-05,
"loss": 0.4073,
"step": 273
},
{
"epoch": 1.91083079599826,
"grad_norm": 0.26278543615349254,
"learning_rate": 6.204985762767835e-05,
"loss": 0.3981,
"step": 274
},
{
"epoch": 1.9177903436276642,
"grad_norm": 0.23928206927128529,
"learning_rate": 6.188653667146551e-05,
"loss": 0.4005,
"step": 275
},
{
"epoch": 1.9247498912570684,
"grad_norm": 0.28051087087380006,
"learning_rate": 6.172269325412941e-05,
"loss": 0.4047,
"step": 276
},
{
"epoch": 1.9317094388864724,
"grad_norm": 0.2991219685083212,
"learning_rate": 6.15583312868332e-05,
"loss": 0.4093,
"step": 277
},
{
"epoch": 1.9386689865158764,
"grad_norm": 0.3242054335075169,
"learning_rate": 6.139345469311855e-05,
"loss": 0.4114,
"step": 278
},
{
"epoch": 1.9456285341452806,
"grad_norm": 0.3988144729091857,
"learning_rate": 6.122806740881191e-05,
"loss": 0.4081,
"step": 279
},
{
"epoch": 1.9525880817746848,
"grad_norm": 0.5183725945183102,
"learning_rate": 6.10621733819306e-05,
"loss": 0.4048,
"step": 280
},
{
"epoch": 1.9595476294040888,
"grad_norm": 0.6027727793849302,
"learning_rate": 6.089577657258863e-05,
"loss": 0.3972,
"step": 281
},
{
"epoch": 1.9665071770334928,
"grad_norm": 0.6004923341243484,
"learning_rate": 6.0728880952902056e-05,
"loss": 0.3993,
"step": 282
},
{
"epoch": 1.973466724662897,
"grad_norm": 0.5380310616547116,
"learning_rate": 6.056149050689419e-05,
"loss": 0.3982,
"step": 283
},
{
"epoch": 1.980426272292301,
"grad_norm": 0.3634273851516844,
"learning_rate": 6.039360923040059e-05,
"loss": 0.4051,
"step": 284
},
{
"epoch": 1.987385819921705,
"grad_norm": 0.24841398249046384,
"learning_rate": 6.0225241130973506e-05,
"loss": 0.4044,
"step": 285
},
{
"epoch": 1.9943453675511091,
"grad_norm": 0.31467866229451,
"learning_rate": 6.0056390227786366e-05,
"loss": 0.4052,
"step": 286
},
{
"epoch": 2.005219660722053,
"grad_norm": 0.3996733942417249,
"learning_rate": 5.9887060551537774e-05,
"loss": 0.3765,
"step": 287
},
{
"epoch": 2.012179208351457,
"grad_norm": 0.3755337827846155,
"learning_rate": 5.971725614435529e-05,
"loss": 0.367,
"step": 288
},
{
"epoch": 2.0191387559808613,
"grad_norm": 0.37714284296853545,
"learning_rate": 5.95469810596989e-05,
"loss": 0.372,
"step": 289
},
{
"epoch": 2.0260983036102655,
"grad_norm": 0.34335888494135713,
"learning_rate": 5.937623936226435e-05,
"loss": 0.3655,
"step": 290
},
{
"epoch": 2.0330578512396693,
"grad_norm": 0.3214731030718956,
"learning_rate": 5.9205035127886026e-05,
"loss": 0.3596,
"step": 291
},
{
"epoch": 2.0400173988690735,
"grad_norm": 0.3037344199720621,
"learning_rate": 5.903337244343972e-05,
"loss": 0.365,
"step": 292
},
{
"epoch": 2.0469769464984777,
"grad_norm": 0.29558276490238256,
"learning_rate": 5.8861255406745e-05,
"loss": 0.3655,
"step": 293
},
{
"epoch": 2.0539364941278815,
"grad_norm": 0.3239986635139212,
"learning_rate": 5.8688688126467514e-05,
"loss": 0.3737,
"step": 294
},
{
"epoch": 2.0608960417572857,
"grad_norm": 0.5323428158934608,
"learning_rate": 5.8515674722020745e-05,
"loss": 0.3691,
"step": 295
},
{
"epoch": 2.06785558938669,
"grad_norm": 0.6310404829026035,
"learning_rate": 5.834221932346781e-05,
"loss": 0.3742,
"step": 296
},
{
"epoch": 2.074815137016094,
"grad_norm": 0.4806664035734928,
"learning_rate": 5.8168326071422815e-05,
"loss": 0.3655,
"step": 297
},
{
"epoch": 2.081774684645498,
"grad_norm": 0.3081849802706308,
"learning_rate": 5.799399911695201e-05,
"loss": 0.3759,
"step": 298
},
{
"epoch": 2.088734232274902,
"grad_norm": 0.384198968174172,
"learning_rate": 5.781924262147471e-05,
"loss": 0.3618,
"step": 299
},
{
"epoch": 2.0956937799043063,
"grad_norm": 0.3402446023172264,
"learning_rate": 5.7644060756663954e-05,
"loss": 0.3706,
"step": 300
},
{
"epoch": 2.1026533275337105,
"grad_norm": 0.2804847723920022,
"learning_rate": 5.746845770434692e-05,
"loss": 0.3645,
"step": 301
},
{
"epoch": 2.1096128751631142,
"grad_norm": 0.33735637577255784,
"learning_rate": 5.7292437656405094e-05,
"loss": 0.3694,
"step": 302
},
{
"epoch": 2.1165724227925184,
"grad_norm": 0.29830282676341935,
"learning_rate": 5.711600481467422e-05,
"loss": 0.3661,
"step": 303
},
{
"epoch": 2.1235319704219227,
"grad_norm": 0.26892805785374885,
"learning_rate": 5.693916339084397e-05,
"loss": 0.365,
"step": 304
},
{
"epoch": 2.130491518051327,
"grad_norm": 0.2924956628530246,
"learning_rate": 5.676191760635744e-05,
"loss": 0.3682,
"step": 305
},
{
"epoch": 2.1374510656807306,
"grad_norm": 0.3322840841302584,
"learning_rate": 5.6584271692310345e-05,
"loss": 0.3591,
"step": 306
},
{
"epoch": 2.144410613310135,
"grad_norm": 0.31939252409902796,
"learning_rate": 5.640622988935006e-05,
"loss": 0.366,
"step": 307
},
{
"epoch": 2.151370160939539,
"grad_norm": 0.3675500408960479,
"learning_rate": 5.6227796447574296e-05,
"loss": 0.3721,
"step": 308
},
{
"epoch": 2.1583297085689432,
"grad_norm": 0.4269918712695122,
"learning_rate": 5.604897562642979e-05,
"loss": 0.3691,
"step": 309
},
{
"epoch": 2.165289256198347,
"grad_norm": 0.32932381192451604,
"learning_rate": 5.58697716946105e-05,
"loss": 0.3695,
"step": 310
},
{
"epoch": 2.172248803827751,
"grad_norm": 0.23672816365722077,
"learning_rate": 5.5690188929955756e-05,
"loss": 0.3718,
"step": 311
},
{
"epoch": 2.1792083514571554,
"grad_norm": 0.2685885863825375,
"learning_rate": 5.5510231619348154e-05,
"loss": 0.3626,
"step": 312
},
{
"epoch": 2.186167899086559,
"grad_norm": 0.31894739343841294,
"learning_rate": 5.5329904058611195e-05,
"loss": 0.3696,
"step": 313
},
{
"epoch": 2.1931274467159634,
"grad_norm": 0.2690402159463182,
"learning_rate": 5.514921055240674e-05,
"loss": 0.3664,
"step": 314
},
{
"epoch": 2.2000869943453676,
"grad_norm": 0.36597319341247037,
"learning_rate": 5.4968155414132294e-05,
"loss": 0.3661,
"step": 315
},
{
"epoch": 2.207046541974772,
"grad_norm": 0.2509160139038948,
"learning_rate": 5.4786742965817964e-05,
"loss": 0.3737,
"step": 316
},
{
"epoch": 2.2140060896041756,
"grad_norm": 0.22135896086783516,
"learning_rate": 5.4604977538023375e-05,
"loss": 0.3651,
"step": 317
},
{
"epoch": 2.2209656372335798,
"grad_norm": 0.21117176336652171,
"learning_rate": 5.442286346973419e-05,
"loss": 0.3694,
"step": 318
},
{
"epoch": 2.227925184862984,
"grad_norm": 0.21238366241650145,
"learning_rate": 5.424040510825867e-05,
"loss": 0.3724,
"step": 319
},
{
"epoch": 2.234884732492388,
"grad_norm": 0.18338158702931512,
"learning_rate": 5.405760680912374e-05,
"loss": 0.3706,
"step": 320
},
{
"epoch": 2.241844280121792,
"grad_norm": 0.21207069234590878,
"learning_rate": 5.387447293597113e-05,
"loss": 0.3612,
"step": 321
},
{
"epoch": 2.248803827751196,
"grad_norm": 0.2196618454401931,
"learning_rate": 5.3691007860453185e-05,
"loss": 0.3706,
"step": 322
},
{
"epoch": 2.2557633753806003,
"grad_norm": 0.28738964583045895,
"learning_rate": 5.3507215962128485e-05,
"loss": 0.3665,
"step": 323
},
{
"epoch": 2.2627229230100045,
"grad_norm": 0.2285542763691958,
"learning_rate": 5.332310162835729e-05,
"loss": 0.371,
"step": 324
},
{
"epoch": 2.2696824706394083,
"grad_norm": 0.25316534475803026,
"learning_rate": 5.313866925419685e-05,
"loss": 0.368,
"step": 325
},
{
"epoch": 2.2766420182688125,
"grad_norm": 0.30304672804412763,
"learning_rate": 5.295392324229648e-05,
"loss": 0.3681,
"step": 326
},
{
"epoch": 2.2836015658982167,
"grad_norm": 0.31451674332345964,
"learning_rate": 5.276886800279243e-05,
"loss": 0.367,
"step": 327
},
{
"epoch": 2.2905611135276205,
"grad_norm": 0.3565352812109271,
"learning_rate": 5.2583507953202654e-05,
"loss": 0.3689,
"step": 328
},
{
"epoch": 2.2975206611570247,
"grad_norm": 0.2608640444128353,
"learning_rate": 5.239784751832128e-05,
"loss": 0.3708,
"step": 329
},
{
"epoch": 2.304480208786429,
"grad_norm": 0.17931678523987182,
"learning_rate": 5.221189113011309e-05,
"loss": 0.3681,
"step": 330
},
{
"epoch": 2.311439756415833,
"grad_norm": 0.24872188995030328,
"learning_rate": 5.2025643227607656e-05,
"loss": 0.366,
"step": 331
},
{
"epoch": 2.3183993040452373,
"grad_norm": 0.2145763484568399,
"learning_rate": 5.18391082567934e-05,
"loss": 0.3608,
"step": 332
},
{
"epoch": 2.325358851674641,
"grad_norm": 0.2164912354682701,
"learning_rate": 5.1652290670511396e-05,
"loss": 0.3715,
"step": 333
},
{
"epoch": 2.3323183993040453,
"grad_norm": 0.2248326733820592,
"learning_rate": 5.1465194928349215e-05,
"loss": 0.3723,
"step": 334
},
{
"epoch": 2.3392779469334495,
"grad_norm": 0.23611123081541333,
"learning_rate": 5.127782549653431e-05,
"loss": 0.368,
"step": 335
},
{
"epoch": 2.3462374945628532,
"grad_norm": 0.27552200066945187,
"learning_rate": 5.1090186847827535e-05,
"loss": 0.3681,
"step": 336
},
{
"epoch": 2.3531970421922574,
"grad_norm": 0.2473752934953281,
"learning_rate": 5.090228346141626e-05,
"loss": 0.3705,
"step": 337
},
{
"epoch": 2.3601565898216617,
"grad_norm": 0.1964697404165815,
"learning_rate": 5.071411982280754e-05,
"loss": 0.3694,
"step": 338
},
{
"epoch": 2.367116137451066,
"grad_norm": 0.18565136863582019,
"learning_rate": 5.0525700423720964e-05,
"loss": 0.3676,
"step": 339
},
{
"epoch": 2.3740756850804696,
"grad_norm": 0.2127557318833584,
"learning_rate": 5.033702976198154e-05,
"loss": 0.3652,
"step": 340
},
{
"epoch": 2.381035232709874,
"grad_norm": 0.22768416565210678,
"learning_rate": 5.0148112341412155e-05,
"loss": 0.3627,
"step": 341
},
{
"epoch": 2.387994780339278,
"grad_norm": 0.20405791268703083,
"learning_rate": 4.9958952671726214e-05,
"loss": 0.3645,
"step": 342
},
{
"epoch": 2.394954327968682,
"grad_norm": 0.18709352719221842,
"learning_rate": 4.976955526841995e-05,
"loss": 0.3744,
"step": 343
},
{
"epoch": 2.401913875598086,
"grad_norm": 0.2257696533468009,
"learning_rate": 4.9579924652664624e-05,
"loss": 0.3659,
"step": 344
},
{
"epoch": 2.40887342322749,
"grad_norm": 0.22579970351795095,
"learning_rate": 4.939006535119851e-05,
"loss": 0.3721,
"step": 345
},
{
"epoch": 2.4158329708568944,
"grad_norm": 0.1695044273561281,
"learning_rate": 4.919998189621902e-05,
"loss": 0.3717,
"step": 346
},
{
"epoch": 2.4227925184862986,
"grad_norm": 0.1868306496768378,
"learning_rate": 4.9009678825274344e-05,
"loss": 0.37,
"step": 347
},
{
"epoch": 2.4297520661157024,
"grad_norm": 0.203208513247505,
"learning_rate": 4.8819160681155245e-05,
"loss": 0.3687,
"step": 348
},
{
"epoch": 2.4367116137451066,
"grad_norm": 0.18225921597742265,
"learning_rate": 4.8628432011786536e-05,
"loss": 0.3722,
"step": 349
},
{
"epoch": 2.443671161374511,
"grad_norm": 0.19704873995226552,
"learning_rate": 4.843749737011858e-05,
"loss": 0.3767,
"step": 350
},
{
"epoch": 2.4506307090039146,
"grad_norm": 0.17968261581935796,
"learning_rate": 4.8246361314018566e-05,
"loss": 0.3674,
"step": 351
},
{
"epoch": 2.4575902566333188,
"grad_norm": 0.16849328618057585,
"learning_rate": 4.805502840616171e-05,
"loss": 0.3676,
"step": 352
},
{
"epoch": 2.464549804262723,
"grad_norm": 0.17392306484481443,
"learning_rate": 4.786350321392237e-05,
"loss": 0.3598,
"step": 353
},
{
"epoch": 2.471509351892127,
"grad_norm": 0.15047667342669488,
"learning_rate": 4.767179030926492e-05,
"loss": 0.3626,
"step": 354
},
{
"epoch": 2.478468899521531,
"grad_norm": 0.18466737228109523,
"learning_rate": 4.7479894268634794e-05,
"loss": 0.3644,
"step": 355
},
{
"epoch": 2.485428447150935,
"grad_norm": 0.19259024117568577,
"learning_rate": 4.728781967284904e-05,
"loss": 0.3666,
"step": 356
},
{
"epoch": 2.4923879947803393,
"grad_norm": 0.1658133386266871,
"learning_rate": 4.7095571106987096e-05,
"loss": 0.3706,
"step": 357
},
{
"epoch": 2.4993475424097435,
"grad_norm": 0.1857293176933773,
"learning_rate": 4.6903153160281266e-05,
"loss": 0.3658,
"step": 358
},
{
"epoch": 2.5063070900391473,
"grad_norm": 0.1780806265365307,
"learning_rate": 4.671057042600728e-05,
"loss": 0.37,
"step": 359
},
{
"epoch": 2.5132666376685515,
"grad_norm": 0.18451738808389523,
"learning_rate": 4.6517827501374466e-05,
"loss": 0.367,
"step": 360
},
{
"epoch": 2.5202261852979557,
"grad_norm": 0.19964181415078697,
"learning_rate": 4.632492898741619e-05,
"loss": 0.3679,
"step": 361
},
{
"epoch": 2.52718573292736,
"grad_norm": 0.20121807599019395,
"learning_rate": 4.61318794888799e-05,
"loss": 0.3664,
"step": 362
},
{
"epoch": 2.5341452805567637,
"grad_norm": 0.1711607561791567,
"learning_rate": 4.593868361411729e-05,
"loss": 0.3719,
"step": 363
},
{
"epoch": 2.541104828186168,
"grad_norm": 0.20920063909952305,
"learning_rate": 4.57453459749742e-05,
"loss": 0.3677,
"step": 364
},
{
"epoch": 2.548064375815572,
"grad_norm": 0.20148754034737523,
"learning_rate": 4.555187118668064e-05,
"loss": 0.3715,
"step": 365
},
{
"epoch": 2.555023923444976,
"grad_norm": 0.16950935825604083,
"learning_rate": 4.53582638677405e-05,
"loss": 0.3668,
"step": 366
},
{
"epoch": 2.56198347107438,
"grad_norm": 0.18835333573041302,
"learning_rate": 4.516452863982138e-05,
"loss": 0.3642,
"step": 367
},
{
"epoch": 2.5689430187037843,
"grad_norm": 0.19215610553252904,
"learning_rate": 4.497067012764423e-05,
"loss": 0.3691,
"step": 368
},
{
"epoch": 2.5759025663331885,
"grad_norm": 0.14122968503331657,
"learning_rate": 4.477669295887299e-05,
"loss": 0.3682,
"step": 369
},
{
"epoch": 2.5828621139625927,
"grad_norm": 0.21501799625798765,
"learning_rate": 4.458260176400404e-05,
"loss": 0.3778,
"step": 370
},
{
"epoch": 2.5898216615919964,
"grad_norm": 0.19997905382170258,
"learning_rate": 4.4388401176255765e-05,
"loss": 0.367,
"step": 371
},
{
"epoch": 2.5967812092214007,
"grad_norm": 0.16385556465949214,
"learning_rate": 4.419409583145787e-05,
"loss": 0.3671,
"step": 372
},
{
"epoch": 2.603740756850805,
"grad_norm": 0.21677818126974885,
"learning_rate": 4.3999690367940796e-05,
"loss": 0.3685,
"step": 373
},
{
"epoch": 2.6107003044802086,
"grad_norm": 0.20763772842399864,
"learning_rate": 4.3805189426424895e-05,
"loss": 0.3637,
"step": 374
},
{
"epoch": 2.617659852109613,
"grad_norm": 0.15620005918797172,
"learning_rate": 4.361059764990977e-05,
"loss": 0.3612,
"step": 375
},
{
"epoch": 2.624619399739017,
"grad_norm": 0.21129929527164582,
"learning_rate": 4.341591968356332e-05,
"loss": 0.36,
"step": 376
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.2263848905453181,
"learning_rate": 4.322116017461094e-05,
"loss": 0.367,
"step": 377
},
{
"epoch": 2.638538494997825,
"grad_norm": 0.19202782447376043,
"learning_rate": 4.3026323772224564e-05,
"loss": 0.3741,
"step": 378
},
{
"epoch": 2.645498042627229,
"grad_norm": 0.21118865625881889,
"learning_rate": 4.283141512741168e-05,
"loss": 0.3613,
"step": 379
},
{
"epoch": 2.6524575902566334,
"grad_norm": 0.22052270512412744,
"learning_rate": 4.263643889290425e-05,
"loss": 0.3772,
"step": 380
},
{
"epoch": 2.659417137886037,
"grad_norm": 0.22167433442606896,
"learning_rate": 4.244139972304775e-05,
"loss": 0.3679,
"step": 381
},
{
"epoch": 2.6663766855154414,
"grad_norm": 0.1649098717811062,
"learning_rate": 4.224630227368998e-05,
"loss": 0.37,
"step": 382
},
{
"epoch": 2.6733362331448456,
"grad_norm": 0.16002894116004632,
"learning_rate": 4.2051151202069976e-05,
"loss": 0.3687,
"step": 383
},
{
"epoch": 2.68029578077425,
"grad_norm": 0.20116234602993452,
"learning_rate": 4.1855951166706783e-05,
"loss": 0.3658,
"step": 384
},
{
"epoch": 2.687255328403654,
"grad_norm": 0.1867737945734333,
"learning_rate": 4.166070682728826e-05,
"loss": 0.3636,
"step": 385
},
{
"epoch": 2.6942148760330578,
"grad_norm": 0.20649674616661776,
"learning_rate": 4.1465422844559914e-05,
"loss": 0.369,
"step": 386
},
{
"epoch": 2.701174423662462,
"grad_norm": 0.2019593756587488,
"learning_rate": 4.127010388021355e-05,
"loss": 0.3707,
"step": 387
},
{
"epoch": 2.708133971291866,
"grad_norm": 0.18670876706954212,
"learning_rate": 4.1074754596776076e-05,
"loss": 0.3723,
"step": 388
},
{
"epoch": 2.71509351892127,
"grad_norm": 0.1760226934500404,
"learning_rate": 4.087937965749808e-05,
"loss": 0.3652,
"step": 389
},
{
"epoch": 2.722053066550674,
"grad_norm": 0.19645887620326913,
"learning_rate": 4.068398372624268e-05,
"loss": 0.3599,
"step": 390
},
{
"epoch": 2.7290126141800783,
"grad_norm": 0.19578274938214618,
"learning_rate": 4.0488571467374035e-05,
"loss": 0.3581,
"step": 391
},
{
"epoch": 2.7359721618094826,
"grad_norm": 0.18492474565139713,
"learning_rate": 4.02931475456461e-05,
"loss": 0.3685,
"step": 392
},
{
"epoch": 2.7429317094388863,
"grad_norm": 0.19837302215210767,
"learning_rate": 4.009771662609122e-05,
"loss": 0.3672,
"step": 393
},
{
"epoch": 2.7498912570682905,
"grad_norm": 0.19983845339321854,
"learning_rate": 3.990228337390879e-05,
"loss": 0.37,
"step": 394
},
{
"epoch": 2.7568508046976947,
"grad_norm": 0.21784022283680937,
"learning_rate": 3.970685245435391e-05,
"loss": 0.3654,
"step": 395
},
{
"epoch": 2.7638103523270985,
"grad_norm": 0.1710051588696074,
"learning_rate": 3.951142853262598e-05,
"loss": 0.3672,
"step": 396
},
{
"epoch": 2.7707698999565027,
"grad_norm": 0.1838813552978722,
"learning_rate": 3.931601627375733e-05,
"loss": 0.3657,
"step": 397
},
{
"epoch": 2.777729447585907,
"grad_norm": 0.16862615564852854,
"learning_rate": 3.9120620342501934e-05,
"loss": 0.3638,
"step": 398
},
{
"epoch": 2.784688995215311,
"grad_norm": 0.1799106397970807,
"learning_rate": 3.8925245403223944e-05,
"loss": 0.3643,
"step": 399
},
{
"epoch": 2.7916485428447153,
"grad_norm": 0.15331361673144303,
"learning_rate": 3.872989611978644e-05,
"loss": 0.3629,
"step": 400
},
{
"epoch": 2.798608090474119,
"grad_norm": 0.15271887998704453,
"learning_rate": 3.85345771554401e-05,
"loss": 0.3652,
"step": 401
},
{
"epoch": 2.8055676381035233,
"grad_norm": 0.15620132253760882,
"learning_rate": 3.833929317271175e-05,
"loss": 0.3602,
"step": 402
},
{
"epoch": 2.8125271857329275,
"grad_norm": 0.15541759415132128,
"learning_rate": 3.814404883329324e-05,
"loss": 0.3696,
"step": 403
},
{
"epoch": 2.8194867333623312,
"grad_norm": 0.1614291271877618,
"learning_rate": 3.794884879793004e-05,
"loss": 0.3657,
"step": 404
},
{
"epoch": 2.8264462809917354,
"grad_norm": 0.13456071859787921,
"learning_rate": 3.7753697726310026e-05,
"loss": 0.3646,
"step": 405
},
{
"epoch": 2.8334058286211397,
"grad_norm": 0.1451353124821423,
"learning_rate": 3.755860027695225e-05,
"loss": 0.3706,
"step": 406
},
{
"epoch": 2.840365376250544,
"grad_norm": 0.14928014331184716,
"learning_rate": 3.7363561107095765e-05,
"loss": 0.3677,
"step": 407
},
{
"epoch": 2.847324923879948,
"grad_norm": 0.13937655853284261,
"learning_rate": 3.7168584872588336e-05,
"loss": 0.3642,
"step": 408
},
{
"epoch": 2.854284471509352,
"grad_norm": 0.16336855650022625,
"learning_rate": 3.697367622777545e-05,
"loss": 0.3632,
"step": 409
},
{
"epoch": 2.861244019138756,
"grad_norm": 0.1545871776685994,
"learning_rate": 3.677883982538907e-05,
"loss": 0.3703,
"step": 410
},
{
"epoch": 2.86820356676816,
"grad_norm": 0.14861237645334818,
"learning_rate": 3.6584080316436696e-05,
"loss": 0.3632,
"step": 411
},
{
"epoch": 2.875163114397564,
"grad_norm": 0.1386742246058214,
"learning_rate": 3.638940235009025e-05,
"loss": 0.3691,
"step": 412
},
{
"epoch": 2.882122662026968,
"grad_norm": 0.1496262020436169,
"learning_rate": 3.619481057357511e-05,
"loss": 0.3649,
"step": 413
},
{
"epoch": 2.8890822096563724,
"grad_norm": 0.1274326314422379,
"learning_rate": 3.600030963205922e-05,
"loss": 0.3702,
"step": 414
},
{
"epoch": 2.8960417572857766,
"grad_norm": 0.14829018850055273,
"learning_rate": 3.580590416854214e-05,
"loss": 0.3641,
"step": 415
},
{
"epoch": 2.9030013049151804,
"grad_norm": 0.15397049410838218,
"learning_rate": 3.561159882374425e-05,
"loss": 0.3655,
"step": 416
},
{
"epoch": 2.9099608525445846,
"grad_norm": 0.149564207070461,
"learning_rate": 3.541739823599598e-05,
"loss": 0.3638,
"step": 417
},
{
"epoch": 2.916920400173989,
"grad_norm": 0.14607395090587116,
"learning_rate": 3.5223307041127025e-05,
"loss": 0.3675,
"step": 418
},
{
"epoch": 2.9238799478033926,
"grad_norm": 0.17923632050969646,
"learning_rate": 3.502932987235577e-05,
"loss": 0.369,
"step": 419
},
{
"epoch": 2.9308394954327968,
"grad_norm": 0.1538306766022074,
"learning_rate": 3.4835471360178626e-05,
"loss": 0.369,
"step": 420
},
{
"epoch": 2.937799043062201,
"grad_norm": 0.17364066496444236,
"learning_rate": 3.464173613225951e-05,
"loss": 0.3678,
"step": 421
},
{
"epoch": 2.944758590691605,
"grad_norm": 0.14792158024656513,
"learning_rate": 3.4448128813319365e-05,
"loss": 0.3706,
"step": 422
},
{
"epoch": 2.9517181383210094,
"grad_norm": 0.16703857041888837,
"learning_rate": 3.425465402502581e-05,
"loss": 0.3668,
"step": 423
},
{
"epoch": 2.958677685950413,
"grad_norm": 0.16247636991051975,
"learning_rate": 3.406131638588273e-05,
"loss": 0.3613,
"step": 424
},
{
"epoch": 2.9656372335798173,
"grad_norm": 0.14211396104389656,
"learning_rate": 3.386812051112011e-05,
"loss": 0.3678,
"step": 425
},
{
"epoch": 2.9725967812092216,
"grad_norm": 0.1664600329507516,
"learning_rate": 3.367507101258382e-05,
"loss": 0.359,
"step": 426
},
{
"epoch": 2.9795563288386253,
"grad_norm": 0.15867128125681806,
"learning_rate": 3.348217249862555e-05,
"loss": 0.3749,
"step": 427
},
{
"epoch": 2.9865158764680295,
"grad_norm": 0.12942815057080284,
"learning_rate": 3.328942957399274e-05,
"loss": 0.3692,
"step": 428
},
{
"epoch": 2.9934754240974337,
"grad_norm": 0.14130545610018397,
"learning_rate": 3.309684683971874e-05,
"loss": 0.3673,
"step": 429
},
{
"epoch": 3.0043497172683775,
"grad_norm": 0.16060728870914467,
"learning_rate": 3.2904428893012924e-05,
"loss": 0.3474,
"step": 430
},
{
"epoch": 3.0113092648977817,
"grad_norm": 0.1671503265033675,
"learning_rate": 3.2712180327150965e-05,
"loss": 0.3352,
"step": 431
},
{
"epoch": 3.018268812527186,
"grad_norm": 0.17602527650846453,
"learning_rate": 3.252010573136521e-05,
"loss": 0.3334,
"step": 432
},
{
"epoch": 3.0252283601565897,
"grad_norm": 0.19388540072595992,
"learning_rate": 3.2328209690735085e-05,
"loss": 0.3368,
"step": 433
},
{
"epoch": 3.032187907785994,
"grad_norm": 0.18871272314861076,
"learning_rate": 3.213649678607765e-05,
"loss": 0.3276,
"step": 434
},
{
"epoch": 3.039147455415398,
"grad_norm": 0.1825751881244417,
"learning_rate": 3.19449715938383e-05,
"loss": 0.3264,
"step": 435
},
{
"epoch": 3.046107003044802,
"grad_norm": 0.20536550174110904,
"learning_rate": 3.175363868598145e-05,
"loss": 0.3336,
"step": 436
},
{
"epoch": 3.053066550674206,
"grad_norm": 0.19028693715972822,
"learning_rate": 3.1562502629881435e-05,
"loss": 0.3361,
"step": 437
},
{
"epoch": 3.0600260983036103,
"grad_norm": 0.19144645634840593,
"learning_rate": 3.137156798821347e-05,
"loss": 0.3295,
"step": 438
},
{
"epoch": 3.0669856459330145,
"grad_norm": 0.15458500965875127,
"learning_rate": 3.118083931884477e-05,
"loss": 0.3325,
"step": 439
},
{
"epoch": 3.0739451935624182,
"grad_norm": 0.16190332070850957,
"learning_rate": 3.099032117472567e-05,
"loss": 0.324,
"step": 440
},
{
"epoch": 3.0809047411918224,
"grad_norm": 0.1931582699954326,
"learning_rate": 3.0800018103780997e-05,
"loss": 0.3319,
"step": 441
},
{
"epoch": 3.0878642888212267,
"grad_norm": 0.15544825033730889,
"learning_rate": 3.060993464880151e-05,
"loss": 0.3312,
"step": 442
},
{
"epoch": 3.094823836450631,
"grad_norm": 0.19712552027556793,
"learning_rate": 3.0420075347335403e-05,
"loss": 0.3358,
"step": 443
},
{
"epoch": 3.1017833840800346,
"grad_norm": 0.16525888661776322,
"learning_rate": 3.023044473158004e-05,
"loss": 0.3286,
"step": 444
},
{
"epoch": 3.108742931709439,
"grad_norm": 0.16251875749806674,
"learning_rate": 3.0041047328273786e-05,
"loss": 0.3371,
"step": 445
},
{
"epoch": 3.115702479338843,
"grad_norm": 0.14251069293212443,
"learning_rate": 2.9851887658587865e-05,
"loss": 0.3323,
"step": 446
},
{
"epoch": 3.1226620269682472,
"grad_norm": 0.14225759957337078,
"learning_rate": 2.9662970238018472e-05,
"loss": 0.3323,
"step": 447
},
{
"epoch": 3.129621574597651,
"grad_norm": 0.12972258276234128,
"learning_rate": 2.947429957627904e-05,
"loss": 0.3289,
"step": 448
},
{
"epoch": 3.136581122227055,
"grad_norm": 0.13682940985467507,
"learning_rate": 2.9285880177192475e-05,
"loss": 0.3265,
"step": 449
},
{
"epoch": 3.1435406698564594,
"grad_norm": 0.12277970656857606,
"learning_rate": 2.9097716538583746e-05,
"loss": 0.3282,
"step": 450
},
{
"epoch": 3.1505002174858636,
"grad_norm": 0.1543558782856671,
"learning_rate": 2.8909813152172472e-05,
"loss": 0.3335,
"step": 451
},
{
"epoch": 3.1574597651152674,
"grad_norm": 0.14314974220458332,
"learning_rate": 2.8722174503465697e-05,
"loss": 0.3367,
"step": 452
},
{
"epoch": 3.1644193127446716,
"grad_norm": 0.133141477333654,
"learning_rate": 2.8534805071650802e-05,
"loss": 0.3306,
"step": 453
},
{
"epoch": 3.171378860374076,
"grad_norm": 0.15733593589203457,
"learning_rate": 2.834770932948862e-05,
"loss": 0.334,
"step": 454
},
{
"epoch": 3.17833840800348,
"grad_norm": 0.14584273626489633,
"learning_rate": 2.816089174320663e-05,
"loss": 0.3325,
"step": 455
},
{
"epoch": 3.1852979556328838,
"grad_norm": 0.14481726585440186,
"learning_rate": 2.7974356772392347e-05,
"loss": 0.3381,
"step": 456
},
{
"epoch": 3.192257503262288,
"grad_norm": 0.16331363108391558,
"learning_rate": 2.7788108869886917e-05,
"loss": 0.334,
"step": 457
},
{
"epoch": 3.199217050891692,
"grad_norm": 0.11664377872886357,
"learning_rate": 2.7602152481678726e-05,
"loss": 0.3308,
"step": 458
},
{
"epoch": 3.206176598521096,
"grad_norm": 0.14495158600624636,
"learning_rate": 2.741649204679736e-05,
"loss": 0.3336,
"step": 459
},
{
"epoch": 3.2131361461505,
"grad_norm": 0.1229178497126909,
"learning_rate": 2.723113199720757e-05,
"loss": 0.3386,
"step": 460
},
{
"epoch": 3.2200956937799043,
"grad_norm": 0.12926743116724196,
"learning_rate": 2.7046076757703524e-05,
"loss": 0.3358,
"step": 461
},
{
"epoch": 3.2270552414093086,
"grad_norm": 0.1313312996235397,
"learning_rate": 2.6861330745803167e-05,
"loss": 0.3397,
"step": 462
},
{
"epoch": 3.2340147890387123,
"grad_norm": 0.1309779972337031,
"learning_rate": 2.6676898371642726e-05,
"loss": 0.3338,
"step": 463
},
{
"epoch": 3.2409743366681165,
"grad_norm": 0.11811653221059722,
"learning_rate": 2.6492784037871532e-05,
"loss": 0.3316,
"step": 464
},
{
"epoch": 3.2479338842975207,
"grad_norm": 0.1434333839230034,
"learning_rate": 2.6308992139546825e-05,
"loss": 0.3348,
"step": 465
},
{
"epoch": 3.254893431926925,
"grad_norm": 0.11122333012963419,
"learning_rate": 2.6125527064028874e-05,
"loss": 0.3351,
"step": 466
},
{
"epoch": 3.2618529795563287,
"grad_norm": 0.1403831175014317,
"learning_rate": 2.5942393190876268e-05,
"loss": 0.3301,
"step": 467
},
{
"epoch": 3.268812527185733,
"grad_norm": 0.11875350967381776,
"learning_rate": 2.5759594891741345e-05,
"loss": 0.3361,
"step": 468
},
{
"epoch": 3.275772074815137,
"grad_norm": 0.1352653946139823,
"learning_rate": 2.55771365302658e-05,
"loss": 0.3293,
"step": 469
},
{
"epoch": 3.2827316224445413,
"grad_norm": 0.12259046772867968,
"learning_rate": 2.539502246197663e-05,
"loss": 0.3317,
"step": 470
},
{
"epoch": 3.289691170073945,
"grad_norm": 0.1341784005406857,
"learning_rate": 2.5213257034182042e-05,
"loss": 0.3336,
"step": 471
},
{
"epoch": 3.2966507177033493,
"grad_norm": 0.13864288710375855,
"learning_rate": 2.503184458586772e-05,
"loss": 0.3368,
"step": 472
},
{
"epoch": 3.3036102653327535,
"grad_norm": 0.13118987129584506,
"learning_rate": 2.4850789447593276e-05,
"loss": 0.3367,
"step": 473
},
{
"epoch": 3.3105698129621572,
"grad_norm": 0.1377873050994517,
"learning_rate": 2.4670095941388822e-05,
"loss": 0.3388,
"step": 474
},
{
"epoch": 3.3175293605915614,
"grad_norm": 0.12322175327795074,
"learning_rate": 2.4489768380651856e-05,
"loss": 0.3333,
"step": 475
},
{
"epoch": 3.3244889082209657,
"grad_norm": 0.13741800125839718,
"learning_rate": 2.4309811070044247e-05,
"loss": 0.3327,
"step": 476
},
{
"epoch": 3.33144845585037,
"grad_norm": 0.11467772477556636,
"learning_rate": 2.4130228305389514e-05,
"loss": 0.329,
"step": 477
},
{
"epoch": 3.3384080034797736,
"grad_norm": 0.13098647128626187,
"learning_rate": 2.3951024373570214e-05,
"loss": 0.3373,
"step": 478
},
{
"epoch": 3.345367551109178,
"grad_norm": 0.10605964176743833,
"learning_rate": 2.3772203552425717e-05,
"loss": 0.3276,
"step": 479
},
{
"epoch": 3.352327098738582,
"grad_norm": 0.1369975822039289,
"learning_rate": 2.3593770110649966e-05,
"loss": 0.3287,
"step": 480
},
{
"epoch": 3.3592866463679862,
"grad_norm": 0.10746583370109318,
"learning_rate": 2.341572830768965e-05,
"loss": 0.3247,
"step": 481
},
{
"epoch": 3.36624619399739,
"grad_norm": 0.12619643852285148,
"learning_rate": 2.323808239364256e-05,
"loss": 0.3334,
"step": 482
},
{
"epoch": 3.373205741626794,
"grad_norm": 0.11299951719427818,
"learning_rate": 2.306083660915604e-05,
"loss": 0.3314,
"step": 483
},
{
"epoch": 3.3801652892561984,
"grad_norm": 0.12698843914556834,
"learning_rate": 2.2883995185325797e-05,
"loss": 0.3269,
"step": 484
},
{
"epoch": 3.3871248368856026,
"grad_norm": 0.11590705576436211,
"learning_rate": 2.2707562343594916e-05,
"loss": 0.3378,
"step": 485
},
{
"epoch": 3.3940843845150064,
"grad_norm": 0.12503790075980037,
"learning_rate": 2.2531542295653094e-05,
"loss": 0.336,
"step": 486
},
{
"epoch": 3.4010439321444106,
"grad_norm": 0.11611542174394521,
"learning_rate": 2.235593924333607e-05,
"loss": 0.3347,
"step": 487
},
{
"epoch": 3.408003479773815,
"grad_norm": 0.12221901254166186,
"learning_rate": 2.21807573785253e-05,
"loss": 0.3333,
"step": 488
},
{
"epoch": 3.4149630274032186,
"grad_norm": 0.12496755906276005,
"learning_rate": 2.2006000883048008e-05,
"loss": 0.331,
"step": 489
},
{
"epoch": 3.4219225750326228,
"grad_norm": 0.1160790840797104,
"learning_rate": 2.183167392857719e-05,
"loss": 0.3347,
"step": 490
},
{
"epoch": 3.428882122662027,
"grad_norm": 0.12271399113397469,
"learning_rate": 2.1657780676532205e-05,
"loss": 0.3371,
"step": 491
},
{
"epoch": 3.435841670291431,
"grad_norm": 0.11513590243752209,
"learning_rate": 2.1484325277979278e-05,
"loss": 0.3336,
"step": 492
},
{
"epoch": 3.4428012179208354,
"grad_norm": 0.13194006381727105,
"learning_rate": 2.1311311873532502e-05,
"loss": 0.3346,
"step": 493
},
{
"epoch": 3.449760765550239,
"grad_norm": 0.10982920455618568,
"learning_rate": 2.1138744593254997e-05,
"loss": 0.3304,
"step": 494
},
{
"epoch": 3.4567203131796433,
"grad_norm": 0.1485401459853815,
"learning_rate": 2.09666275565603e-05,
"loss": 0.3296,
"step": 495
},
{
"epoch": 3.4636798608090476,
"grad_norm": 0.12077637074742696,
"learning_rate": 2.0794964872113987e-05,
"loss": 0.3354,
"step": 496
},
{
"epoch": 3.4706394084384513,
"grad_norm": 0.12161585081850207,
"learning_rate": 2.062376063773567e-05,
"loss": 0.3273,
"step": 497
},
{
"epoch": 3.4775989560678555,
"grad_norm": 0.11631137052693763,
"learning_rate": 2.045301894030111e-05,
"loss": 0.3358,
"step": 498
},
{
"epoch": 3.4845585036972597,
"grad_norm": 0.12349941669465997,
"learning_rate": 2.0282743855644727e-05,
"loss": 0.3297,
"step": 499
},
{
"epoch": 3.491518051326664,
"grad_norm": 0.10080665328455962,
"learning_rate": 2.011293944846222e-05,
"loss": 0.3322,
"step": 500
},
{
"epoch": 3.4984775989560677,
"grad_norm": 0.12452596754190881,
"learning_rate": 1.994360977221364e-05,
"loss": 0.3378,
"step": 501
},
{
"epoch": 3.505437146585472,
"grad_norm": 0.10178078398456272,
"learning_rate": 1.97747588690265e-05,
"loss": 0.3254,
"step": 502
},
{
"epoch": 3.512396694214876,
"grad_norm": 0.11961132752342465,
"learning_rate": 1.9606390769599426e-05,
"loss": 0.3325,
"step": 503
},
{
"epoch": 3.51935624184428,
"grad_norm": 0.10380768215954524,
"learning_rate": 1.9438509493105816e-05,
"loss": 0.3301,
"step": 504
},
{
"epoch": 3.526315789473684,
"grad_norm": 0.10192138810793511,
"learning_rate": 1.9271119047097967e-05,
"loss": 0.3343,
"step": 505
},
{
"epoch": 3.5332753371030883,
"grad_norm": 0.11375055774791937,
"learning_rate": 1.910422342741136e-05,
"loss": 0.3354,
"step": 506
},
{
"epoch": 3.5402348847324925,
"grad_norm": 0.09713201709606016,
"learning_rate": 1.8937826618069396e-05,
"loss": 0.3326,
"step": 507
},
{
"epoch": 3.5471944323618967,
"grad_norm": 0.11482174079205225,
"learning_rate": 1.8771932591188106e-05,
"loss": 0.3358,
"step": 508
},
{
"epoch": 3.5541539799913004,
"grad_norm": 0.1085601568655917,
"learning_rate": 1.860654530688147e-05,
"loss": 0.3316,
"step": 509
},
{
"epoch": 3.5611135276207047,
"grad_norm": 0.11763399705418416,
"learning_rate": 1.84416687131668e-05,
"loss": 0.3296,
"step": 510
},
{
"epoch": 3.568073075250109,
"grad_norm": 0.11136798232627028,
"learning_rate": 1.8277306745870605e-05,
"loss": 0.3328,
"step": 511
},
{
"epoch": 3.5750326228795126,
"grad_norm": 0.12672402635855642,
"learning_rate": 1.811346332853451e-05,
"loss": 0.332,
"step": 512
},
{
"epoch": 3.581992170508917,
"grad_norm": 0.11623196365691729,
"learning_rate": 1.7950142372321658e-05,
"loss": 0.332,
"step": 513
},
{
"epoch": 3.588951718138321,
"grad_norm": 0.107315476806856,
"learning_rate": 1.778734777592337e-05,
"loss": 0.3317,
"step": 514
},
{
"epoch": 3.5959112657677252,
"grad_norm": 0.11801835716954666,
"learning_rate": 1.7625083425466044e-05,
"loss": 0.3339,
"step": 515
},
{
"epoch": 3.6028708133971294,
"grad_norm": 0.0966733318424344,
"learning_rate": 1.746335319441838e-05,
"loss": 0.3254,
"step": 516
},
{
"epoch": 3.609830361026533,
"grad_norm": 0.11771321695825486,
"learning_rate": 1.7302160943498916e-05,
"loss": 0.3354,
"step": 517
},
{
"epoch": 3.6167899086559374,
"grad_norm": 0.10460865568553186,
"learning_rate": 1.7141510520583887e-05,
"loss": 0.3305,
"step": 518
},
{
"epoch": 3.6237494562853416,
"grad_norm": 0.11996000392740622,
"learning_rate": 1.698140576061538e-05,
"loss": 0.339,
"step": 519
},
{
"epoch": 3.6307090039147454,
"grad_norm": 0.09741048167997303,
"learning_rate": 1.6821850485509784e-05,
"loss": 0.3366,
"step": 520
},
{
"epoch": 3.6376685515441496,
"grad_norm": 0.1142703570198178,
"learning_rate": 1.6662848504066502e-05,
"loss": 0.3337,
"step": 521
},
{
"epoch": 3.644628099173554,
"grad_norm": 0.09709378850935774,
"learning_rate": 1.6504403611877098e-05,
"loss": 0.3322,
"step": 522
},
{
"epoch": 3.651587646802958,
"grad_norm": 0.10615913057072311,
"learning_rate": 1.6346519591234637e-05,
"loss": 0.3325,
"step": 523
},
{
"epoch": 3.6585471944323618,
"grad_norm": 0.1156860445622765,
"learning_rate": 1.6189200211043484e-05,
"loss": 0.3347,
"step": 524
},
{
"epoch": 3.665506742061766,
"grad_norm": 0.10972351654483692,
"learning_rate": 1.6032449226729195e-05,
"loss": 0.3354,
"step": 525
},
{
"epoch": 3.67246628969117,
"grad_norm": 0.12167997828635133,
"learning_rate": 1.5876270380149038e-05,
"loss": 0.3371,
"step": 526
},
{
"epoch": 3.679425837320574,
"grad_norm": 0.1227635833803301,
"learning_rate": 1.57206673995025e-05,
"loss": 0.3303,
"step": 527
},
{
"epoch": 3.686385384949978,
"grad_norm": 0.10671146343573032,
"learning_rate": 1.556564399924244e-05,
"loss": 0.3301,
"step": 528
},
{
"epoch": 3.6933449325793823,
"grad_norm": 0.11109924997469728,
"learning_rate": 1.541120387998631e-05,
"loss": 0.3295,
"step": 529
},
{
"epoch": 3.7003044802087866,
"grad_norm": 0.10312763393789623,
"learning_rate": 1.5257350728427862e-05,
"loss": 0.3361,
"step": 530
},
{
"epoch": 3.7072640278381908,
"grad_norm": 0.10629651651459636,
"learning_rate": 1.5104088217249132e-05,
"loss": 0.3321,
"step": 531
},
{
"epoch": 3.7142235754675945,
"grad_norm": 0.09725532964037208,
"learning_rate": 1.4951420005032828e-05,
"loss": 0.3379,
"step": 532
},
{
"epoch": 3.7211831230969987,
"grad_norm": 0.11511675877236498,
"learning_rate": 1.4799349736174891e-05,
"loss": 0.3307,
"step": 533
},
{
"epoch": 3.728142670726403,
"grad_norm": 0.09824052462354282,
"learning_rate": 1.4647881040797547e-05,
"loss": 0.3273,
"step": 534
},
{
"epoch": 3.7351022183558067,
"grad_norm": 0.10324613912974012,
"learning_rate": 1.4497017534662651e-05,
"loss": 0.3344,
"step": 535
},
{
"epoch": 3.742061765985211,
"grad_norm": 0.10581522915369096,
"learning_rate": 1.4346762819085424e-05,
"loss": 0.3342,
"step": 536
},
{
"epoch": 3.749021313614615,
"grad_norm": 0.10101793752872175,
"learning_rate": 1.4197120480848381e-05,
"loss": 0.3348,
"step": 537
},
{
"epoch": 3.7559808612440193,
"grad_norm": 0.10390774039864187,
"learning_rate": 1.4048094092115774e-05,
"loss": 0.3301,
"step": 538
},
{
"epoch": 3.762940408873423,
"grad_norm": 0.10773321694134558,
"learning_rate": 1.389968721034833e-05,
"loss": 0.3353,
"step": 539
},
{
"epoch": 3.7698999565028273,
"grad_norm": 0.09221835790298968,
"learning_rate": 1.3751903378218315e-05,
"loss": 0.3329,
"step": 540
},
{
"epoch": 3.7768595041322315,
"grad_norm": 0.10820822350883358,
"learning_rate": 1.3604746123524932e-05,
"loss": 0.3278,
"step": 541
},
{
"epoch": 3.7838190517616352,
"grad_norm": 0.10470082245347596,
"learning_rate": 1.3458218959110152e-05,
"loss": 0.3371,
"step": 542
},
{
"epoch": 3.7907785993910394,
"grad_norm": 0.10207051909220886,
"learning_rate": 1.3312325382774827e-05,
"loss": 0.3371,
"step": 543
},
{
"epoch": 3.7977381470204437,
"grad_norm": 0.10101453592738402,
"learning_rate": 1.3167068877195237e-05,
"loss": 0.3265,
"step": 544
},
{
"epoch": 3.804697694649848,
"grad_norm": 0.09285174719990112,
"learning_rate": 1.3022452909839918e-05,
"loss": 0.3277,
"step": 545
},
{
"epoch": 3.811657242279252,
"grad_norm": 0.1059908590081114,
"learning_rate": 1.2878480932886874e-05,
"loss": 0.334,
"step": 546
},
{
"epoch": 3.818616789908656,
"grad_norm": 0.10635953000158242,
"learning_rate": 1.2735156383141187e-05,
"loss": 0.3325,
"step": 547
},
{
"epoch": 3.82557633753806,
"grad_norm": 0.0985299908106167,
"learning_rate": 1.2592482681953025e-05,
"loss": 0.3317,
"step": 548
},
{
"epoch": 3.8325358851674642,
"grad_norm": 0.10588236607635328,
"learning_rate": 1.2450463235135874e-05,
"loss": 0.34,
"step": 549
},
{
"epoch": 3.839495432796868,
"grad_norm": 0.10316866804543555,
"learning_rate": 1.2309101432885302e-05,
"loss": 0.3347,
"step": 550
},
{
"epoch": 3.846454980426272,
"grad_norm": 0.09961037740811082,
"learning_rate": 1.2168400649698039e-05,
"loss": 0.3351,
"step": 551
},
{
"epoch": 3.8534145280556764,
"grad_norm": 0.09817204131281733,
"learning_rate": 1.202836424429135e-05,
"loss": 0.3365,
"step": 552
},
{
"epoch": 3.8603740756850806,
"grad_norm": 0.09481914220388948,
"learning_rate": 1.1888995559522974e-05,
"loss": 0.3292,
"step": 553
},
{
"epoch": 3.867333623314485,
"grad_norm": 0.10082172771379438,
"learning_rate": 1.1750297922311193e-05,
"loss": 0.3335,
"step": 554
},
{
"epoch": 3.8742931709438886,
"grad_norm": 0.08986399832050056,
"learning_rate": 1.1612274643555504e-05,
"loss": 0.3284,
"step": 555
},
{
"epoch": 3.881252718573293,
"grad_norm": 0.08986088415045988,
"learning_rate": 1.1474929018057574e-05,
"loss": 0.3345,
"step": 556
},
{
"epoch": 3.8882122662026966,
"grad_norm": 0.09403102777418895,
"learning_rate": 1.1338264324442573e-05,
"loss": 0.3315,
"step": 557
},
{
"epoch": 3.8951718138321008,
"grad_norm": 0.09262927753397106,
"learning_rate": 1.1202283825080884e-05,
"loss": 0.3282,
"step": 558
},
{
"epoch": 3.902131361461505,
"grad_norm": 0.0934685977047065,
"learning_rate": 1.1066990766010274e-05,
"loss": 0.3337,
"step": 559
},
{
"epoch": 3.909090909090909,
"grad_norm": 0.09165216810312578,
"learning_rate": 1.093238837685835e-05,
"loss": 0.3318,
"step": 560
},
{
"epoch": 3.9160504567203134,
"grad_norm": 0.09756546339699468,
"learning_rate": 1.0798479870765558e-05,
"loss": 0.3282,
"step": 561
},
{
"epoch": 3.923010004349717,
"grad_norm": 0.09062420344108967,
"learning_rate": 1.0665268444308366e-05,
"loss": 0.3305,
"step": 562
},
{
"epoch": 3.9299695519791213,
"grad_norm": 0.09673675762215994,
"learning_rate": 1.0532757277423019e-05,
"loss": 0.3291,
"step": 563
},
{
"epoch": 3.9369290996085256,
"grad_norm": 0.0933424618907197,
"learning_rate": 1.0400949533329653e-05,
"loss": 0.3414,
"step": 564
},
{
"epoch": 3.9438886472379293,
"grad_norm": 0.08961372593224715,
"learning_rate": 1.0269848358456743e-05,
"loss": 0.3262,
"step": 565
},
{
"epoch": 3.9508481948673335,
"grad_norm": 0.1180626958636412,
"learning_rate": 1.0139456882365981e-05,
"loss": 0.3379,
"step": 566
},
{
"epoch": 3.9578077424967377,
"grad_norm": 0.08706924109298092,
"learning_rate": 1.0009778217677617e-05,
"loss": 0.3356,
"step": 567
},
{
"epoch": 3.964767290126142,
"grad_norm": 0.09192179644041273,
"learning_rate": 9.880815459996102e-06,
"loss": 0.3353,
"step": 568
},
{
"epoch": 3.971726837755546,
"grad_norm": 0.08744584106099203,
"learning_rate": 9.752571687836267e-06,
"loss": 0.3275,
"step": 569
},
{
"epoch": 3.97868638538495,
"grad_norm": 0.09171104289414916,
"learning_rate": 9.625049962549768e-06,
"loss": 0.3334,
"step": 570
},
{
"epoch": 3.985645933014354,
"grad_norm": 0.09202503387494353,
"learning_rate": 9.498253328252023e-06,
"loss": 0.3311,
"step": 571
},
{
"epoch": 3.9926054806437583,
"grad_norm": 0.09047931089059162,
"learning_rate": 9.372184811749544e-06,
"loss": 0.3316,
"step": 572
},
{
"epoch": 4.003479773814702,
"grad_norm": 0.11221100584378,
"learning_rate": 9.246847422467718e-06,
"loss": 0.3252,
"step": 573
},
{
"epoch": 4.010439321444106,
"grad_norm": 0.14511168880726213,
"learning_rate": 9.122244152378919e-06,
"loss": 0.3121,
"step": 574
},
{
"epoch": 4.01739886907351,
"grad_norm": 0.11527962568493744,
"learning_rate": 8.998377975931096e-06,
"loss": 0.3038,
"step": 575
},
{
"epoch": 4.024358416702914,
"grad_norm": 0.10476073099305397,
"learning_rate": 8.875251849976823e-06,
"loss": 0.3086,
"step": 576
},
{
"epoch": 4.0313179643323185,
"grad_norm": 0.12889927756242409,
"learning_rate": 8.752868713702617e-06,
"loss": 0.3109,
"step": 577
},
{
"epoch": 4.038277511961723,
"grad_norm": 0.1290059820672822,
"learning_rate": 8.63123148855888e-06,
"loss": 0.3054,
"step": 578
},
{
"epoch": 4.045237059591127,
"grad_norm": 0.12434412735412093,
"learning_rate": 8.510343078190075e-06,
"loss": 0.3147,
"step": 579
},
{
"epoch": 4.052196607220531,
"grad_norm": 0.10748206301685143,
"learning_rate": 8.39020636836545e-06,
"loss": 0.3075,
"step": 580
},
{
"epoch": 4.059156154849934,
"grad_norm": 0.10972499765580897,
"learning_rate": 8.270824226910163e-06,
"loss": 0.3078,
"step": 581
},
{
"epoch": 4.066115702479339,
"grad_norm": 0.11672965349240796,
"learning_rate": 8.152199503636819e-06,
"loss": 0.3108,
"step": 582
},
{
"epoch": 4.073075250108743,
"grad_norm": 0.11513790902472179,
"learning_rate": 8.034335030277406e-06,
"loss": 0.3034,
"step": 583
},
{
"epoch": 4.080034797738147,
"grad_norm": 0.10714140673258672,
"learning_rate": 7.917233620415716e-06,
"loss": 0.3101,
"step": 584
},
{
"epoch": 4.086994345367551,
"grad_norm": 0.10443849628188653,
"learning_rate": 7.800898069420203e-06,
"loss": 0.3119,
"step": 585
},
{
"epoch": 4.0939538929969554,
"grad_norm": 0.10270537031965189,
"learning_rate": 7.685331154377254e-06,
"loss": 0.3108,
"step": 586
},
{
"epoch": 4.10091344062636,
"grad_norm": 0.10002934787588738,
"learning_rate": 7.570535634024847e-06,
"loss": 0.3116,
"step": 587
},
{
"epoch": 4.107872988255763,
"grad_norm": 0.1020168199335099,
"learning_rate": 7.456514248686737e-06,
"loss": 0.313,
"step": 588
},
{
"epoch": 4.114832535885167,
"grad_norm": 0.09475662900974856,
"learning_rate": 7.343269720207051e-06,
"loss": 0.3187,
"step": 589
},
{
"epoch": 4.121792083514571,
"grad_norm": 0.09367174029086878,
"learning_rate": 7.2308047518852895e-06,
"loss": 0.3054,
"step": 590
},
{
"epoch": 4.128751631143976,
"grad_norm": 0.09991070813606238,
"learning_rate": 7.119122028411798e-06,
"loss": 0.3094,
"step": 591
},
{
"epoch": 4.13571117877338,
"grad_norm": 0.0971805873557556,
"learning_rate": 7.008224215803672e-06,
"loss": 0.3149,
"step": 592
},
{
"epoch": 4.142670726402784,
"grad_norm": 0.0921029731957292,
"learning_rate": 6.898113961341128e-06,
"loss": 0.3101,
"step": 593
},
{
"epoch": 4.149630274032188,
"grad_norm": 0.09523088486551998,
"learning_rate": 6.788793893504335e-06,
"loss": 0.3052,
"step": 594
},
{
"epoch": 4.156589821661592,
"grad_norm": 0.09591717117697106,
"learning_rate": 6.680266621910632e-06,
"loss": 0.3096,
"step": 595
},
{
"epoch": 4.163549369290996,
"grad_norm": 0.09084622178996993,
"learning_rate": 6.5725347372522204e-06,
"loss": 0.3137,
"step": 596
},
{
"epoch": 4.1705089169204,
"grad_norm": 0.08687253646904729,
"learning_rate": 6.465600811234356e-06,
"loss": 0.3108,
"step": 597
},
{
"epoch": 4.177468464549804,
"grad_norm": 0.09181114682155883,
"learning_rate": 6.3594673965139675e-06,
"loss": 0.3079,
"step": 598
},
{
"epoch": 4.184428012179208,
"grad_norm": 0.09294790321001072,
"learning_rate": 6.254137026638676e-06,
"loss": 0.3063,
"step": 599
},
{
"epoch": 4.1913875598086126,
"grad_norm": 0.08710789978827478,
"learning_rate": 6.149612215986334e-06,
"loss": 0.3067,
"step": 600
},
{
"epoch": 4.198347107438017,
"grad_norm": 0.08713529476536094,
"learning_rate": 6.045895459705042e-06,
"loss": 0.3106,
"step": 601
},
{
"epoch": 4.205306655067421,
"grad_norm": 0.09233253347150959,
"learning_rate": 5.94298923365352e-06,
"loss": 0.3075,
"step": 602
},
{
"epoch": 4.212266202696824,
"grad_norm": 0.08771080578385054,
"learning_rate": 5.840895994342068e-06,
"loss": 0.3115,
"step": 603
},
{
"epoch": 4.2192257503262285,
"grad_norm": 0.08701575520285076,
"learning_rate": 5.7396181788738735e-06,
"loss": 0.3115,
"step": 604
},
{
"epoch": 4.226185297955633,
"grad_norm": 0.08851880459624953,
"learning_rate": 5.639158204886861e-06,
"loss": 0.3135,
"step": 605
},
{
"epoch": 4.233144845585037,
"grad_norm": 0.09269071200856169,
"learning_rate": 5.539518470495991e-06,
"loss": 0.3122,
"step": 606
},
{
"epoch": 4.240104393214441,
"grad_norm": 0.08855364356761067,
"learning_rate": 5.440701354235995e-06,
"loss": 0.3064,
"step": 607
},
{
"epoch": 4.247063940843845,
"grad_norm": 0.08628014488530013,
"learning_rate": 5.3427092150045975e-06,
"loss": 0.3075,
"step": 608
},
{
"epoch": 4.2540234884732495,
"grad_norm": 0.09140056125384614,
"learning_rate": 5.24554439200621e-06,
"loss": 0.3094,
"step": 609
},
{
"epoch": 4.260983036102654,
"grad_norm": 0.09432054790072364,
"learning_rate": 5.149209204696073e-06,
"loss": 0.3129,
"step": 610
},
{
"epoch": 4.267942583732057,
"grad_norm": 0.08230546321550815,
"learning_rate": 5.05370595272495e-06,
"loss": 0.3129,
"step": 611
},
{
"epoch": 4.274902131361461,
"grad_norm": 0.08826471384141378,
"learning_rate": 4.959036915884134e-06,
"loss": 0.3176,
"step": 612
},
{
"epoch": 4.2818616789908654,
"grad_norm": 0.08473384022161361,
"learning_rate": 4.865204354051129e-06,
"loss": 0.3031,
"step": 613
},
{
"epoch": 4.28882122662027,
"grad_norm": 0.09157501033682776,
"learning_rate": 4.7722105071356065e-06,
"loss": 0.3083,
"step": 614
},
{
"epoch": 4.295780774249674,
"grad_norm": 0.08590767447738862,
"learning_rate": 4.68005759502602e-06,
"loss": 0.3089,
"step": 615
},
{
"epoch": 4.302740321879078,
"grad_norm": 0.0828024048843273,
"learning_rate": 4.588747817536563e-06,
"loss": 0.3157,
"step": 616
},
{
"epoch": 4.309699869508482,
"grad_norm": 0.08180623225652858,
"learning_rate": 4.498283354354654e-06,
"loss": 0.3049,
"step": 617
},
{
"epoch": 4.3166594171378865,
"grad_norm": 0.08341811378979368,
"learning_rate": 4.408666364988938e-06,
"loss": 0.3146,
"step": 618
},
{
"epoch": 4.32361896476729,
"grad_norm": 0.08072676966602048,
"learning_rate": 4.31989898871771e-06,
"loss": 0.3121,
"step": 619
},
{
"epoch": 4.330578512396694,
"grad_norm": 0.07811857676852015,
"learning_rate": 4.231983344537875e-06,
"loss": 0.3056,
"step": 620
},
{
"epoch": 4.337538060026098,
"grad_norm": 0.07963712695352229,
"learning_rate": 4.144921531114317e-06,
"loss": 0.3092,
"step": 621
},
{
"epoch": 4.344497607655502,
"grad_norm": 0.08106713806356201,
"learning_rate": 4.058715626729837e-06,
"loss": 0.3087,
"step": 622
},
{
"epoch": 4.351457155284907,
"grad_norm": 0.0825669290953596,
"learning_rate": 3.973367689235548e-06,
"loss": 0.3124,
"step": 623
},
{
"epoch": 4.358416702914311,
"grad_norm": 0.08346710243966363,
"learning_rate": 3.888879756001726e-06,
"loss": 0.3097,
"step": 624
},
{
"epoch": 4.365376250543715,
"grad_norm": 0.07952645915127651,
"learning_rate": 3.805253843869179e-06,
"loss": 0.3082,
"step": 625
},
{
"epoch": 4.372335798173118,
"grad_norm": 0.07932372481956639,
"learning_rate": 3.72249194910113e-06,
"loss": 0.3172,
"step": 626
},
{
"epoch": 4.3792953458025226,
"grad_norm": 0.07754167774238424,
"learning_rate": 3.6405960473355183e-06,
"loss": 0.3082,
"step": 627
},
{
"epoch": 4.386254893431927,
"grad_norm": 0.08136667422744384,
"learning_rate": 3.5595680935378972e-06,
"loss": 0.3098,
"step": 628
},
{
"epoch": 4.393214441061331,
"grad_norm": 0.08485744461035595,
"learning_rate": 3.4794100219546967e-06,
"loss": 0.3132,
"step": 629
},
{
"epoch": 4.400173988690735,
"grad_norm": 0.07736893733783809,
"learning_rate": 3.400123746067099e-06,
"loss": 0.3057,
"step": 630
},
{
"epoch": 4.407133536320139,
"grad_norm": 0.07691011304383329,
"learning_rate": 3.321711158545351e-06,
"loss": 0.3092,
"step": 631
},
{
"epoch": 4.414093083949544,
"grad_norm": 0.07876827881942446,
"learning_rate": 3.2441741312036014e-06,
"loss": 0.309,
"step": 632
},
{
"epoch": 4.421052631578947,
"grad_norm": 0.08089462204978613,
"learning_rate": 3.167514514955157e-06,
"loss": 0.3105,
"step": 633
},
{
"epoch": 4.428012179208351,
"grad_norm": 0.07721348081474763,
"learning_rate": 3.0917341397683633e-06,
"loss": 0.3071,
"step": 634
},
{
"epoch": 4.434971726837755,
"grad_norm": 0.07932348813099326,
"learning_rate": 3.0168348146228842e-06,
"loss": 0.3099,
"step": 635
},
{
"epoch": 4.4419312744671595,
"grad_norm": 0.08123974569538568,
"learning_rate": 2.942818327466559e-06,
"loss": 0.3102,
"step": 636
},
{
"epoch": 4.448890822096564,
"grad_norm": 0.0805284681969405,
"learning_rate": 2.8696864451726614e-06,
"loss": 0.3167,
"step": 637
},
{
"epoch": 4.455850369725968,
"grad_norm": 0.08047835245862313,
"learning_rate": 2.79744091349778e-06,
"loss": 0.3076,
"step": 638
},
{
"epoch": 4.462809917355372,
"grad_norm": 0.08129909351756673,
"learning_rate": 2.7260834570400986e-06,
"loss": 0.3124,
"step": 639
},
{
"epoch": 4.469769464984776,
"grad_norm": 0.07839675778245354,
"learning_rate": 2.6556157791982707e-06,
"loss": 0.3079,
"step": 640
},
{
"epoch": 4.4767290126141805,
"grad_norm": 0.08062781084505634,
"learning_rate": 2.586039562130722e-06,
"loss": 0.3047,
"step": 641
},
{
"epoch": 4.483688560243584,
"grad_norm": 0.0789139952040182,
"learning_rate": 2.5173564667155015e-06,
"loss": 0.3117,
"step": 642
},
{
"epoch": 4.490648107872988,
"grad_norm": 0.07782883382645976,
"learning_rate": 2.4495681325106535e-06,
"loss": 0.3086,
"step": 643
},
{
"epoch": 4.497607655502392,
"grad_norm": 0.07695405845578089,
"learning_rate": 2.3826761777150643e-06,
"loss": 0.3075,
"step": 644
},
{
"epoch": 4.5045672031317965,
"grad_norm": 0.0776536074585525,
"learning_rate": 2.3166821991298384e-06,
"loss": 0.3116,
"step": 645
},
{
"epoch": 4.511526750761201,
"grad_norm": 0.07992836130626074,
"learning_rate": 2.2515877721201697e-06,
"loss": 0.313,
"step": 646
},
{
"epoch": 4.518486298390605,
"grad_norm": 0.07576525240054546,
"learning_rate": 2.1873944505777447e-06,
"loss": 0.3097,
"step": 647
},
{
"epoch": 4.525445846020009,
"grad_norm": 0.08056446914044152,
"learning_rate": 2.124103766883661e-06,
"loss": 0.3093,
"step": 648
},
{
"epoch": 4.532405393649412,
"grad_norm": 0.07912041219537566,
"learning_rate": 2.0617172318718205e-06,
"loss": 0.3109,
"step": 649
},
{
"epoch": 4.539364941278817,
"grad_norm": 0.0748575568058156,
"learning_rate": 2.000236334792871e-06,
"loss": 0.306,
"step": 650
},
{
"epoch": 4.546324488908221,
"grad_norm": 0.07495506939333019,
"learning_rate": 1.9396625432786866e-06,
"loss": 0.308,
"step": 651
},
{
"epoch": 4.553284036537625,
"grad_norm": 0.07516098899508962,
"learning_rate": 1.879997303307297e-06,
"loss": 0.3132,
"step": 652
},
{
"epoch": 4.560243584167029,
"grad_norm": 0.0760900687853293,
"learning_rate": 1.8212420391683761e-06,
"loss": 0.312,
"step": 653
},
{
"epoch": 4.5672031317964334,
"grad_norm": 0.07405500093323297,
"learning_rate": 1.7633981534292565e-06,
"loss": 0.3101,
"step": 654
},
{
"epoch": 4.574162679425838,
"grad_norm": 0.07510600135021905,
"learning_rate": 1.7064670269014306e-06,
"loss": 0.3065,
"step": 655
},
{
"epoch": 4.581122227055241,
"grad_norm": 0.07279725597347841,
"learning_rate": 1.65045001860761e-06,
"loss": 0.3083,
"step": 656
},
{
"epoch": 4.588081774684645,
"grad_norm": 0.07713401251486195,
"learning_rate": 1.5953484657492734e-06,
"loss": 0.3129,
"step": 657
},
{
"epoch": 4.595041322314049,
"grad_norm": 0.07573166128441056,
"learning_rate": 1.5411636836747357e-06,
"loss": 0.3111,
"step": 658
},
{
"epoch": 4.602000869943454,
"grad_norm": 0.07751898750624993,
"learning_rate": 1.4878969658477505e-06,
"loss": 0.3151,
"step": 659
},
{
"epoch": 4.608960417572858,
"grad_norm": 0.07530355421456504,
"learning_rate": 1.435549583816669e-06,
"loss": 0.3081,
"step": 660
},
{
"epoch": 4.615919965202262,
"grad_norm": 0.07642926242619338,
"learning_rate": 1.3841227871840278e-06,
"loss": 0.3133,
"step": 661
},
{
"epoch": 4.622879512831666,
"grad_norm": 0.07705536264479143,
"learning_rate": 1.3336178035767612e-06,
"loss": 0.3094,
"step": 662
},
{
"epoch": 4.62983906046107,
"grad_norm": 0.07420116640392391,
"learning_rate": 1.2840358386168972e-06,
"loss": 0.3038,
"step": 663
},
{
"epoch": 4.636798608090475,
"grad_norm": 0.07614616846884953,
"learning_rate": 1.2353780758927347e-06,
"loss": 0.311,
"step": 664
},
{
"epoch": 4.643758155719878,
"grad_norm": 0.08312089631794453,
"learning_rate": 1.1876456769306554e-06,
"loss": 0.3124,
"step": 665
},
{
"epoch": 4.650717703349282,
"grad_norm": 0.07487938485965528,
"learning_rate": 1.1408397811673376e-06,
"loss": 0.3105,
"step": 666
},
{
"epoch": 4.657677250978686,
"grad_norm": 0.07301485090244404,
"learning_rate": 1.0949615059225871e-06,
"loss": 0.3039,
"step": 667
},
{
"epoch": 4.6646367986080906,
"grad_norm": 0.07618631980688859,
"learning_rate": 1.0500119463726467e-06,
"loss": 0.3147,
"step": 668
},
{
"epoch": 4.671596346237495,
"grad_norm": 0.07505804558510307,
"learning_rate": 1.0059921755240797e-06,
"loss": 0.3114,
"step": 669
},
{
"epoch": 4.678555893866899,
"grad_norm": 0.07677226952718405,
"learning_rate": 9.62903244188147e-07,
"loss": 0.312,
"step": 670
},
{
"epoch": 4.685515441496303,
"grad_norm": 0.07748382882066837,
"learning_rate": 9.207461809556872e-07,
"loss": 0.3115,
"step": 671
},
{
"epoch": 4.6924749891257065,
"grad_norm": 0.07605829792945587,
"learning_rate": 8.795219921726139e-07,
"loss": 0.3122,
"step": 672
},
{
"epoch": 4.699434536755111,
"grad_norm": 0.07387581425092563,
"learning_rate": 8.392316619158669e-07,
"loss": 0.3074,
"step": 673
},
{
"epoch": 4.706394084384515,
"grad_norm": 0.07420555319544671,
"learning_rate": 7.998761519699205e-07,
"loss": 0.3107,
"step": 674
},
{
"epoch": 4.713353632013919,
"grad_norm": 0.07466571827662617,
"learning_rate": 7.61456401803824e-07,
"loss": 0.314,
"step": 675
},
{
"epoch": 4.720313179643323,
"grad_norm": 0.0731954788870046,
"learning_rate": 7.239733285487882e-07,
"loss": 0.3053,
"step": 676
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.07374103750191568,
"learning_rate": 6.874278269762924e-07,
"loss": 0.3098,
"step": 677
},
{
"epoch": 4.734232274902132,
"grad_norm": 0.07181151311493585,
"learning_rate": 6.518207694766965e-07,
"loss": 0.3111,
"step": 678
},
{
"epoch": 4.741191822531535,
"grad_norm": 0.0731098529031706,
"learning_rate": 6.171530060384445e-07,
"loss": 0.3057,
"step": 679
},
{
"epoch": 4.748151370160939,
"grad_norm": 0.07345303292289396,
"learning_rate": 5.834253642277655e-07,
"loss": 0.3085,
"step": 680
},
{
"epoch": 4.7551109177903434,
"grad_norm": 0.07333048475918498,
"learning_rate": 5.506386491689197e-07,
"loss": 0.307,
"step": 681
},
{
"epoch": 4.762070465419748,
"grad_norm": 0.07552740911471753,
"learning_rate": 5.187936435249796e-07,
"loss": 0.3086,
"step": 682
},
{
"epoch": 4.769030013049152,
"grad_norm": 0.07273500457357926,
"learning_rate": 4.878911074791371e-07,
"loss": 0.3121,
"step": 683
},
{
"epoch": 4.775989560678556,
"grad_norm": 0.0738654801444934,
"learning_rate": 4.57931778716576e-07,
"loss": 0.3062,
"step": 684
},
{
"epoch": 4.78294910830796,
"grad_norm": 0.07194970207986003,
"learning_rate": 4.2891637240684234e-07,
"loss": 0.3119,
"step": 685
},
{
"epoch": 4.789908655937364,
"grad_norm": 0.0747422813518781,
"learning_rate": 4.0084558118678173e-07,
"loss": 0.3081,
"step": 686
},
{
"epoch": 4.796868203566768,
"grad_norm": 0.07338451912151427,
"learning_rate": 3.7372007514401063e-07,
"loss": 0.3114,
"step": 687
},
{
"epoch": 4.803827751196172,
"grad_norm": 0.07345199197033869,
"learning_rate": 3.4754050180090704e-07,
"loss": 0.3016,
"step": 688
},
{
"epoch": 4.810787298825576,
"grad_norm": 0.07227607987729133,
"learning_rate": 3.223074860991693e-07,
"loss": 0.3113,
"step": 689
},
{
"epoch": 4.81774684645498,
"grad_norm": 0.07278601000745541,
"learning_rate": 2.980216303848815e-07,
"loss": 0.3082,
"step": 690
},
{
"epoch": 4.824706394084385,
"grad_norm": 0.07611641795774764,
"learning_rate": 2.746835143941473e-07,
"loss": 0.3076,
"step": 691
},
{
"epoch": 4.831665941713789,
"grad_norm": 0.07190187291197621,
"learning_rate": 2.5229369523923853e-07,
"loss": 0.3079,
"step": 692
},
{
"epoch": 4.838625489343193,
"grad_norm": 0.0739457194295076,
"learning_rate": 2.3085270739531706e-07,
"loss": 0.3095,
"step": 693
},
{
"epoch": 4.845585036972597,
"grad_norm": 0.07353977111327974,
"learning_rate": 2.1036106268765398e-07,
"loss": 0.3087,
"step": 694
},
{
"epoch": 4.8525445846020006,
"grad_norm": 0.07274021557955242,
"learning_rate": 1.908192502794215e-07,
"loss": 0.3104,
"step": 695
},
{
"epoch": 4.859504132231405,
"grad_norm": 0.07265017310099392,
"learning_rate": 1.7222773666001336e-07,
"loss": 0.3113,
"step": 696
},
{
"epoch": 4.866463679860809,
"grad_norm": 0.0713230748017762,
"learning_rate": 1.545869656339072e-07,
"loss": 0.3086,
"step": 697
},
{
"epoch": 4.873423227490213,
"grad_norm": 0.07442042787119083,
"learning_rate": 1.3789735831009064e-07,
"loss": 0.3124,
"step": 698
},
{
"epoch": 4.880382775119617,
"grad_norm": 0.0737173934874928,
"learning_rate": 1.2215931309197626e-07,
"loss": 0.3092,
"step": 699
},
{
"epoch": 4.887342322749022,
"grad_norm": 0.07197265372516874,
"learning_rate": 1.0737320566790221e-07,
"loss": 0.311,
"step": 700
},
{
"epoch": 4.894301870378426,
"grad_norm": 0.07270890000366599,
"learning_rate": 9.35393890021885e-08,
"loss": 0.3135,
"step": 701
},
{
"epoch": 4.901261418007829,
"grad_norm": 0.07164493485234402,
"learning_rate": 8.065819332667702e-08,
"loss": 0.3063,
"step": 702
},
{
"epoch": 4.908220965637233,
"grad_norm": 0.07345553241814261,
"learning_rate": 6.872992613286223e-08,
"loss": 0.311,
"step": 703
},
{
"epoch": 4.9151805132666375,
"grad_norm": 0.0738265738394693,
"learning_rate": 5.775487216456377e-08,
"loss": 0.3123,
"step": 704
},
{
"epoch": 4.922140060896042,
"grad_norm": 0.07288034052473,
"learning_rate": 4.7733293411105216e-08,
"loss": 0.3155,
"step": 705
},
{
"epoch": 4.929099608525446,
"grad_norm": 0.0763072402074743,
"learning_rate": 3.8665429101070185e-08,
"loss": 0.3044,
"step": 706
},
{
"epoch": 4.93605915615485,
"grad_norm": 0.07435037595475792,
"learning_rate": 3.055149569660909e-08,
"loss": 0.3121,
"step": 707
},
{
"epoch": 4.943018703784254,
"grad_norm": 0.07118427621968816,
"learning_rate": 2.3391686888238894e-08,
"loss": 0.3123,
"step": 708
},
{
"epoch": 4.949978251413658,
"grad_norm": 0.07221686597134709,
"learning_rate": 1.7186173590251208e-08,
"loss": 0.3037,
"step": 709
},
{
"epoch": 4.956937799043062,
"grad_norm": 0.07211936196974679,
"learning_rate": 1.1935103936600023e-08,
"loss": 0.3109,
"step": 710
},
{
"epoch": 4.963897346672466,
"grad_norm": 0.07300879682236437,
"learning_rate": 7.63860327740229e-09,
"loss": 0.315,
"step": 711
},
{
"epoch": 4.97085689430187,
"grad_norm": 0.0746678008435265,
"learning_rate": 4.296774175918117e-09,
"loss": 0.3114,
"step": 712
},
{
"epoch": 4.9778164419312745,
"grad_norm": 0.07379213535477022,
"learning_rate": 1.909696406103834e-09,
"loss": 0.3132,
"step": 713
},
{
"epoch": 4.984775989560679,
"grad_norm": 0.072449627565001,
"learning_rate": 4.77426950733495e-10,
"loss": 0.3053,
"step": 714
},
{
"epoch": 4.991735537190083,
"grad_norm": 0.07321298642770968,
"learning_rate": 0.0,
"loss": 0.3097,
"step": 715
},
{
"epoch": 4.991735537190083,
"step": 715,
"total_flos": 1.839907874248065e+19,
"train_loss": 0.3850101018285418,
"train_runtime": 71630.1527,
"train_samples_per_second": 5.133,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 715,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.839907874248065e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}