diff --git "a/v1_1/trainer_state.json" "b/v1_1/trainer_state.json" new file mode 100644--- /dev/null +++ "b/v1_1/trainer_state.json" @@ -0,0 +1,23972 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 3000, + "global_step": 34068, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005870611717740989, + "grad_norm": 2.2037928104400635, + "learning_rate": 1e-05, + "loss": 0.0641, + "step": 10 + }, + { + "epoch": 0.0011741223435481978, + "grad_norm": 2.5393388271331787, + "learning_rate": 2e-05, + "loss": 0.0055, + "step": 20 + }, + { + "epoch": 0.0017611835153222965, + "grad_norm": 0.06004801765084267, + "learning_rate": 3e-05, + "loss": 0.0203, + "step": 30 + }, + { + "epoch": 0.0023482446870963956, + "grad_norm": 6.423142910003662, + "learning_rate": 4e-05, + "loss": 0.0313, + "step": 40 + }, + { + "epoch": 0.0029353058588704943, + "grad_norm": 1.324741005897522, + "learning_rate": 5e-05, + "loss": 0.0162, + "step": 50 + }, + { + "epoch": 0.003522367030644593, + "grad_norm": 3.5551211833953857, + "learning_rate": 6e-05, + "loss": 0.0106, + "step": 60 + }, + { + "epoch": 0.004109428202418692, + "grad_norm": 0.13222283124923706, + "learning_rate": 7e-05, + "loss": 0.0108, + "step": 70 + }, + { + "epoch": 0.004696489374192791, + "grad_norm": 1.9146218299865723, + "learning_rate": 8e-05, + "loss": 0.0172, + "step": 80 + }, + { + "epoch": 0.0052835505459668895, + "grad_norm": 1.515202522277832, + "learning_rate": 9e-05, + "loss": 0.024, + "step": 90 + }, + { + "epoch": 0.005870611717740989, + "grad_norm": 1.8395302295684814, + "learning_rate": 0.0001, + "loss": 0.0046, + "step": 100 + }, + { + "epoch": 0.006457672889515088, + "grad_norm": 0.4752308428287506, + "learning_rate": 9.999997861546707e-05, + "loss": 0.0133, + "step": 110 + }, + { + "epoch": 0.007044734061289186, + "grad_norm": 3.3688418865203857, + "learning_rate": 9.99999144618865e-05, + "loss": 0.0128, + "step": 120 + }, + { + "epoch": 0.007631795233063285, + "grad_norm": 1.3503026962280273, + "learning_rate": 9.999980753931321e-05, + "loss": 0.0141, + "step": 130 + }, + { + "epoch": 0.008218856404837384, + "grad_norm": 0.5728911757469177, + "learning_rate": 9.999965784783865e-05, + "loss": 0.0168, + "step": 140 + }, + { + "epoch": 0.008805917576611482, + "grad_norm": 0.20352545380592346, + "learning_rate": 9.999946538759087e-05, + "loss": 0.0085, + "step": 150 + }, + { + "epoch": 0.009392978748385582, + "grad_norm": 0.5463225841522217, + "learning_rate": 9.999923015873447e-05, + "loss": 0.0273, + "step": 160 + }, + { + "epoch": 0.00998003992015968, + "grad_norm": 2.2422802448272705, + "learning_rate": 9.999895216147068e-05, + "loss": 0.0206, + "step": 170 + }, + { + "epoch": 0.010567101091933779, + "grad_norm": 1.3659040927886963, + "learning_rate": 9.99986313960373e-05, + "loss": 0.024, + "step": 180 + }, + { + "epoch": 0.011154162263707879, + "grad_norm": 0.4480757713317871, + "learning_rate": 9.99982678627087e-05, + "loss": 0.0115, + "step": 190 + }, + { + "epoch": 0.011741223435481977, + "grad_norm": 0.2278975248336792, + "learning_rate": 9.999786156179584e-05, + "loss": 0.0277, + "step": 200 + }, + { + "epoch": 0.012328284607256075, + "grad_norm": 1.7811647653579712, + "learning_rate": 9.999741249364625e-05, + "loss": 0.0169, + "step": 210 + }, + { + "epoch": 0.012915345779030175, + "grad_norm": 0.5214270949363708, + "learning_rate": 9.999692065864407e-05, + "loss": 0.0129, + "step": 220 + }, + { + 
"epoch": 0.013502406950804274, + "grad_norm": 0.010121507570147514, + "learning_rate": 9.999638605721e-05, + "loss": 0.0047, + "step": 230 + }, + { + "epoch": 0.014089468122578372, + "grad_norm": 0.07541653513908386, + "learning_rate": 9.999580868980134e-05, + "loss": 0.0068, + "step": 240 + }, + { + "epoch": 0.014676529294352472, + "grad_norm": 2.229921817779541, + "learning_rate": 9.999518855691194e-05, + "loss": 0.0432, + "step": 250 + }, + { + "epoch": 0.01526359046612657, + "grad_norm": 1.3554790019989014, + "learning_rate": 9.999452565907225e-05, + "loss": 0.0287, + "step": 260 + }, + { + "epoch": 0.01585065163790067, + "grad_norm": 0.2221003770828247, + "learning_rate": 9.999381999684934e-05, + "loss": 0.0041, + "step": 270 + }, + { + "epoch": 0.01643771280967477, + "grad_norm": 0.7399819493293762, + "learning_rate": 9.999307157084676e-05, + "loss": 0.0187, + "step": 280 + }, + { + "epoch": 0.017024773981448867, + "grad_norm": 3.3960468769073486, + "learning_rate": 9.999228038170475e-05, + "loss": 0.0203, + "step": 290 + }, + { + "epoch": 0.017611835153222965, + "grad_norm": 1.684557318687439, + "learning_rate": 9.999144643010004e-05, + "loss": 0.0342, + "step": 300 + }, + { + "epoch": 0.018198896324997063, + "grad_norm": 0.7973483204841614, + "learning_rate": 9.999056971674601e-05, + "loss": 0.0176, + "step": 310 + }, + { + "epoch": 0.018785957496771165, + "grad_norm": 1.082037091255188, + "learning_rate": 9.998965024239256e-05, + "loss": 0.014, + "step": 320 + }, + { + "epoch": 0.019373018668545263, + "grad_norm": 0.7168899178504944, + "learning_rate": 9.99886880078262e-05, + "loss": 0.0193, + "step": 330 + }, + { + "epoch": 0.01996007984031936, + "grad_norm": 0.8778390884399414, + "learning_rate": 9.998768301387001e-05, + "loss": 0.0147, + "step": 340 + }, + { + "epoch": 0.02054714101209346, + "grad_norm": 0.03346811980009079, + "learning_rate": 9.998663526138365e-05, + "loss": 0.0255, + "step": 350 + }, + { + "epoch": 0.021134202183867558, + "grad_norm": 0.2908102571964264, + "learning_rate": 9.998554475126332e-05, + "loss": 0.0155, + "step": 360 + }, + { + "epoch": 0.02172126335564166, + "grad_norm": 0.8321800231933594, + "learning_rate": 9.998441148444184e-05, + "loss": 0.0202, + "step": 370 + }, + { + "epoch": 0.022308324527415758, + "grad_norm": 1.5566340684890747, + "learning_rate": 9.99832354618886e-05, + "loss": 0.0183, + "step": 380 + }, + { + "epoch": 0.022895385699189856, + "grad_norm": 0.016553422436118126, + "learning_rate": 9.998201668460952e-05, + "loss": 0.0262, + "step": 390 + }, + { + "epoch": 0.023482446870963954, + "grad_norm": 2.6109025478363037, + "learning_rate": 9.998075515364715e-05, + "loss": 0.0388, + "step": 400 + }, + { + "epoch": 0.024069508042738053, + "grad_norm": 3.345412254333496, + "learning_rate": 9.997945087008055e-05, + "loss": 0.0183, + "step": 410 + }, + { + "epoch": 0.02465656921451215, + "grad_norm": 0.02132261171936989, + "learning_rate": 9.99781038350254e-05, + "loss": 0.0198, + "step": 420 + }, + { + "epoch": 0.025243630386286253, + "grad_norm": 2.097240447998047, + "learning_rate": 9.997671404963391e-05, + "loss": 0.0133, + "step": 430 + }, + { + "epoch": 0.02583069155806035, + "grad_norm": 1.1681588888168335, + "learning_rate": 9.99752815150949e-05, + "loss": 0.0208, + "step": 440 + }, + { + "epoch": 0.02641775272983445, + "grad_norm": 0.15087229013442993, + "learning_rate": 9.99738062326337e-05, + "loss": 0.0122, + "step": 450 + }, + { + "epoch": 0.027004813901608547, + "grad_norm": 1.4533928632736206, + "learning_rate": 
9.997228820351228e-05, + "loss": 0.0259, + "step": 460 + }, + { + "epoch": 0.027591875073382646, + "grad_norm": 3.868213176727295, + "learning_rate": 9.997072742902912e-05, + "loss": 0.0163, + "step": 470 + }, + { + "epoch": 0.028178936245156744, + "grad_norm": 1.1523321866989136, + "learning_rate": 9.996912391051925e-05, + "loss": 0.0178, + "step": 480 + }, + { + "epoch": 0.028765997416930845, + "grad_norm": 1.964536190032959, + "learning_rate": 9.996747764935431e-05, + "loss": 0.0203, + "step": 490 + }, + { + "epoch": 0.029353058588704944, + "grad_norm": 1.6125319004058838, + "learning_rate": 9.996578864694249e-05, + "loss": 0.0199, + "step": 500 + }, + { + "epoch": 0.029940119760479042, + "grad_norm": 1.3507105112075806, + "learning_rate": 9.996405690472852e-05, + "loss": 0.0239, + "step": 510 + }, + { + "epoch": 0.03052718093225314, + "grad_norm": 3.7466275691986084, + "learning_rate": 9.996228242419372e-05, + "loss": 0.0211, + "step": 520 + }, + { + "epoch": 0.03111424210402724, + "grad_norm": 1.5872238874435425, + "learning_rate": 9.996046520685592e-05, + "loss": 0.0338, + "step": 530 + }, + { + "epoch": 0.03170130327580134, + "grad_norm": 1.883832335472107, + "learning_rate": 9.995860525426954e-05, + "loss": 0.0154, + "step": 540 + }, + { + "epoch": 0.032288364447575435, + "grad_norm": 1.1487303972244263, + "learning_rate": 9.995670256802554e-05, + "loss": 0.0248, + "step": 550 + }, + { + "epoch": 0.03287542561934954, + "grad_norm": 0.1820375919342041, + "learning_rate": 9.995475714975146e-05, + "loss": 0.0173, + "step": 560 + }, + { + "epoch": 0.03346248679112364, + "grad_norm": 0.43540775775909424, + "learning_rate": 9.995276900111139e-05, + "loss": 0.0198, + "step": 570 + }, + { + "epoch": 0.03404954796289773, + "grad_norm": 1.9775478839874268, + "learning_rate": 9.995073812380594e-05, + "loss": 0.0257, + "step": 580 + }, + { + "epoch": 0.034636609134671835, + "grad_norm": 2.404892683029175, + "learning_rate": 9.994866451957225e-05, + "loss": 0.0212, + "step": 590 + }, + { + "epoch": 0.03522367030644593, + "grad_norm": 1.327643871307373, + "learning_rate": 9.994654819018408e-05, + "loss": 0.019, + "step": 600 + }, + { + "epoch": 0.03581073147822003, + "grad_norm": 0.5496881604194641, + "learning_rate": 9.99443891374517e-05, + "loss": 0.0202, + "step": 610 + }, + { + "epoch": 0.036397792649994126, + "grad_norm": 1.8398231267929077, + "learning_rate": 9.99421873632219e-05, + "loss": 0.0307, + "step": 620 + }, + { + "epoch": 0.03698485382176823, + "grad_norm": 2.2247824668884277, + "learning_rate": 9.993994286937805e-05, + "loss": 0.0275, + "step": 630 + }, + { + "epoch": 0.03757191499354233, + "grad_norm": 2.914898157119751, + "learning_rate": 9.993765565784006e-05, + "loss": 0.0211, + "step": 640 + }, + { + "epoch": 0.038158976165316424, + "grad_norm": 2.512962579727173, + "learning_rate": 9.993532573056436e-05, + "loss": 0.016, + "step": 650 + }, + { + "epoch": 0.038746037337090526, + "grad_norm": 3.5847184658050537, + "learning_rate": 9.993295308954391e-05, + "loss": 0.0274, + "step": 660 + }, + { + "epoch": 0.03933309850886462, + "grad_norm": 1.5583494901657104, + "learning_rate": 9.993053773680823e-05, + "loss": 0.0237, + "step": 670 + }, + { + "epoch": 0.03992015968063872, + "grad_norm": 1.3348017930984497, + "learning_rate": 9.992807967442339e-05, + "loss": 0.0273, + "step": 680 + }, + { + "epoch": 0.040507220852412824, + "grad_norm": 0.9014214277267456, + "learning_rate": 9.992557890449195e-05, + "loss": 0.0182, + "step": 690 + }, + { + "epoch": 0.04109428202418692, + 
"grad_norm": 1.2551409006118774, + "learning_rate": 9.992303542915302e-05, + "loss": 0.0463, + "step": 700 + }, + { + "epoch": 0.04168134319596102, + "grad_norm": 1.183083415031433, + "learning_rate": 9.992044925058224e-05, + "loss": 0.0356, + "step": 710 + }, + { + "epoch": 0.042268404367735116, + "grad_norm": 1.657358169555664, + "learning_rate": 9.99178203709918e-05, + "loss": 0.0368, + "step": 720 + }, + { + "epoch": 0.04285546553950922, + "grad_norm": 2.1028082370758057, + "learning_rate": 9.991514879263038e-05, + "loss": 0.0266, + "step": 730 + }, + { + "epoch": 0.04344252671128332, + "grad_norm": 3.882103443145752, + "learning_rate": 9.991243451778318e-05, + "loss": 0.0392, + "step": 740 + }, + { + "epoch": 0.044029587883057414, + "grad_norm": 11.489930152893066, + "learning_rate": 9.990967754877197e-05, + "loss": 0.0376, + "step": 750 + }, + { + "epoch": 0.044616649054831516, + "grad_norm": 1.9137790203094482, + "learning_rate": 9.9906877887955e-05, + "loss": 0.0156, + "step": 760 + }, + { + "epoch": 0.04520371022660561, + "grad_norm": 2.4942731857299805, + "learning_rate": 9.990403553772704e-05, + "loss": 0.034, + "step": 770 + }, + { + "epoch": 0.04579077139837971, + "grad_norm": 2.418353796005249, + "learning_rate": 9.990115050051939e-05, + "loss": 0.0255, + "step": 780 + }, + { + "epoch": 0.04637783257015381, + "grad_norm": 2.717972993850708, + "learning_rate": 9.989822277879985e-05, + "loss": 0.0201, + "step": 790 + }, + { + "epoch": 0.04696489374192791, + "grad_norm": 0.6746355891227722, + "learning_rate": 9.989525237507276e-05, + "loss": 0.0336, + "step": 800 + }, + { + "epoch": 0.04755195491370201, + "grad_norm": 0.4549112021923065, + "learning_rate": 9.989223929187893e-05, + "loss": 0.0493, + "step": 810 + }, + { + "epoch": 0.048139016085476105, + "grad_norm": 4.386491298675537, + "learning_rate": 9.988918353179568e-05, + "loss": 0.0359, + "step": 820 + }, + { + "epoch": 0.04872607725725021, + "grad_norm": 2.7425849437713623, + "learning_rate": 9.988608509743688e-05, + "loss": 0.0363, + "step": 830 + }, + { + "epoch": 0.0493131384290243, + "grad_norm": 1.0640265941619873, + "learning_rate": 9.988294399145285e-05, + "loss": 0.0326, + "step": 840 + }, + { + "epoch": 0.0499001996007984, + "grad_norm": 1.4166111946105957, + "learning_rate": 9.987976021653046e-05, + "loss": 0.0211, + "step": 850 + }, + { + "epoch": 0.050487260772572505, + "grad_norm": 2.1535682678222656, + "learning_rate": 9.987653377539303e-05, + "loss": 0.0284, + "step": 860 + }, + { + "epoch": 0.0510743219443466, + "grad_norm": 2.582031726837158, + "learning_rate": 9.987326467080041e-05, + "loss": 0.0428, + "step": 870 + }, + { + "epoch": 0.0516613831161207, + "grad_norm": 4.288167953491211, + "learning_rate": 9.98699529055489e-05, + "loss": 0.0263, + "step": 880 + }, + { + "epoch": 0.052248444287894796, + "grad_norm": 0.8033009171485901, + "learning_rate": 9.986659848247135e-05, + "loss": 0.0371, + "step": 890 + }, + { + "epoch": 0.0528355054596689, + "grad_norm": 1.2396597862243652, + "learning_rate": 9.986320140443708e-05, + "loss": 0.023, + "step": 900 + }, + { + "epoch": 0.053422566631443, + "grad_norm": 2.016923189163208, + "learning_rate": 9.985976167435187e-05, + "loss": 0.0465, + "step": 910 + }, + { + "epoch": 0.054009627803217095, + "grad_norm": 2.335644006729126, + "learning_rate": 9.9856279295158e-05, + "loss": 0.0285, + "step": 920 + }, + { + "epoch": 0.054596688974991196, + "grad_norm": 2.3769099712371826, + "learning_rate": 9.985275426983425e-05, + "loss": 0.0367, + "step": 930 + }, + { + 
"epoch": 0.05518375014676529, + "grad_norm": 0.9392242431640625, + "learning_rate": 9.984918660139583e-05, + "loss": 0.0346, + "step": 940 + }, + { + "epoch": 0.05577081131853939, + "grad_norm": 0.7598341107368469, + "learning_rate": 9.984557629289449e-05, + "loss": 0.0449, + "step": 950 + }, + { + "epoch": 0.05635787249031349, + "grad_norm": 0.39787986874580383, + "learning_rate": 9.984192334741839e-05, + "loss": 0.0218, + "step": 960 + }, + { + "epoch": 0.05694493366208759, + "grad_norm": 0.6477892994880676, + "learning_rate": 9.98382277680922e-05, + "loss": 0.0338, + "step": 970 + }, + { + "epoch": 0.05753199483386169, + "grad_norm": 2.9608235359191895, + "learning_rate": 9.983448955807708e-05, + "loss": 0.0146, + "step": 980 + }, + { + "epoch": 0.058119056005635786, + "grad_norm": 0.14413294196128845, + "learning_rate": 9.983070872057059e-05, + "loss": 0.0476, + "step": 990 + }, + { + "epoch": 0.05870611717740989, + "grad_norm": 3.8315868377685547, + "learning_rate": 9.982688525880679e-05, + "loss": 0.026, + "step": 1000 + }, + { + "epoch": 0.05929317834918398, + "grad_norm": 1.5634944438934326, + "learning_rate": 9.98230191760562e-05, + "loss": 0.0377, + "step": 1010 + }, + { + "epoch": 0.059880239520958084, + "grad_norm": 2.4407360553741455, + "learning_rate": 9.981911047562583e-05, + "loss": 0.0376, + "step": 1020 + }, + { + "epoch": 0.060467300692732186, + "grad_norm": 3.7755749225616455, + "learning_rate": 9.981515916085906e-05, + "loss": 0.0443, + "step": 1030 + }, + { + "epoch": 0.06105436186450628, + "grad_norm": 3.4814929962158203, + "learning_rate": 9.981116523513579e-05, + "loss": 0.032, + "step": 1040 + }, + { + "epoch": 0.06164142303628038, + "grad_norm": 2.6388165950775146, + "learning_rate": 9.980712870187236e-05, + "loss": 0.0653, + "step": 1050 + }, + { + "epoch": 0.06222848420805448, + "grad_norm": 0.9947832226753235, + "learning_rate": 9.980304956452153e-05, + "loss": 0.0223, + "step": 1060 + }, + { + "epoch": 0.06281554537982857, + "grad_norm": 2.903304100036621, + "learning_rate": 9.979892782657253e-05, + "loss": 0.0509, + "step": 1070 + }, + { + "epoch": 0.06340260655160268, + "grad_norm": 2.6675798892974854, + "learning_rate": 9.9794763491551e-05, + "loss": 0.0424, + "step": 1080 + }, + { + "epoch": 0.06398966772337678, + "grad_norm": 2.005167007446289, + "learning_rate": 9.979055656301905e-05, + "loss": 0.0258, + "step": 1090 + }, + { + "epoch": 0.06457672889515087, + "grad_norm": 1.8828363418579102, + "learning_rate": 9.978630704457521e-05, + "loss": 0.0478, + "step": 1100 + }, + { + "epoch": 0.06516379006692498, + "grad_norm": 0.378522664308548, + "learning_rate": 9.978201493985444e-05, + "loss": 0.0209, + "step": 1110 + }, + { + "epoch": 0.06575085123869907, + "grad_norm": 0.7414391040802002, + "learning_rate": 9.97776802525281e-05, + "loss": 0.0409, + "step": 1120 + }, + { + "epoch": 0.06633791241047317, + "grad_norm": 1.4679638147354126, + "learning_rate": 9.977330298630402e-05, + "loss": 0.0295, + "step": 1130 + }, + { + "epoch": 0.06692497358224728, + "grad_norm": 0.9425148963928223, + "learning_rate": 9.976888314492644e-05, + "loss": 0.0472, + "step": 1140 + }, + { + "epoch": 0.06751203475402137, + "grad_norm": 0.8603097200393677, + "learning_rate": 9.9764420732176e-05, + "loss": 0.0381, + "step": 1150 + }, + { + "epoch": 0.06809909592579547, + "grad_norm": 1.1279311180114746, + "learning_rate": 9.975991575186977e-05, + "loss": 0.0532, + "step": 1160 + }, + { + "epoch": 0.06868615709756956, + "grad_norm": 1.5424326658248901, + "learning_rate": 
9.97553682078612e-05, + "loss": 0.0275, + "step": 1170 + }, + { + "epoch": 0.06927321826934367, + "grad_norm": 1.0373824834823608, + "learning_rate": 9.975077810404021e-05, + "loss": 0.0511, + "step": 1180 + }, + { + "epoch": 0.06986027944111776, + "grad_norm": 0.08669783174991608, + "learning_rate": 9.974614544433307e-05, + "loss": 0.0148, + "step": 1190 + }, + { + "epoch": 0.07044734061289186, + "grad_norm": 2.591121196746826, + "learning_rate": 9.974147023270249e-05, + "loss": 0.0343, + "step": 1200 + }, + { + "epoch": 0.07103440178466597, + "grad_norm": 2.7418529987335205, + "learning_rate": 9.973675247314753e-05, + "loss": 0.0391, + "step": 1210 + }, + { + "epoch": 0.07162146295644006, + "grad_norm": 1.4994189739227295, + "learning_rate": 9.973199216970368e-05, + "loss": 0.0432, + "step": 1220 + }, + { + "epoch": 0.07220852412821416, + "grad_norm": 3.171295166015625, + "learning_rate": 9.972718932644283e-05, + "loss": 0.0228, + "step": 1230 + }, + { + "epoch": 0.07279558529998825, + "grad_norm": 1.6535285711288452, + "learning_rate": 9.972234394747324e-05, + "loss": 0.0222, + "step": 1240 + }, + { + "epoch": 0.07338264647176236, + "grad_norm": 1.2440599203109741, + "learning_rate": 9.971745603693956e-05, + "loss": 0.0239, + "step": 1250 + }, + { + "epoch": 0.07396970764353646, + "grad_norm": 0.9509127140045166, + "learning_rate": 9.971252559902277e-05, + "loss": 0.0236, + "step": 1260 + }, + { + "epoch": 0.07455676881531055, + "grad_norm": 2.0213236808776855, + "learning_rate": 9.970755263794035e-05, + "loss": 0.0441, + "step": 1270 + }, + { + "epoch": 0.07514382998708466, + "grad_norm": 2.064988851547241, + "learning_rate": 9.970253715794603e-05, + "loss": 0.0308, + "step": 1280 + }, + { + "epoch": 0.07573089115885875, + "grad_norm": 3.5013694763183594, + "learning_rate": 9.969747916332996e-05, + "loss": 0.0298, + "step": 1290 + }, + { + "epoch": 0.07631795233063285, + "grad_norm": 1.9559962749481201, + "learning_rate": 9.969237865841867e-05, + "loss": 0.0224, + "step": 1300 + }, + { + "epoch": 0.07690501350240696, + "grad_norm": 0.7560620307922363, + "learning_rate": 9.968723564757503e-05, + "loss": 0.0356, + "step": 1310 + }, + { + "epoch": 0.07749207467418105, + "grad_norm": 5.014816761016846, + "learning_rate": 9.968205013519826e-05, + "loss": 0.0483, + "step": 1320 + }, + { + "epoch": 0.07807913584595515, + "grad_norm": 3.6365370750427246, + "learning_rate": 9.967682212572398e-05, + "loss": 0.0227, + "step": 1330 + }, + { + "epoch": 0.07866619701772924, + "grad_norm": 1.2204110622406006, + "learning_rate": 9.967155162362413e-05, + "loss": 0.0604, + "step": 1340 + }, + { + "epoch": 0.07925325818950335, + "grad_norm": 4.171288967132568, + "learning_rate": 9.966623863340696e-05, + "loss": 0.0326, + "step": 1350 + }, + { + "epoch": 0.07984031936127745, + "grad_norm": 1.4878485202789307, + "learning_rate": 9.966088315961715e-05, + "loss": 0.0336, + "step": 1360 + }, + { + "epoch": 0.08042738053305154, + "grad_norm": 2.7584917545318604, + "learning_rate": 9.965548520683563e-05, + "loss": 0.0267, + "step": 1370 + }, + { + "epoch": 0.08101444170482565, + "grad_norm": 0.9514058828353882, + "learning_rate": 9.965004477967974e-05, + "loss": 0.0412, + "step": 1380 + }, + { + "epoch": 0.08160150287659974, + "grad_norm": 3.680694103240967, + "learning_rate": 9.964456188280311e-05, + "loss": 0.0243, + "step": 1390 + }, + { + "epoch": 0.08218856404837384, + "grad_norm": 0.7756885886192322, + "learning_rate": 9.96390365208957e-05, + "loss": 0.0221, + "step": 1400 + }, + { + "epoch": 
0.08277562522014793, + "grad_norm": 1.2462010383605957, + "learning_rate": 9.96334686986838e-05, + "loss": 0.0475, + "step": 1410 + }, + { + "epoch": 0.08336268639192204, + "grad_norm": 1.963977336883545, + "learning_rate": 9.962785842093003e-05, + "loss": 0.0259, + "step": 1420 + }, + { + "epoch": 0.08394974756369614, + "grad_norm": 5.26522159576416, + "learning_rate": 9.962220569243332e-05, + "loss": 0.0441, + "step": 1430 + }, + { + "epoch": 0.08453680873547023, + "grad_norm": 2.0117955207824707, + "learning_rate": 9.961651051802891e-05, + "loss": 0.0552, + "step": 1440 + }, + { + "epoch": 0.08512386990724434, + "grad_norm": 4.85704231262207, + "learning_rate": 9.961077290258833e-05, + "loss": 0.0495, + "step": 1450 + }, + { + "epoch": 0.08571093107901843, + "grad_norm": 2.6156978607177734, + "learning_rate": 9.960499285101945e-05, + "loss": 0.0156, + "step": 1460 + }, + { + "epoch": 0.08629799225079253, + "grad_norm": 0.8362844586372375, + "learning_rate": 9.95991703682664e-05, + "loss": 0.0355, + "step": 1470 + }, + { + "epoch": 0.08688505342256664, + "grad_norm": 1.7795151472091675, + "learning_rate": 9.959330545930963e-05, + "loss": 0.0218, + "step": 1480 + }, + { + "epoch": 0.08747211459434073, + "grad_norm": 1.2182948589324951, + "learning_rate": 9.958739812916586e-05, + "loss": 0.0325, + "step": 1490 + }, + { + "epoch": 0.08805917576611483, + "grad_norm": 2.649080991744995, + "learning_rate": 9.958144838288814e-05, + "loss": 0.0214, + "step": 1500 + }, + { + "epoch": 0.08864623693788892, + "grad_norm": 2.225630521774292, + "learning_rate": 9.957545622556574e-05, + "loss": 0.0234, + "step": 1510 + }, + { + "epoch": 0.08923329810966303, + "grad_norm": 1.3121471405029297, + "learning_rate": 9.956942166232427e-05, + "loss": 0.0368, + "step": 1520 + }, + { + "epoch": 0.08982035928143713, + "grad_norm": 6.808068752288818, + "learning_rate": 9.956334469832556e-05, + "loss": 0.0569, + "step": 1530 + }, + { + "epoch": 0.09040742045321122, + "grad_norm": 2.119955062866211, + "learning_rate": 9.955722533876773e-05, + "loss": 0.0414, + "step": 1540 + }, + { + "epoch": 0.09099448162498533, + "grad_norm": 1.2916687726974487, + "learning_rate": 9.955106358888517e-05, + "loss": 0.031, + "step": 1550 + }, + { + "epoch": 0.09158154279675942, + "grad_norm": 2.2134711742401123, + "learning_rate": 9.954485945394856e-05, + "loss": 0.0469, + "step": 1560 + }, + { + "epoch": 0.09216860396853352, + "grad_norm": 1.7012152671813965, + "learning_rate": 9.953861293926474e-05, + "loss": 0.0271, + "step": 1570 + }, + { + "epoch": 0.09275566514030761, + "grad_norm": 2.5345096588134766, + "learning_rate": 9.95323240501769e-05, + "loss": 0.0338, + "step": 1580 + }, + { + "epoch": 0.09334272631208172, + "grad_norm": 1.7120459079742432, + "learning_rate": 9.952599279206444e-05, + "loss": 0.0317, + "step": 1590 + }, + { + "epoch": 0.09392978748385582, + "grad_norm": 0.7913612127304077, + "learning_rate": 9.951961917034299e-05, + "loss": 0.0419, + "step": 1600 + }, + { + "epoch": 0.09451684865562991, + "grad_norm": 1.9415456056594849, + "learning_rate": 9.951320319046442e-05, + "loss": 0.036, + "step": 1610 + }, + { + "epoch": 0.09510390982740402, + "grad_norm": 2.7447025775909424, + "learning_rate": 9.950674485791685e-05, + "loss": 0.0334, + "step": 1620 + }, + { + "epoch": 0.09569097099917812, + "grad_norm": 1.6320359706878662, + "learning_rate": 9.950024417822462e-05, + "loss": 0.0343, + "step": 1630 + }, + { + "epoch": 0.09627803217095221, + "grad_norm": 2.6058413982391357, + "learning_rate": 
9.949370115694827e-05, + "loss": 0.052, + "step": 1640 + }, + { + "epoch": 0.09686509334272632, + "grad_norm": 1.936697244644165, + "learning_rate": 9.94871157996846e-05, + "loss": 0.0201, + "step": 1650 + }, + { + "epoch": 0.09745215451450041, + "grad_norm": 0.7421324253082275, + "learning_rate": 9.948048811206658e-05, + "loss": 0.0302, + "step": 1660 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.3068259954452515, + "learning_rate": 9.947381809976344e-05, + "loss": 0.0323, + "step": 1670 + }, + { + "epoch": 0.0986262768580486, + "grad_norm": 1.5505841970443726, + "learning_rate": 9.946710576848058e-05, + "loss": 0.027, + "step": 1680 + }, + { + "epoch": 0.09921333802982271, + "grad_norm": 2.2337491512298584, + "learning_rate": 9.946035112395958e-05, + "loss": 0.0259, + "step": 1690 + }, + { + "epoch": 0.0998003992015968, + "grad_norm": 1.8403325080871582, + "learning_rate": 9.945355417197824e-05, + "loss": 0.0245, + "step": 1700 + }, + { + "epoch": 0.1003874603733709, + "grad_norm": 3.5541958808898926, + "learning_rate": 9.944671491835056e-05, + "loss": 0.0268, + "step": 1710 + }, + { + "epoch": 0.10097452154514501, + "grad_norm": 2.1355576515197754, + "learning_rate": 9.943983336892669e-05, + "loss": 0.0351, + "step": 1720 + }, + { + "epoch": 0.1015615827169191, + "grad_norm": 1.5419174432754517, + "learning_rate": 9.9432909529593e-05, + "loss": 0.0297, + "step": 1730 + }, + { + "epoch": 0.1021486438886932, + "grad_norm": 1.242248773574829, + "learning_rate": 9.9425943406272e-05, + "loss": 0.0346, + "step": 1740 + }, + { + "epoch": 0.1027357050604673, + "grad_norm": 1.4329538345336914, + "learning_rate": 9.941893500492241e-05, + "loss": 0.0516, + "step": 1750 + }, + { + "epoch": 0.1033227662322414, + "grad_norm": 1.0378223657608032, + "learning_rate": 9.941188433153904e-05, + "loss": 0.0181, + "step": 1760 + }, + { + "epoch": 0.1039098274040155, + "grad_norm": 2.057999849319458, + "learning_rate": 9.940479139215293e-05, + "loss": 0.0407, + "step": 1770 + }, + { + "epoch": 0.10449688857578959, + "grad_norm": 1.731401801109314, + "learning_rate": 9.939765619283124e-05, + "loss": 0.0575, + "step": 1780 + }, + { + "epoch": 0.1050839497475637, + "grad_norm": 2.289888858795166, + "learning_rate": 9.93904787396773e-05, + "loss": 0.0222, + "step": 1790 + }, + { + "epoch": 0.1056710109193378, + "grad_norm": 3.2877893447875977, + "learning_rate": 9.938325903883055e-05, + "loss": 0.1163, + "step": 1800 + }, + { + "epoch": 0.10625807209111189, + "grad_norm": 0.9616751074790955, + "learning_rate": 9.937599709646661e-05, + "loss": 0.0686, + "step": 1810 + }, + { + "epoch": 0.106845133262886, + "grad_norm": 0.8145966529846191, + "learning_rate": 9.936869291879718e-05, + "loss": 0.0317, + "step": 1820 + }, + { + "epoch": 0.1074321944346601, + "grad_norm": 1.424609661102295, + "learning_rate": 9.936134651207015e-05, + "loss": 0.0287, + "step": 1830 + }, + { + "epoch": 0.10801925560643419, + "grad_norm": 0.43495574593544006, + "learning_rate": 9.935395788256947e-05, + "loss": 0.0215, + "step": 1840 + }, + { + "epoch": 0.10860631677820828, + "grad_norm": 2.1051337718963623, + "learning_rate": 9.934652703661527e-05, + "loss": 0.0321, + "step": 1850 + }, + { + "epoch": 0.10919337794998239, + "grad_norm": 1.501328706741333, + "learning_rate": 9.933905398056372e-05, + "loss": 0.0225, + "step": 1860 + }, + { + "epoch": 0.10978043912175649, + "grad_norm": 2.1189637184143066, + "learning_rate": 9.933153872080714e-05, + "loss": 0.0581, + "step": 1870 + }, + { + "epoch": 0.11036750029353058, + 
"grad_norm": 2.349695920944214, + "learning_rate": 9.932398126377396e-05, + "loss": 0.0331, + "step": 1880 + }, + { + "epoch": 0.11095456146530469, + "grad_norm": 2.785264015197754, + "learning_rate": 9.931638161592867e-05, + "loss": 0.0227, + "step": 1890 + }, + { + "epoch": 0.11154162263707879, + "grad_norm": 1.9872111082077026, + "learning_rate": 9.930873978377187e-05, + "loss": 0.0247, + "step": 1900 + }, + { + "epoch": 0.11212868380885288, + "grad_norm": 2.1452584266662598, + "learning_rate": 9.930105577384026e-05, + "loss": 0.0475, + "step": 1910 + }, + { + "epoch": 0.11271574498062698, + "grad_norm": 3.118792772293091, + "learning_rate": 9.929332959270659e-05, + "loss": 0.0546, + "step": 1920 + }, + { + "epoch": 0.11330280615240108, + "grad_norm": 1.63677978515625, + "learning_rate": 9.928556124697967e-05, + "loss": 0.042, + "step": 1930 + }, + { + "epoch": 0.11388986732417518, + "grad_norm": 4.233394622802734, + "learning_rate": 9.927775074330441e-05, + "loss": 0.0424, + "step": 1940 + }, + { + "epoch": 0.11447692849594927, + "grad_norm": 1.0141804218292236, + "learning_rate": 9.926989808836178e-05, + "loss": 0.0529, + "step": 1950 + }, + { + "epoch": 0.11506398966772338, + "grad_norm": 1.6904869079589844, + "learning_rate": 9.926200328886878e-05, + "loss": 0.0296, + "step": 1960 + }, + { + "epoch": 0.11565105083949748, + "grad_norm": 3.563488721847534, + "learning_rate": 9.92540663515785e-05, + "loss": 0.0413, + "step": 1970 + }, + { + "epoch": 0.11623811201127157, + "grad_norm": 2.9499549865722656, + "learning_rate": 9.924608728328001e-05, + "loss": 0.0411, + "step": 1980 + }, + { + "epoch": 0.11682517318304568, + "grad_norm": 0.8236364722251892, + "learning_rate": 9.923806609079847e-05, + "loss": 0.039, + "step": 1990 + }, + { + "epoch": 0.11741223435481977, + "grad_norm": 4.066677570343018, + "learning_rate": 9.923000278099508e-05, + "loss": 0.0514, + "step": 2000 + }, + { + "epoch": 0.11799929552659387, + "grad_norm": 5.0400390625, + "learning_rate": 9.922189736076701e-05, + "loss": 0.057, + "step": 2010 + }, + { + "epoch": 0.11858635669836796, + "grad_norm": 2.4912304878234863, + "learning_rate": 9.921374983704752e-05, + "loss": 0.0258, + "step": 2020 + }, + { + "epoch": 0.11917341787014207, + "grad_norm": 2.3315348625183105, + "learning_rate": 9.92055602168058e-05, + "loss": 0.0773, + "step": 2030 + }, + { + "epoch": 0.11976047904191617, + "grad_norm": 2.5812909603118896, + "learning_rate": 9.919732850704716e-05, + "loss": 0.0425, + "step": 2040 + }, + { + "epoch": 0.12034754021369026, + "grad_norm": 1.1976637840270996, + "learning_rate": 9.918905471481281e-05, + "loss": 0.0487, + "step": 2050 + }, + { + "epoch": 0.12093460138546437, + "grad_norm": 1.7439115047454834, + "learning_rate": 9.918073884718e-05, + "loss": 0.0425, + "step": 2060 + }, + { + "epoch": 0.12152166255723847, + "grad_norm": 2.745136022567749, + "learning_rate": 9.917238091126198e-05, + "loss": 0.0374, + "step": 2070 + }, + { + "epoch": 0.12210872372901256, + "grad_norm": 4.099924564361572, + "learning_rate": 9.916398091420797e-05, + "loss": 0.0504, + "step": 2080 + }, + { + "epoch": 0.12269578490078666, + "grad_norm": 2.8515045642852783, + "learning_rate": 9.915553886320317e-05, + "loss": 0.0246, + "step": 2090 + }, + { + "epoch": 0.12328284607256076, + "grad_norm": 1.8732186555862427, + "learning_rate": 9.914705476546875e-05, + "loss": 0.0653, + "step": 2100 + }, + { + "epoch": 0.12386990724433486, + "grad_norm": 2.660759925842285, + "learning_rate": 9.913852862826185e-05, + "loss": 0.0244, + "step": 
2110 + }, + { + "epoch": 0.12445696841610895, + "grad_norm": 2.177604913711548, + "learning_rate": 9.912996045887556e-05, + "loss": 0.0488, + "step": 2120 + }, + { + "epoch": 0.12504402958788305, + "grad_norm": 1.5271365642547607, + "learning_rate": 9.912135026463895e-05, + "loss": 0.0495, + "step": 2130 + }, + { + "epoch": 0.12563109075965714, + "grad_norm": 0.54610276222229, + "learning_rate": 9.911269805291699e-05, + "loss": 0.0471, + "step": 2140 + }, + { + "epoch": 0.12621815193143127, + "grad_norm": 0.24762262403964996, + "learning_rate": 9.910400383111067e-05, + "loss": 0.029, + "step": 2150 + }, + { + "epoch": 0.12680521310320536, + "grad_norm": 1.192949891090393, + "learning_rate": 9.909526760665682e-05, + "loss": 0.0397, + "step": 2160 + }, + { + "epoch": 0.12739227427497946, + "grad_norm": 1.758433222770691, + "learning_rate": 9.908648938702825e-05, + "loss": 0.0719, + "step": 2170 + }, + { + "epoch": 0.12797933544675355, + "grad_norm": 1.0003618001937866, + "learning_rate": 9.90776691797337e-05, + "loss": 0.0394, + "step": 2180 + }, + { + "epoch": 0.12856639661852765, + "grad_norm": 0.9429585337638855, + "learning_rate": 9.90688069923178e-05, + "loss": 0.0471, + "step": 2190 + }, + { + "epoch": 0.12915345779030174, + "grad_norm": 1.6162910461425781, + "learning_rate": 9.90599028323611e-05, + "loss": 0.0622, + "step": 2200 + }, + { + "epoch": 0.12974051896207583, + "grad_norm": 2.433619499206543, + "learning_rate": 9.905095670748005e-05, + "loss": 0.047, + "step": 2210 + }, + { + "epoch": 0.13032758013384996, + "grad_norm": 2.3690948486328125, + "learning_rate": 9.904196862532702e-05, + "loss": 0.0322, + "step": 2220 + }, + { + "epoch": 0.13091464130562405, + "grad_norm": 3.2311367988586426, + "learning_rate": 9.903293859359023e-05, + "loss": 0.051, + "step": 2230 + }, + { + "epoch": 0.13150170247739815, + "grad_norm": 2.1195015907287598, + "learning_rate": 9.902386661999379e-05, + "loss": 0.034, + "step": 2240 + }, + { + "epoch": 0.13208876364917224, + "grad_norm": 1.6307891607284546, + "learning_rate": 9.901475271229772e-05, + "loss": 0.0544, + "step": 2250 + }, + { + "epoch": 0.13267582482094634, + "grad_norm": 1.770676612854004, + "learning_rate": 9.900559687829786e-05, + "loss": 0.0434, + "step": 2260 + }, + { + "epoch": 0.13326288599272043, + "grad_norm": 2.920513391494751, + "learning_rate": 9.899639912582596e-05, + "loss": 0.0292, + "step": 2270 + }, + { + "epoch": 0.13384994716449455, + "grad_norm": 1.8063915967941284, + "learning_rate": 9.89871594627496e-05, + "loss": 0.0362, + "step": 2280 + }, + { + "epoch": 0.13443700833626865, + "grad_norm": 3.2409045696258545, + "learning_rate": 9.897787789697221e-05, + "loss": 0.0651, + "step": 2290 + }, + { + "epoch": 0.13502406950804274, + "grad_norm": 0.9920338988304138, + "learning_rate": 9.896855443643308e-05, + "loss": 0.0244, + "step": 2300 + }, + { + "epoch": 0.13561113067981684, + "grad_norm": 2.6896822452545166, + "learning_rate": 9.895918908910731e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13619819185159093, + "grad_norm": 3.40159273147583, + "learning_rate": 9.894978186300585e-05, + "loss": 0.0359, + "step": 2320 + }, + { + "epoch": 0.13678525302336503, + "grad_norm": 2.5290367603302, + "learning_rate": 9.894033276617547e-05, + "loss": 0.0472, + "step": 2330 + }, + { + "epoch": 0.13737231419513912, + "grad_norm": 1.901503562927246, + "learning_rate": 9.893084180669873e-05, + "loss": 0.0437, + "step": 2340 + }, + { + "epoch": 0.13795937536691324, + "grad_norm": 1.9427504539489746, + "learning_rate": 
9.892130899269405e-05, + "loss": 0.0403, + "step": 2350 + }, + { + "epoch": 0.13854643653868734, + "grad_norm": 0.3844375014305115, + "learning_rate": 9.891173433231559e-05, + "loss": 0.0285, + "step": 2360 + }, + { + "epoch": 0.13913349771046143, + "grad_norm": 2.4379961490631104, + "learning_rate": 9.890211783375338e-05, + "loss": 0.0438, + "step": 2370 + }, + { + "epoch": 0.13972055888223553, + "grad_norm": 1.542114019393921, + "learning_rate": 9.889245950523315e-05, + "loss": 0.0357, + "step": 2380 + }, + { + "epoch": 0.14030762005400962, + "grad_norm": 3.4859018325805664, + "learning_rate": 9.888275935501647e-05, + "loss": 0.0497, + "step": 2390 + }, + { + "epoch": 0.14089468122578372, + "grad_norm": 0.8069745898246765, + "learning_rate": 9.887301739140066e-05, + "loss": 0.0365, + "step": 2400 + }, + { + "epoch": 0.1414817423975578, + "grad_norm": 6.08782434463501, + "learning_rate": 9.886323362271882e-05, + "loss": 0.0764, + "step": 2410 + }, + { + "epoch": 0.14206880356933194, + "grad_norm": 2.0983567237854004, + "learning_rate": 9.88534080573398e-05, + "loss": 0.0562, + "step": 2420 + }, + { + "epoch": 0.14265586474110603, + "grad_norm": 0.8409938812255859, + "learning_rate": 9.884354070366822e-05, + "loss": 0.0368, + "step": 2430 + }, + { + "epoch": 0.14324292591288013, + "grad_norm": 2.17271089553833, + "learning_rate": 9.883363157014442e-05, + "loss": 0.024, + "step": 2440 + }, + { + "epoch": 0.14382998708465422, + "grad_norm": 1.280503273010254, + "learning_rate": 9.882368066524448e-05, + "loss": 0.0247, + "step": 2450 + }, + { + "epoch": 0.14441704825642832, + "grad_norm": 2.7436742782592773, + "learning_rate": 9.881368799748021e-05, + "loss": 0.0458, + "step": 2460 + }, + { + "epoch": 0.1450041094282024, + "grad_norm": 2.1065752506256104, + "learning_rate": 9.880365357539917e-05, + "loss": 0.0557, + "step": 2470 + }, + { + "epoch": 0.1455911705999765, + "grad_norm": 2.5970640182495117, + "learning_rate": 9.879357740758462e-05, + "loss": 0.0661, + "step": 2480 + }, + { + "epoch": 0.14617823177175063, + "grad_norm": 2.9739792346954346, + "learning_rate": 9.878345950265552e-05, + "loss": 0.0725, + "step": 2490 + }, + { + "epoch": 0.14676529294352472, + "grad_norm": 3.2631781101226807, + "learning_rate": 9.877329986926653e-05, + "loss": 0.043, + "step": 2500 + }, + { + "epoch": 0.14735235411529882, + "grad_norm": 2.616386651992798, + "learning_rate": 9.876309851610801e-05, + "loss": 0.0345, + "step": 2510 + }, + { + "epoch": 0.1479394152870729, + "grad_norm": 2.065317153930664, + "learning_rate": 9.875285545190603e-05, + "loss": 0.0458, + "step": 2520 + }, + { + "epoch": 0.148526476458847, + "grad_norm": 0.7852377891540527, + "learning_rate": 9.874257068542227e-05, + "loss": 0.0303, + "step": 2530 + }, + { + "epoch": 0.1491135376306211, + "grad_norm": 1.7618194818496704, + "learning_rate": 9.873224422545417e-05, + "loss": 0.0558, + "step": 2540 + }, + { + "epoch": 0.1497005988023952, + "grad_norm": 1.0067861080169678, + "learning_rate": 9.872187608083478e-05, + "loss": 0.0234, + "step": 2550 + }, + { + "epoch": 0.15028765997416932, + "grad_norm": 5.33658504486084, + "learning_rate": 9.871146626043282e-05, + "loss": 0.0451, + "step": 2560 + }, + { + "epoch": 0.1508747211459434, + "grad_norm": 1.3059951066970825, + "learning_rate": 9.870101477315263e-05, + "loss": 0.0463, + "step": 2570 + }, + { + "epoch": 0.1514617823177175, + "grad_norm": 1.686173677444458, + "learning_rate": 9.869052162793424e-05, + "loss": 0.0425, + "step": 2580 + }, + { + "epoch": 0.1520488434894916, + 
"grad_norm": 2.3160104751586914, + "learning_rate": 9.867998683375329e-05, + "loss": 0.0555, + "step": 2590 + }, + { + "epoch": 0.1526359046612657, + "grad_norm": 0.8404874205589294, + "learning_rate": 9.866941039962104e-05, + "loss": 0.0361, + "step": 2600 + }, + { + "epoch": 0.1532229658330398, + "grad_norm": 2.111457109451294, + "learning_rate": 9.865879233458438e-05, + "loss": 0.0263, + "step": 2610 + }, + { + "epoch": 0.15381002700481392, + "grad_norm": 3.2856898307800293, + "learning_rate": 9.86481326477258e-05, + "loss": 0.0476, + "step": 2620 + }, + { + "epoch": 0.154397088176588, + "grad_norm": 0.6448665857315063, + "learning_rate": 9.863743134816342e-05, + "loss": 0.0428, + "step": 2630 + }, + { + "epoch": 0.1549841493483621, + "grad_norm": 2.341824531555176, + "learning_rate": 9.862668844505087e-05, + "loss": 0.0518, + "step": 2640 + }, + { + "epoch": 0.1555712105201362, + "grad_norm": 3.485743761062622, + "learning_rate": 9.86159039475775e-05, + "loss": 0.0454, + "step": 2650 + }, + { + "epoch": 0.1561582716919103, + "grad_norm": 1.6375038623809814, + "learning_rate": 9.86050778649681e-05, + "loss": 0.0438, + "step": 2660 + }, + { + "epoch": 0.1567453328636844, + "grad_norm": 4.4719672203063965, + "learning_rate": 9.859421020648317e-05, + "loss": 0.0361, + "step": 2670 + }, + { + "epoch": 0.15733239403545848, + "grad_norm": 1.005717396736145, + "learning_rate": 9.858330098141866e-05, + "loss": 0.0367, + "step": 2680 + }, + { + "epoch": 0.1579194552072326, + "grad_norm": 1.5214030742645264, + "learning_rate": 9.857235019910611e-05, + "loss": 0.0365, + "step": 2690 + }, + { + "epoch": 0.1585065163790067, + "grad_norm": 2.988581895828247, + "learning_rate": 9.856135786891265e-05, + "loss": 0.0521, + "step": 2700 + }, + { + "epoch": 0.1590935775507808, + "grad_norm": 2.4597716331481934, + "learning_rate": 9.855032400024089e-05, + "loss": 0.0311, + "step": 2710 + }, + { + "epoch": 0.1596806387225549, + "grad_norm": 1.8115123510360718, + "learning_rate": 9.853924860252898e-05, + "loss": 0.0458, + "step": 2720 + }, + { + "epoch": 0.16026769989432899, + "grad_norm": 2.618330955505371, + "learning_rate": 9.852813168525064e-05, + "loss": 0.0295, + "step": 2730 + }, + { + "epoch": 0.16085476106610308, + "grad_norm": 1.6817182302474976, + "learning_rate": 9.851697325791505e-05, + "loss": 0.03, + "step": 2740 + }, + { + "epoch": 0.16144182223787718, + "grad_norm": 1.044686198234558, + "learning_rate": 9.850577333006693e-05, + "loss": 0.0393, + "step": 2750 + }, + { + "epoch": 0.1620288834096513, + "grad_norm": 4.514500141143799, + "learning_rate": 9.84945319112865e-05, + "loss": 0.0397, + "step": 2760 + }, + { + "epoch": 0.1626159445814254, + "grad_norm": 1.7709953784942627, + "learning_rate": 9.848324901118943e-05, + "loss": 0.0324, + "step": 2770 + }, + { + "epoch": 0.1632030057531995, + "grad_norm": 3.2024428844451904, + "learning_rate": 9.847192463942694e-05, + "loss": 0.0369, + "step": 2780 + }, + { + "epoch": 0.16379006692497358, + "grad_norm": 2.4505691528320312, + "learning_rate": 9.846055880568566e-05, + "loss": 0.0494, + "step": 2790 + }, + { + "epoch": 0.16437712809674768, + "grad_norm": 0.6586551666259766, + "learning_rate": 9.844915151968773e-05, + "loss": 0.0284, + "step": 2800 + }, + { + "epoch": 0.16496418926852177, + "grad_norm": 1.8783758878707886, + "learning_rate": 9.843770279119069e-05, + "loss": 0.0451, + "step": 2810 + }, + { + "epoch": 0.16555125044029587, + "grad_norm": 1.4282265901565552, + "learning_rate": 9.842621262998761e-05, + "loss": 0.0307, + "step": 2820 + 
}, + { + "epoch": 0.16613831161207, + "grad_norm": 1.9189605712890625, + "learning_rate": 9.841468104590695e-05, + "loss": 0.0483, + "step": 2830 + }, + { + "epoch": 0.16672537278384408, + "grad_norm": 1.8889552354812622, + "learning_rate": 9.840310804881261e-05, + "loss": 0.066, + "step": 2840 + }, + { + "epoch": 0.16731243395561818, + "grad_norm": 3.8601179122924805, + "learning_rate": 9.839149364860389e-05, + "loss": 0.0332, + "step": 2850 + }, + { + "epoch": 0.16789949512739227, + "grad_norm": 1.879103660583496, + "learning_rate": 9.837983785521559e-05, + "loss": 0.056, + "step": 2860 + }, + { + "epoch": 0.16848655629916637, + "grad_norm": 2.205148696899414, + "learning_rate": 9.83681406786178e-05, + "loss": 0.0375, + "step": 2870 + }, + { + "epoch": 0.16907361747094046, + "grad_norm": 1.0379294157028198, + "learning_rate": 9.835640212881608e-05, + "loss": 0.0265, + "step": 2880 + }, + { + "epoch": 0.16966067864271456, + "grad_norm": 3.219978094100952, + "learning_rate": 9.834462221585139e-05, + "loss": 0.0234, + "step": 2890 + }, + { + "epoch": 0.17024773981448868, + "grad_norm": 3.693683624267578, + "learning_rate": 9.833280094980002e-05, + "loss": 0.0869, + "step": 2900 + }, + { + "epoch": 0.17083480098626277, + "grad_norm": 2.207987070083618, + "learning_rate": 9.832093834077367e-05, + "loss": 0.045, + "step": 2910 + }, + { + "epoch": 0.17142186215803687, + "grad_norm": 0.669643759727478, + "learning_rate": 9.83090343989194e-05, + "loss": 0.0302, + "step": 2920 + }, + { + "epoch": 0.17200892332981096, + "grad_norm": 2.841738700866699, + "learning_rate": 9.829708913441962e-05, + "loss": 0.0455, + "step": 2930 + }, + { + "epoch": 0.17259598450158506, + "grad_norm": 2.0086352825164795, + "learning_rate": 9.828510255749208e-05, + "loss": 0.0255, + "step": 2940 + }, + { + "epoch": 0.17318304567335915, + "grad_norm": 1.1377476453781128, + "learning_rate": 9.827307467838987e-05, + "loss": 0.0432, + "step": 2950 + }, + { + "epoch": 0.17377010684513328, + "grad_norm": 2.0405306816101074, + "learning_rate": 9.826100550740143e-05, + "loss": 0.0498, + "step": 2960 + }, + { + "epoch": 0.17435716801690737, + "grad_norm": 1.9128247499465942, + "learning_rate": 9.824889505485048e-05, + "loss": 0.0223, + "step": 2970 + }, + { + "epoch": 0.17494422918868147, + "grad_norm": 2.2797882556915283, + "learning_rate": 9.823674333109608e-05, + "loss": 0.0663, + "step": 2980 + }, + { + "epoch": 0.17553129036045556, + "grad_norm": 1.8146817684173584, + "learning_rate": 9.82245503465326e-05, + "loss": 0.0255, + "step": 2990 + }, + { + "epoch": 0.17611835153222966, + "grad_norm": 0.6516047716140747, + "learning_rate": 9.821231611158969e-05, + "loss": 0.0304, + "step": 3000 + }, + { + "epoch": 0.17611835153222966, + "eval_loss": 0.4754152297973633, + "eval_runtime": 269.6361, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 3.505, + "step": 3000 + }, + { + "epoch": 0.17670541270400375, + "grad_norm": 0.47869399189949036, + "learning_rate": 9.820004063673228e-05, + "loss": 0.033, + "step": 3010 + }, + { + "epoch": 0.17729247387577785, + "grad_norm": 2.7709405422210693, + "learning_rate": 9.818772393246058e-05, + "loss": 0.0433, + "step": 3020 + }, + { + "epoch": 0.17787953504755197, + "grad_norm": 0.6569466590881348, + "learning_rate": 9.817536600931007e-05, + "loss": 0.0356, + "step": 3030 + }, + { + "epoch": 0.17846659621932606, + "grad_norm": 2.474756956100464, + "learning_rate": 9.81629668778515e-05, + "loss": 0.0626, + "step": 3040 + }, + { + "epoch": 0.17905365739110016, + "grad_norm": 
3.932004451751709, + "learning_rate": 9.815052654869084e-05, + "loss": 0.0589, + "step": 3050 + }, + { + "epoch": 0.17964071856287425, + "grad_norm": 1.8189294338226318, + "learning_rate": 9.813804503246932e-05, + "loss": 0.0407, + "step": 3060 + }, + { + "epoch": 0.18022777973464835, + "grad_norm": 1.9111417531967163, + "learning_rate": 9.812552233986338e-05, + "loss": 0.0389, + "step": 3070 + }, + { + "epoch": 0.18081484090642244, + "grad_norm": 3.685927629470825, + "learning_rate": 9.811295848158472e-05, + "loss": 0.0425, + "step": 3080 + }, + { + "epoch": 0.18140190207819654, + "grad_norm": 2.441706895828247, + "learning_rate": 9.810035346838023e-05, + "loss": 0.0436, + "step": 3090 + }, + { + "epoch": 0.18198896324997066, + "grad_norm": 1.8486565351486206, + "learning_rate": 9.8087707311032e-05, + "loss": 0.05, + "step": 3100 + }, + { + "epoch": 0.18257602442174475, + "grad_norm": 0.3728896379470825, + "learning_rate": 9.807502002035729e-05, + "loss": 0.0323, + "step": 3110 + }, + { + "epoch": 0.18316308559351885, + "grad_norm": 1.8062225580215454, + "learning_rate": 9.80622916072086e-05, + "loss": 0.0199, + "step": 3120 + }, + { + "epoch": 0.18375014676529294, + "grad_norm": 0.7379009127616882, + "learning_rate": 9.804952208247358e-05, + "loss": 0.0365, + "step": 3130 + }, + { + "epoch": 0.18433720793706704, + "grad_norm": 2.159383773803711, + "learning_rate": 9.803671145707502e-05, + "loss": 0.0508, + "step": 3140 + }, + { + "epoch": 0.18492426910884113, + "grad_norm": 1.8557137250900269, + "learning_rate": 9.80238597419709e-05, + "loss": 0.0744, + "step": 3150 + }, + { + "epoch": 0.18551133028061523, + "grad_norm": 0.7379667162895203, + "learning_rate": 9.801096694815435e-05, + "loss": 0.0436, + "step": 3160 + }, + { + "epoch": 0.18609839145238935, + "grad_norm": 2.2723872661590576, + "learning_rate": 9.799803308665362e-05, + "loss": 0.0698, + "step": 3170 + }, + { + "epoch": 0.18668545262416344, + "grad_norm": 2.0510456562042236, + "learning_rate": 9.798505816853208e-05, + "loss": 0.0361, + "step": 3180 + }, + { + "epoch": 0.18727251379593754, + "grad_norm": 1.9840871095657349, + "learning_rate": 9.797204220488823e-05, + "loss": 0.0456, + "step": 3190 + }, + { + "epoch": 0.18785957496771163, + "grad_norm": 0.5295336246490479, + "learning_rate": 9.795898520685569e-05, + "loss": 0.069, + "step": 3200 + }, + { + "epoch": 0.18844663613948573, + "grad_norm": 1.9793596267700195, + "learning_rate": 9.794588718560319e-05, + "loss": 0.076, + "step": 3210 + }, + { + "epoch": 0.18903369731125982, + "grad_norm": 1.951541781425476, + "learning_rate": 9.793274815233451e-05, + "loss": 0.0556, + "step": 3220 + }, + { + "epoch": 0.18962075848303392, + "grad_norm": 2.235807418823242, + "learning_rate": 9.791956811828855e-05, + "loss": 0.0489, + "step": 3230 + }, + { + "epoch": 0.19020781965480804, + "grad_norm": 2.71808123588562, + "learning_rate": 9.790634709473924e-05, + "loss": 0.0408, + "step": 3240 + }, + { + "epoch": 0.19079488082658214, + "grad_norm": 1.706302285194397, + "learning_rate": 9.789308509299562e-05, + "loss": 0.0242, + "step": 3250 + }, + { + "epoch": 0.19138194199835623, + "grad_norm": 2.558588981628418, + "learning_rate": 9.787978212440176e-05, + "loss": 0.0296, + "step": 3260 + }, + { + "epoch": 0.19196900317013033, + "grad_norm": 1.5892531871795654, + "learning_rate": 9.786643820033674e-05, + "loss": 0.0527, + "step": 3270 + }, + { + "epoch": 0.19255606434190442, + "grad_norm": 2.68218731880188, + "learning_rate": 9.785305333221474e-05, + "loss": 0.0604, + "step": 3280 + }, + 
{ + "epoch": 0.19314312551367852, + "grad_norm": 2.679905652999878, + "learning_rate": 9.78396275314849e-05, + "loss": 0.0211, + "step": 3290 + }, + { + "epoch": 0.19373018668545264, + "grad_norm": 1.5836975574493408, + "learning_rate": 9.782616080963143e-05, + "loss": 0.0665, + "step": 3300 + }, + { + "epoch": 0.19431724785722673, + "grad_norm": 1.1025676727294922, + "learning_rate": 9.781265317817347e-05, + "loss": 0.039, + "step": 3310 + }, + { + "epoch": 0.19490430902900083, + "grad_norm": 2.7383909225463867, + "learning_rate": 9.779910464866523e-05, + "loss": 0.0612, + "step": 3320 + }, + { + "epoch": 0.19549137020077492, + "grad_norm": 1.8712137937545776, + "learning_rate": 9.778551523269586e-05, + "loss": 0.0458, + "step": 3330 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 0.3346378803253174, + "learning_rate": 9.777188494188948e-05, + "loss": 0.0316, + "step": 3340 + }, + { + "epoch": 0.1966654925443231, + "grad_norm": 3.2127461433410645, + "learning_rate": 9.775821378790519e-05, + "loss": 0.0396, + "step": 3350 + }, + { + "epoch": 0.1972525537160972, + "grad_norm": 2.9048569202423096, + "learning_rate": 9.774450178243706e-05, + "loss": 0.0397, + "step": 3360 + }, + { + "epoch": 0.19783961488787133, + "grad_norm": 2.8324990272521973, + "learning_rate": 9.773074893721407e-05, + "loss": 0.03, + "step": 3370 + }, + { + "epoch": 0.19842667605964542, + "grad_norm": 3.4215381145477295, + "learning_rate": 9.771695526400013e-05, + "loss": 0.0548, + "step": 3380 + }, + { + "epoch": 0.19901373723141952, + "grad_norm": 2.7100284099578857, + "learning_rate": 9.770312077459411e-05, + "loss": 0.0399, + "step": 3390 + }, + { + "epoch": 0.1996007984031936, + "grad_norm": 1.099760890007019, + "learning_rate": 9.768924548082979e-05, + "loss": 0.0456, + "step": 3400 + }, + { + "epoch": 0.2001878595749677, + "grad_norm": 5.312976360321045, + "learning_rate": 9.76753293945758e-05, + "loss": 0.0423, + "step": 3410 + }, + { + "epoch": 0.2007749207467418, + "grad_norm": 2.148909091949463, + "learning_rate": 9.766137252773572e-05, + "loss": 0.0388, + "step": 3420 + }, + { + "epoch": 0.2013619819185159, + "grad_norm": 2.3653364181518555, + "learning_rate": 9.764737489224799e-05, + "loss": 0.0378, + "step": 3430 + }, + { + "epoch": 0.20194904309029002, + "grad_norm": 3.389256000518799, + "learning_rate": 9.763333650008593e-05, + "loss": 0.0486, + "step": 3440 + }, + { + "epoch": 0.20253610426206411, + "grad_norm": 1.1765735149383545, + "learning_rate": 9.76192573632577e-05, + "loss": 0.0774, + "step": 3450 + }, + { + "epoch": 0.2031231654338382, + "grad_norm": 3.8393447399139404, + "learning_rate": 9.760513749380635e-05, + "loss": 0.0538, + "step": 3460 + }, + { + "epoch": 0.2037102266056123, + "grad_norm": 2.3688712120056152, + "learning_rate": 9.759097690380976e-05, + "loss": 0.0289, + "step": 3470 + }, + { + "epoch": 0.2042972877773864, + "grad_norm": 0.3408827781677246, + "learning_rate": 9.757677560538061e-05, + "loss": 0.0424, + "step": 3480 + }, + { + "epoch": 0.2048843489491605, + "grad_norm": 1.3343703746795654, + "learning_rate": 9.756253361066643e-05, + "loss": 0.0301, + "step": 3490 + }, + { + "epoch": 0.2054714101209346, + "grad_norm": 1.2232662439346313, + "learning_rate": 9.754825093184958e-05, + "loss": 0.0203, + "step": 3500 + }, + { + "epoch": 0.2060584712927087, + "grad_norm": 2.336533308029175, + "learning_rate": 9.753392758114718e-05, + "loss": 0.0402, + "step": 3510 + }, + { + "epoch": 0.2066455324644828, + "grad_norm": 2.738333225250244, + "learning_rate": 
9.751956357081115e-05, + "loss": 0.0438, + "step": 3520 + }, + { + "epoch": 0.2072325936362569, + "grad_norm": 1.7353787422180176, + "learning_rate": 9.750515891312819e-05, + "loss": 0.032, + "step": 3530 + }, + { + "epoch": 0.207819654808031, + "grad_norm": 1.526991367340088, + "learning_rate": 9.749071362041981e-05, + "loss": 0.0356, + "step": 3540 + }, + { + "epoch": 0.2084067159798051, + "grad_norm": 3.5745511054992676, + "learning_rate": 9.747622770504221e-05, + "loss": 0.0425, + "step": 3550 + }, + { + "epoch": 0.20899377715157919, + "grad_norm": 1.9567959308624268, + "learning_rate": 9.746170117938638e-05, + "loss": 0.0604, + "step": 3560 + }, + { + "epoch": 0.20958083832335328, + "grad_norm": 2.5341362953186035, + "learning_rate": 9.744713405587804e-05, + "loss": 0.039, + "step": 3570 + }, + { + "epoch": 0.2101678994951274, + "grad_norm": 4.469109058380127, + "learning_rate": 9.743252634697767e-05, + "loss": 0.044, + "step": 3580 + }, + { + "epoch": 0.2107549606669015, + "grad_norm": 3.514477014541626, + "learning_rate": 9.741787806518035e-05, + "loss": 0.0514, + "step": 3590 + }, + { + "epoch": 0.2113420218386756, + "grad_norm": 4.614218235015869, + "learning_rate": 9.740318922301602e-05, + "loss": 0.0357, + "step": 3600 + }, + { + "epoch": 0.2119290830104497, + "grad_norm": 4.051399230957031, + "learning_rate": 9.738845983304921e-05, + "loss": 0.0431, + "step": 3610 + }, + { + "epoch": 0.21251614418222378, + "grad_norm": 0.22102586925029755, + "learning_rate": 9.737368990787916e-05, + "loss": 0.058, + "step": 3620 + }, + { + "epoch": 0.21310320535399788, + "grad_norm": 2.6751372814178467, + "learning_rate": 9.735887946013982e-05, + "loss": 0.0424, + "step": 3630 + }, + { + "epoch": 0.213690266525772, + "grad_norm": 2.331186532974243, + "learning_rate": 9.734402850249973e-05, + "loss": 0.0388, + "step": 3640 + }, + { + "epoch": 0.2142773276975461, + "grad_norm": 0.5822759866714478, + "learning_rate": 9.732913704766216e-05, + "loss": 0.0405, + "step": 3650 + }, + { + "epoch": 0.2148643888693202, + "grad_norm": 1.9439653158187866, + "learning_rate": 9.731420510836494e-05, + "loss": 0.0445, + "step": 3660 + }, + { + "epoch": 0.21545145004109428, + "grad_norm": 4.179188251495361, + "learning_rate": 9.729923269738062e-05, + "loss": 0.0634, + "step": 3670 + }, + { + "epoch": 0.21603851121286838, + "grad_norm": 4.009206771850586, + "learning_rate": 9.728421982751628e-05, + "loss": 0.0419, + "step": 3680 + }, + { + "epoch": 0.21662557238464247, + "grad_norm": 2.237543821334839, + "learning_rate": 9.726916651161367e-05, + "loss": 0.0254, + "step": 3690 + }, + { + "epoch": 0.21721263355641657, + "grad_norm": 2.975160837173462, + "learning_rate": 9.725407276254909e-05, + "loss": 0.0286, + "step": 3700 + }, + { + "epoch": 0.2177996947281907, + "grad_norm": 1.5479967594146729, + "learning_rate": 9.723893859323348e-05, + "loss": 0.0326, + "step": 3710 + }, + { + "epoch": 0.21838675589996479, + "grad_norm": 1.7621835470199585, + "learning_rate": 9.722376401661233e-05, + "loss": 0.0437, + "step": 3720 + }, + { + "epoch": 0.21897381707173888, + "grad_norm": 2.332693099975586, + "learning_rate": 9.720854904566566e-05, + "loss": 0.0613, + "step": 3730 + }, + { + "epoch": 0.21956087824351297, + "grad_norm": 2.5728635787963867, + "learning_rate": 9.71932936934081e-05, + "loss": 0.0354, + "step": 3740 + }, + { + "epoch": 0.22014793941528707, + "grad_norm": 0.8831126093864441, + "learning_rate": 9.717799797288877e-05, + "loss": 0.0313, + "step": 3750 + }, + { + "epoch": 0.22073500058706116, + 
"grad_norm": 1.9385029077529907, + "learning_rate": 9.716266189719136e-05, + "loss": 0.0508, + "step": 3760 + }, + { + "epoch": 0.22132206175883526, + "grad_norm": 1.5319123268127441, + "learning_rate": 9.714728547943405e-05, + "loss": 0.0689, + "step": 3770 + }, + { + "epoch": 0.22190912293060938, + "grad_norm": 1.2483266592025757, + "learning_rate": 9.713186873276955e-05, + "loss": 0.0296, + "step": 3780 + }, + { + "epoch": 0.22249618410238348, + "grad_norm": 2.23776912689209, + "learning_rate": 9.711641167038506e-05, + "loss": 0.064, + "step": 3790 + }, + { + "epoch": 0.22308324527415757, + "grad_norm": 5.403620719909668, + "learning_rate": 9.710091430550224e-05, + "loss": 0.0404, + "step": 3800 + }, + { + "epoch": 0.22367030644593167, + "grad_norm": 2.370596170425415, + "learning_rate": 9.708537665137727e-05, + "loss": 0.0396, + "step": 3810 + }, + { + "epoch": 0.22425736761770576, + "grad_norm": 1.913191318511963, + "learning_rate": 9.706979872130077e-05, + "loss": 0.0435, + "step": 3820 + }, + { + "epoch": 0.22484442878947986, + "grad_norm": 1.9024417400360107, + "learning_rate": 9.70541805285978e-05, + "loss": 0.0395, + "step": 3830 + }, + { + "epoch": 0.22543148996125395, + "grad_norm": 2.4339756965637207, + "learning_rate": 9.703852208662786e-05, + "loss": 0.0526, + "step": 3840 + }, + { + "epoch": 0.22601855113302807, + "grad_norm": 1.6247811317443848, + "learning_rate": 9.702282340878493e-05, + "loss": 0.07, + "step": 3850 + }, + { + "epoch": 0.22660561230480217, + "grad_norm": 2.0454747676849365, + "learning_rate": 9.700708450849732e-05, + "loss": 0.0329, + "step": 3860 + }, + { + "epoch": 0.22719267347657626, + "grad_norm": 2.070821523666382, + "learning_rate": 9.69913053992278e-05, + "loss": 0.047, + "step": 3870 + }, + { + "epoch": 0.22777973464835036, + "grad_norm": 1.7548588514328003, + "learning_rate": 9.697548609447355e-05, + "loss": 0.046, + "step": 3880 + }, + { + "epoch": 0.22836679582012445, + "grad_norm": 1.6130026578903198, + "learning_rate": 9.695962660776607e-05, + "loss": 0.0317, + "step": 3890 + }, + { + "epoch": 0.22895385699189855, + "grad_norm": 0.5520845651626587, + "learning_rate": 9.694372695267131e-05, + "loss": 0.0446, + "step": 3900 + }, + { + "epoch": 0.22954091816367264, + "grad_norm": 2.591543436050415, + "learning_rate": 9.692778714278952e-05, + "loss": 0.0391, + "step": 3910 + }, + { + "epoch": 0.23012797933544676, + "grad_norm": 5.135702133178711, + "learning_rate": 9.69118071917553e-05, + "loss": 0.0307, + "step": 3920 + }, + { + "epoch": 0.23071504050722086, + "grad_norm": 1.5746864080429077, + "learning_rate": 9.689578711323761e-05, + "loss": 0.0368, + "step": 3930 + }, + { + "epoch": 0.23130210167899495, + "grad_norm": 1.950485110282898, + "learning_rate": 9.687972692093973e-05, + "loss": 0.036, + "step": 3940 + }, + { + "epoch": 0.23188916285076905, + "grad_norm": 0.6898679733276367, + "learning_rate": 9.686362662859927e-05, + "loss": 0.0322, + "step": 3950 + }, + { + "epoch": 0.23247622402254314, + "grad_norm": 0.18586641550064087, + "learning_rate": 9.68474862499881e-05, + "loss": 0.0426, + "step": 3960 + }, + { + "epoch": 0.23306328519431724, + "grad_norm": 0.7819411754608154, + "learning_rate": 9.683130579891238e-05, + "loss": 0.0424, + "step": 3970 + }, + { + "epoch": 0.23365034636609136, + "grad_norm": 0.5046377182006836, + "learning_rate": 9.68150852892126e-05, + "loss": 0.0499, + "step": 3980 + }, + { + "epoch": 0.23423740753786546, + "grad_norm": 3.63139009475708, + "learning_rate": 9.679882473476344e-05, + "loss": 0.0306, + "step": 
3990 + }, + { + "epoch": 0.23482446870963955, + "grad_norm": 2.4393115043640137, + "learning_rate": 9.67825241494739e-05, + "loss": 0.0271, + "step": 4000 + }, + { + "epoch": 0.23541152988141364, + "grad_norm": 2.4522364139556885, + "learning_rate": 9.676618354728722e-05, + "loss": 0.0361, + "step": 4010 + }, + { + "epoch": 0.23599859105318774, + "grad_norm": 1.2558209896087646, + "learning_rate": 9.67498029421808e-05, + "loss": 0.0701, + "step": 4020 + }, + { + "epoch": 0.23658565222496183, + "grad_norm": 3.0067551136016846, + "learning_rate": 9.673338234816632e-05, + "loss": 0.0604, + "step": 4030 + }, + { + "epoch": 0.23717271339673593, + "grad_norm": 4.278449535369873, + "learning_rate": 9.671692177928966e-05, + "loss": 0.0558, + "step": 4040 + }, + { + "epoch": 0.23775977456851005, + "grad_norm": 1.1294217109680176, + "learning_rate": 9.670042124963087e-05, + "loss": 0.0417, + "step": 4050 + }, + { + "epoch": 0.23834683574028415, + "grad_norm": 1.9507063627243042, + "learning_rate": 9.668388077330421e-05, + "loss": 0.0321, + "step": 4060 + }, + { + "epoch": 0.23893389691205824, + "grad_norm": 0.8584306240081787, + "learning_rate": 9.666730036445809e-05, + "loss": 0.045, + "step": 4070 + }, + { + "epoch": 0.23952095808383234, + "grad_norm": 1.6064647436141968, + "learning_rate": 9.665068003727507e-05, + "loss": 0.0373, + "step": 4080 + }, + { + "epoch": 0.24010801925560643, + "grad_norm": 1.9879229068756104, + "learning_rate": 9.663401980597188e-05, + "loss": 0.0437, + "step": 4090 + }, + { + "epoch": 0.24069508042738053, + "grad_norm": 0.5784907341003418, + "learning_rate": 9.661731968479936e-05, + "loss": 0.041, + "step": 4100 + }, + { + "epoch": 0.24128214159915462, + "grad_norm": 0.7829803824424744, + "learning_rate": 9.660057968804249e-05, + "loss": 0.028, + "step": 4110 + }, + { + "epoch": 0.24186920277092874, + "grad_norm": 1.7535614967346191, + "learning_rate": 9.658379983002035e-05, + "loss": 0.0278, + "step": 4120 + }, + { + "epoch": 0.24245626394270284, + "grad_norm": 2.482191562652588, + "learning_rate": 9.65669801250861e-05, + "loss": 0.0459, + "step": 4130 + }, + { + "epoch": 0.24304332511447693, + "grad_norm": 3.86083984375, + "learning_rate": 9.655012058762703e-05, + "loss": 0.0457, + "step": 4140 + }, + { + "epoch": 0.24363038628625103, + "grad_norm": 1.0339630842208862, + "learning_rate": 9.653322123206445e-05, + "loss": 0.0268, + "step": 4150 + }, + { + "epoch": 0.24421744745802512, + "grad_norm": 1.7193056344985962, + "learning_rate": 9.651628207285377e-05, + "loss": 0.0328, + "step": 4160 + }, + { + "epoch": 0.24480450862979922, + "grad_norm": 2.191051721572876, + "learning_rate": 9.649930312448441e-05, + "loss": 0.0313, + "step": 4170 + }, + { + "epoch": 0.2453915698015733, + "grad_norm": 3.469489097595215, + "learning_rate": 9.648228440147987e-05, + "loss": 0.0594, + "step": 4180 + }, + { + "epoch": 0.24597863097334743, + "grad_norm": 0.9163156151771545, + "learning_rate": 9.646522591839764e-05, + "loss": 0.024, + "step": 4190 + }, + { + "epoch": 0.24656569214512153, + "grad_norm": 2.414659261703491, + "learning_rate": 9.64481276898292e-05, + "loss": 0.0557, + "step": 4200 + }, + { + "epoch": 0.24715275331689562, + "grad_norm": 2.6754441261291504, + "learning_rate": 9.64309897304001e-05, + "loss": 0.0529, + "step": 4210 + }, + { + "epoch": 0.24773981448866972, + "grad_norm": 2.449244499206543, + "learning_rate": 9.641381205476981e-05, + "loss": 0.0515, + "step": 4220 + }, + { + "epoch": 0.2483268756604438, + "grad_norm": 1.6271411180496216, + "learning_rate": 
9.639659467763178e-05, + "loss": 0.0465, + "step": 4230 + }, + { + "epoch": 0.2489139368322179, + "grad_norm": 4.679442882537842, + "learning_rate": 9.637933761371345e-05, + "loss": 0.0404, + "step": 4240 + }, + { + "epoch": 0.249500998003992, + "grad_norm": 2.5515072345733643, + "learning_rate": 9.636204087777618e-05, + "loss": 0.0464, + "step": 4250 + }, + { + "epoch": 0.2500880591757661, + "grad_norm": 2.178102731704712, + "learning_rate": 9.63447044846153e-05, + "loss": 0.0574, + "step": 4260 + }, + { + "epoch": 0.2506751203475402, + "grad_norm": 0.9000117778778076, + "learning_rate": 9.632732844906e-05, + "loss": 0.0484, + "step": 4270 + }, + { + "epoch": 0.2512621815193143, + "grad_norm": 3.017303705215454, + "learning_rate": 9.630991278597344e-05, + "loss": 0.0692, + "step": 4280 + }, + { + "epoch": 0.25184924269108844, + "grad_norm": 0.9426410794258118, + "learning_rate": 9.629245751025262e-05, + "loss": 0.0574, + "step": 4290 + }, + { + "epoch": 0.25243630386286253, + "grad_norm": 2.272217273712158, + "learning_rate": 9.62749626368285e-05, + "loss": 0.0472, + "step": 4300 + }, + { + "epoch": 0.2530233650346366, + "grad_norm": 4.943531513214111, + "learning_rate": 9.625742818066586e-05, + "loss": 0.0659, + "step": 4310 + }, + { + "epoch": 0.2536104262064107, + "grad_norm": 2.196079730987549, + "learning_rate": 9.623985415676332e-05, + "loss": 0.0466, + "step": 4320 + }, + { + "epoch": 0.2541974873781848, + "grad_norm": 2.0745205879211426, + "learning_rate": 9.622224058015339e-05, + "loss": 0.0356, + "step": 4330 + }, + { + "epoch": 0.2547845485499589, + "grad_norm": 2.846806049346924, + "learning_rate": 9.62045874659024e-05, + "loss": 0.0741, + "step": 4340 + }, + { + "epoch": 0.255371609721733, + "grad_norm": 2.5163679122924805, + "learning_rate": 9.618689482911047e-05, + "loss": 0.0425, + "step": 4350 + }, + { + "epoch": 0.2559586708935071, + "grad_norm": 1.4826740026474, + "learning_rate": 9.616916268491158e-05, + "loss": 0.0234, + "step": 4360 + }, + { + "epoch": 0.2565457320652812, + "grad_norm": 0.04228798300027847, + "learning_rate": 9.615139104847348e-05, + "loss": 0.0416, + "step": 4370 + }, + { + "epoch": 0.2571327932370553, + "grad_norm": 0.12215807288885117, + "learning_rate": 9.613357993499766e-05, + "loss": 0.0608, + "step": 4380 + }, + { + "epoch": 0.2577198544088294, + "grad_norm": 1.0769306421279907, + "learning_rate": 9.611572935971941e-05, + "loss": 0.0694, + "step": 4390 + }, + { + "epoch": 0.2583069155806035, + "grad_norm": 1.0067722797393799, + "learning_rate": 9.609783933790784e-05, + "loss": 0.0328, + "step": 4400 + }, + { + "epoch": 0.2588939767523776, + "grad_norm": 1.8800987005233765, + "learning_rate": 9.607990988486568e-05, + "loss": 0.0487, + "step": 4410 + }, + { + "epoch": 0.25948103792415167, + "grad_norm": 2.109830617904663, + "learning_rate": 9.606194101592947e-05, + "loss": 0.0328, + "step": 4420 + }, + { + "epoch": 0.2600680990959258, + "grad_norm": 1.2077661752700806, + "learning_rate": 9.604393274646945e-05, + "loss": 0.031, + "step": 4430 + }, + { + "epoch": 0.2606551602676999, + "grad_norm": 2.6591012477874756, + "learning_rate": 9.602588509188954e-05, + "loss": 0.0343, + "step": 4440 + }, + { + "epoch": 0.261242221439474, + "grad_norm": 2.237421989440918, + "learning_rate": 9.600779806762738e-05, + "loss": 0.0452, + "step": 4450 + }, + { + "epoch": 0.2618292826112481, + "grad_norm": 3.302521228790283, + "learning_rate": 9.59896716891543e-05, + "loss": 0.0444, + "step": 4460 + }, + { + "epoch": 0.2624163437830222, + "grad_norm": 
2.2960593700408936, + "learning_rate": 9.59715059719752e-05, + "loss": 0.0466, + "step": 4470 + }, + { + "epoch": 0.2630034049547963, + "grad_norm": 2.3601245880126953, + "learning_rate": 9.595330093162876e-05, + "loss": 0.0435, + "step": 4480 + }, + { + "epoch": 0.2635904661265704, + "grad_norm": 0.5983108282089233, + "learning_rate": 9.593505658368718e-05, + "loss": 0.0348, + "step": 4490 + }, + { + "epoch": 0.2641775272983445, + "grad_norm": 1.6568204164505005, + "learning_rate": 9.591677294375636e-05, + "loss": 0.0423, + "step": 4500 + }, + { + "epoch": 0.2647645884701186, + "grad_norm": 1.6575958728790283, + "learning_rate": 9.58984500274758e-05, + "loss": 0.0346, + "step": 4510 + }, + { + "epoch": 0.2653516496418927, + "grad_norm": 1.7203445434570312, + "learning_rate": 9.588008785051854e-05, + "loss": 0.0706, + "step": 4520 + }, + { + "epoch": 0.26593871081366677, + "grad_norm": 2.2138168811798096, + "learning_rate": 9.586168642859128e-05, + "loss": 0.0424, + "step": 4530 + }, + { + "epoch": 0.26652577198544086, + "grad_norm": 3.437802314758301, + "learning_rate": 9.584324577743425e-05, + "loss": 0.0374, + "step": 4540 + }, + { + "epoch": 0.26711283315721496, + "grad_norm": 0.9754136800765991, + "learning_rate": 9.582476591282119e-05, + "loss": 0.0486, + "step": 4550 + }, + { + "epoch": 0.2676998943289891, + "grad_norm": 5.477251052856445, + "learning_rate": 9.58062468505595e-05, + "loss": 0.0516, + "step": 4560 + }, + { + "epoch": 0.2682869555007632, + "grad_norm": 3.3450748920440674, + "learning_rate": 9.578768860649e-05, + "loss": 0.0409, + "step": 4570 + }, + { + "epoch": 0.2688740166725373, + "grad_norm": 2.78643536567688, + "learning_rate": 9.576909119648705e-05, + "loss": 0.0621, + "step": 4580 + }, + { + "epoch": 0.2694610778443114, + "grad_norm": 3.63116192817688, + "learning_rate": 9.575045463645858e-05, + "loss": 0.0757, + "step": 4590 + }, + { + "epoch": 0.2700481390160855, + "grad_norm": 2.1244685649871826, + "learning_rate": 9.573177894234591e-05, + "loss": 0.0418, + "step": 4600 + }, + { + "epoch": 0.2706352001878596, + "grad_norm": 4.1109724044799805, + "learning_rate": 9.571306413012388e-05, + "loss": 0.059, + "step": 4610 + }, + { + "epoch": 0.2712222613596337, + "grad_norm": 2.585906982421875, + "learning_rate": 9.569431021580082e-05, + "loss": 0.0388, + "step": 4620 + }, + { + "epoch": 0.27180932253140777, + "grad_norm": 3.3618996143341064, + "learning_rate": 9.567551721541846e-05, + "loss": 0.0511, + "step": 4630 + }, + { + "epoch": 0.27239638370318187, + "grad_norm": 0.7405043840408325, + "learning_rate": 9.565668514505199e-05, + "loss": 0.0513, + "step": 4640 + }, + { + "epoch": 0.27298344487495596, + "grad_norm": 2.23223614692688, + "learning_rate": 9.563781402081e-05, + "loss": 0.0417, + "step": 4650 + }, + { + "epoch": 0.27357050604673006, + "grad_norm": 1.4202497005462646, + "learning_rate": 9.56189038588345e-05, + "loss": 0.0411, + "step": 4660 + }, + { + "epoch": 0.27415756721850415, + "grad_norm": 3.3020284175872803, + "learning_rate": 9.559995467530091e-05, + "loss": 0.0371, + "step": 4670 + }, + { + "epoch": 0.27474462839027824, + "grad_norm": 1.8571077585220337, + "learning_rate": 9.558096648641797e-05, + "loss": 0.0443, + "step": 4680 + }, + { + "epoch": 0.27533168956205234, + "grad_norm": 1.0412755012512207, + "learning_rate": 9.556193930842785e-05, + "loss": 0.0393, + "step": 4690 + }, + { + "epoch": 0.2759187507338265, + "grad_norm": 0.8517029881477356, + "learning_rate": 9.554287315760603e-05, + "loss": 0.0631, + "step": 4700 + }, + { + "epoch": 
0.2765058119056006, + "grad_norm": 2.1072020530700684, + "learning_rate": 9.552376805026136e-05, + "loss": 0.051, + "step": 4710 + }, + { + "epoch": 0.2770928730773747, + "grad_norm": 1.2897216081619263, + "learning_rate": 9.550462400273596e-05, + "loss": 0.0242, + "step": 4720 + }, + { + "epoch": 0.2776799342491488, + "grad_norm": 0.9732093811035156, + "learning_rate": 9.54854410314053e-05, + "loss": 0.0276, + "step": 4730 + }, + { + "epoch": 0.27826699542092287, + "grad_norm": 3.7374629974365234, + "learning_rate": 9.546621915267815e-05, + "loss": 0.0536, + "step": 4740 + }, + { + "epoch": 0.27885405659269696, + "grad_norm": 2.724569320678711, + "learning_rate": 9.544695838299653e-05, + "loss": 0.0472, + "step": 4750 + }, + { + "epoch": 0.27944111776447106, + "grad_norm": 3.4818320274353027, + "learning_rate": 9.542765873883577e-05, + "loss": 0.0608, + "step": 4760 + }, + { + "epoch": 0.28002817893624515, + "grad_norm": 2.309112787246704, + "learning_rate": 9.540832023670439e-05, + "loss": 0.0567, + "step": 4770 + }, + { + "epoch": 0.28061524010801925, + "grad_norm": 1.942906379699707, + "learning_rate": 9.53889428931442e-05, + "loss": 0.0498, + "step": 4780 + }, + { + "epoch": 0.28120230127979334, + "grad_norm": 2.6248302459716797, + "learning_rate": 9.536952672473021e-05, + "loss": 0.0549, + "step": 4790 + }, + { + "epoch": 0.28178936245156744, + "grad_norm": 3.351979970932007, + "learning_rate": 9.535007174807066e-05, + "loss": 0.0536, + "step": 4800 + }, + { + "epoch": 0.28237642362334153, + "grad_norm": 1.2752174139022827, + "learning_rate": 9.533057797980696e-05, + "loss": 0.0547, + "step": 4810 + }, + { + "epoch": 0.2829634847951156, + "grad_norm": 1.991886854171753, + "learning_rate": 9.531104543661374e-05, + "loss": 0.0186, + "step": 4820 + }, + { + "epoch": 0.2835505459668898, + "grad_norm": 1.6250778436660767, + "learning_rate": 9.529147413519873e-05, + "loss": 0.0253, + "step": 4830 + }, + { + "epoch": 0.2841376071386639, + "grad_norm": 1.0819950103759766, + "learning_rate": 9.52718640923029e-05, + "loss": 0.0408, + "step": 4840 + }, + { + "epoch": 0.28472466831043797, + "grad_norm": 1.681856632232666, + "learning_rate": 9.525221532470029e-05, + "loss": 0.0326, + "step": 4850 + }, + { + "epoch": 0.28531172948221206, + "grad_norm": 1.9015501737594604, + "learning_rate": 9.523252784919809e-05, + "loss": 0.0996, + "step": 4860 + }, + { + "epoch": 0.28589879065398616, + "grad_norm": 1.970300555229187, + "learning_rate": 9.52128016826366e-05, + "loss": 0.0497, + "step": 4870 + }, + { + "epoch": 0.28648585182576025, + "grad_norm": 1.5843287706375122, + "learning_rate": 9.519303684188922e-05, + "loss": 0.0567, + "step": 4880 + }, + { + "epoch": 0.28707291299753435, + "grad_norm": 1.746987223625183, + "learning_rate": 9.517323334386244e-05, + "loss": 0.0338, + "step": 4890 + }, + { + "epoch": 0.28765997416930844, + "grad_norm": 1.4393055438995361, + "learning_rate": 9.515339120549576e-05, + "loss": 0.0309, + "step": 4900 + }, + { + "epoch": 0.28824703534108254, + "grad_norm": 2.5302774906158447, + "learning_rate": 9.513351044376182e-05, + "loss": 0.0284, + "step": 4910 + }, + { + "epoch": 0.28883409651285663, + "grad_norm": 1.0148005485534668, + "learning_rate": 9.51135910756662e-05, + "loss": 0.0376, + "step": 4920 + }, + { + "epoch": 0.2894211576846307, + "grad_norm": 3.0846927165985107, + "learning_rate": 9.509363311824761e-05, + "loss": 0.0443, + "step": 4930 + }, + { + "epoch": 0.2900082188564048, + "grad_norm": 2.7418041229248047, + "learning_rate": 9.507363658857768e-05, + 
"loss": 0.0341, + "step": 4940 + }, + { + "epoch": 0.2905952800281789, + "grad_norm": 3.865344762802124, + "learning_rate": 9.505360150376109e-05, + "loss": 0.0427, + "step": 4950 + }, + { + "epoch": 0.291182341199953, + "grad_norm": 2.493309736251831, + "learning_rate": 9.503352788093547e-05, + "loss": 0.04, + "step": 4960 + }, + { + "epoch": 0.29176940237172716, + "grad_norm": 1.4252429008483887, + "learning_rate": 9.501341573727141e-05, + "loss": 0.0347, + "step": 4970 + }, + { + "epoch": 0.29235646354350125, + "grad_norm": 2.8794472217559814, + "learning_rate": 9.499326508997246e-05, + "loss": 0.0255, + "step": 4980 + }, + { + "epoch": 0.29294352471527535, + "grad_norm": 2.3145668506622314, + "learning_rate": 9.497307595627511e-05, + "loss": 0.0423, + "step": 4990 + }, + { + "epoch": 0.29353058588704944, + "grad_norm": 2.2408437728881836, + "learning_rate": 9.495284835344879e-05, + "loss": 0.0254, + "step": 5000 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 3.4195361137390137, + "learning_rate": 9.49325822987958e-05, + "loss": 0.0522, + "step": 5010 + }, + { + "epoch": 0.29470470823059763, + "grad_norm": 2.4050040245056152, + "learning_rate": 9.491227780965136e-05, + "loss": 0.0453, + "step": 5020 + }, + { + "epoch": 0.29529176940237173, + "grad_norm": 4.142215251922607, + "learning_rate": 9.48919349033835e-05, + "loss": 0.0296, + "step": 5030 + }, + { + "epoch": 0.2958788305741458, + "grad_norm": 8.010381698608398, + "learning_rate": 9.487155359739321e-05, + "loss": 0.0479, + "step": 5040 + }, + { + "epoch": 0.2964658917459199, + "grad_norm": 1.5859040021896362, + "learning_rate": 9.485113390911427e-05, + "loss": 0.0325, + "step": 5050 + }, + { + "epoch": 0.297052952917694, + "grad_norm": 2.35192608833313, + "learning_rate": 9.483067585601327e-05, + "loss": 0.0547, + "step": 5060 + }, + { + "epoch": 0.2976400140894681, + "grad_norm": 2.094485282897949, + "learning_rate": 9.481017945558969e-05, + "loss": 0.0534, + "step": 5070 + }, + { + "epoch": 0.2982270752612422, + "grad_norm": 0.1226126030087471, + "learning_rate": 9.478964472537575e-05, + "loss": 0.0374, + "step": 5080 + }, + { + "epoch": 0.2988141364330163, + "grad_norm": 1.314904808998108, + "learning_rate": 9.476907168293646e-05, + "loss": 0.0504, + "step": 5090 + }, + { + "epoch": 0.2994011976047904, + "grad_norm": 1.3122048377990723, + "learning_rate": 9.474846034586964e-05, + "loss": 0.0447, + "step": 5100 + }, + { + "epoch": 0.29998825877656454, + "grad_norm": 2.8994345664978027, + "learning_rate": 9.472781073180582e-05, + "loss": 0.0457, + "step": 5110 + }, + { + "epoch": 0.30057531994833864, + "grad_norm": 1.1444993019104004, + "learning_rate": 9.47071228584083e-05, + "loss": 0.0401, + "step": 5120 + }, + { + "epoch": 0.30116238112011273, + "grad_norm": 2.2754225730895996, + "learning_rate": 9.468639674337312e-05, + "loss": 0.0383, + "step": 5130 + }, + { + "epoch": 0.3017494422918868, + "grad_norm": 2.3510823249816895, + "learning_rate": 9.466563240442901e-05, + "loss": 0.0406, + "step": 5140 + }, + { + "epoch": 0.3023365034636609, + "grad_norm": 2.2636609077453613, + "learning_rate": 9.464482985933736e-05, + "loss": 0.067, + "step": 5150 + }, + { + "epoch": 0.302923564635435, + "grad_norm": 2.818981647491455, + "learning_rate": 9.462398912589232e-05, + "loss": 0.0427, + "step": 5160 + }, + { + "epoch": 0.3035106258072091, + "grad_norm": 4.39980411529541, + "learning_rate": 9.460311022192064e-05, + "loss": 0.0729, + "step": 5170 + }, + { + "epoch": 0.3040976869789832, + "grad_norm": 3.873307704925537, + 
"learning_rate": 9.458219316528175e-05, + "loss": 0.0388, + "step": 5180 + }, + { + "epoch": 0.3046847481507573, + "grad_norm": 1.6399500370025635, + "learning_rate": 9.456123797386771e-05, + "loss": 0.0602, + "step": 5190 + }, + { + "epoch": 0.3052718093225314, + "grad_norm": 1.9389368295669556, + "learning_rate": 9.45402446656032e-05, + "loss": 0.0426, + "step": 5200 + }, + { + "epoch": 0.3058588704943055, + "grad_norm": 4.763432025909424, + "learning_rate": 9.451921325844551e-05, + "loss": 0.0593, + "step": 5210 + }, + { + "epoch": 0.3064459316660796, + "grad_norm": 2.46632981300354, + "learning_rate": 9.449814377038452e-05, + "loss": 0.039, + "step": 5220 + }, + { + "epoch": 0.3070329928378537, + "grad_norm": 2.234233856201172, + "learning_rate": 9.447703621944264e-05, + "loss": 0.0419, + "step": 5230 + }, + { + "epoch": 0.30762005400962783, + "grad_norm": 1.779569149017334, + "learning_rate": 9.445589062367491e-05, + "loss": 0.0483, + "step": 5240 + }, + { + "epoch": 0.3082071151814019, + "grad_norm": 2.168407917022705, + "learning_rate": 9.443470700116887e-05, + "loss": 0.0557, + "step": 5250 + }, + { + "epoch": 0.308794176353176, + "grad_norm": 1.59275221824646, + "learning_rate": 9.441348537004459e-05, + "loss": 0.0371, + "step": 5260 + }, + { + "epoch": 0.3093812375249501, + "grad_norm": 2.929985761642456, + "learning_rate": 9.439222574845465e-05, + "loss": 0.0408, + "step": 5270 + }, + { + "epoch": 0.3099682986967242, + "grad_norm": 2.250945806503296, + "learning_rate": 9.437092815458415e-05, + "loss": 0.0342, + "step": 5280 + }, + { + "epoch": 0.3105553598684983, + "grad_norm": 3.255429983139038, + "learning_rate": 9.434959260665064e-05, + "loss": 0.0642, + "step": 5290 + }, + { + "epoch": 0.3111424210402724, + "grad_norm": 1.4585903882980347, + "learning_rate": 9.432821912290414e-05, + "loss": 0.0609, + "step": 5300 + }, + { + "epoch": 0.3117294822120465, + "grad_norm": 2.1435985565185547, + "learning_rate": 9.430680772162716e-05, + "loss": 0.0427, + "step": 5310 + }, + { + "epoch": 0.3123165433838206, + "grad_norm": 2.1471781730651855, + "learning_rate": 9.428535842113459e-05, + "loss": 0.0386, + "step": 5320 + }, + { + "epoch": 0.3129036045555947, + "grad_norm": 2.391707420349121, + "learning_rate": 9.426387123977378e-05, + "loss": 0.0416, + "step": 5330 + }, + { + "epoch": 0.3134906657273688, + "grad_norm": 1.038680076599121, + "learning_rate": 9.424234619592442e-05, + "loss": 0.0342, + "step": 5340 + }, + { + "epoch": 0.3140777268991429, + "grad_norm": 4.7923808097839355, + "learning_rate": 9.422078330799868e-05, + "loss": 0.027, + "step": 5350 + }, + { + "epoch": 0.31466478807091697, + "grad_norm": 2.100338935852051, + "learning_rate": 9.419918259444104e-05, + "loss": 0.062, + "step": 5360 + }, + { + "epoch": 0.31525184924269106, + "grad_norm": 4.071483135223389, + "learning_rate": 9.417754407372832e-05, + "loss": 0.0329, + "step": 5370 + }, + { + "epoch": 0.3158389104144652, + "grad_norm": 1.9182132482528687, + "learning_rate": 9.415586776436973e-05, + "loss": 0.0441, + "step": 5380 + }, + { + "epoch": 0.3164259715862393, + "grad_norm": 2.150156259536743, + "learning_rate": 9.413415368490678e-05, + "loss": 0.0367, + "step": 5390 + }, + { + "epoch": 0.3170130327580134, + "grad_norm": 0.9288743734359741, + "learning_rate": 9.411240185391327e-05, + "loss": 0.077, + "step": 5400 + }, + { + "epoch": 0.3176000939297875, + "grad_norm": 3.840372323989868, + "learning_rate": 9.409061228999533e-05, + "loss": 0.034, + "step": 5410 + }, + { + "epoch": 0.3181871551015616, + 
"grad_norm": 1.9792379140853882, + "learning_rate": 9.406878501179135e-05, + "loss": 0.052, + "step": 5420 + }, + { + "epoch": 0.3187742162733357, + "grad_norm": 2.395099401473999, + "learning_rate": 9.404692003797196e-05, + "loss": 0.0381, + "step": 5430 + }, + { + "epoch": 0.3193612774451098, + "grad_norm": 4.288181781768799, + "learning_rate": 9.402501738724004e-05, + "loss": 0.0565, + "step": 5440 + }, + { + "epoch": 0.3199483386168839, + "grad_norm": 1.6563748121261597, + "learning_rate": 9.400307707833074e-05, + "loss": 0.0548, + "step": 5450 + }, + { + "epoch": 0.32053539978865797, + "grad_norm": 1.2033774852752686, + "learning_rate": 9.398109913001136e-05, + "loss": 0.0386, + "step": 5460 + }, + { + "epoch": 0.32112246096043207, + "grad_norm": 1.8615103960037231, + "learning_rate": 9.395908356108145e-05, + "loss": 0.074, + "step": 5470 + }, + { + "epoch": 0.32170952213220616, + "grad_norm": 1.8725007772445679, + "learning_rate": 9.393703039037269e-05, + "loss": 0.0544, + "step": 5480 + }, + { + "epoch": 0.32229658330398026, + "grad_norm": 0.7631556987762451, + "learning_rate": 9.391493963674899e-05, + "loss": 0.0394, + "step": 5490 + }, + { + "epoch": 0.32288364447575435, + "grad_norm": 2.4178078174591064, + "learning_rate": 9.389281131910633e-05, + "loss": 0.0319, + "step": 5500 + }, + { + "epoch": 0.32347070564752844, + "grad_norm": 2.9834976196289062, + "learning_rate": 9.387064545637287e-05, + "loss": 0.073, + "step": 5510 + }, + { + "epoch": 0.3240577668193026, + "grad_norm": 0.6918954849243164, + "learning_rate": 9.384844206750889e-05, + "loss": 0.0328, + "step": 5520 + }, + { + "epoch": 0.3246448279910767, + "grad_norm": 1.4635518789291382, + "learning_rate": 9.382620117150673e-05, + "loss": 0.0397, + "step": 5530 + }, + { + "epoch": 0.3252318891628508, + "grad_norm": 1.4960331916809082, + "learning_rate": 9.380392278739085e-05, + "loss": 0.0516, + "step": 5540 + }, + { + "epoch": 0.3258189503346249, + "grad_norm": 1.3857059478759766, + "learning_rate": 9.378160693421778e-05, + "loss": 0.0373, + "step": 5550 + }, + { + "epoch": 0.326406011506399, + "grad_norm": 6.356856822967529, + "learning_rate": 9.375925363107604e-05, + "loss": 0.0595, + "step": 5560 + }, + { + "epoch": 0.32699307267817307, + "grad_norm": 1.6493016481399536, + "learning_rate": 9.373686289708629e-05, + "loss": 0.07, + "step": 5570 + }, + { + "epoch": 0.32758013384994716, + "grad_norm": 1.416532278060913, + "learning_rate": 9.371443475140108e-05, + "loss": 0.0448, + "step": 5580 + }, + { + "epoch": 0.32816719502172126, + "grad_norm": 1.5396021604537964, + "learning_rate": 9.369196921320506e-05, + "loss": 0.0352, + "step": 5590 + }, + { + "epoch": 0.32875425619349535, + "grad_norm": 2.6996428966522217, + "learning_rate": 9.366946630171485e-05, + "loss": 0.069, + "step": 5600 + }, + { + "epoch": 0.32934131736526945, + "grad_norm": 1.9140021800994873, + "learning_rate": 9.364692603617899e-05, + "loss": 0.0608, + "step": 5610 + }, + { + "epoch": 0.32992837853704354, + "grad_norm": 0.6642828583717346, + "learning_rate": 9.3624348435878e-05, + "loss": 0.0351, + "step": 5620 + }, + { + "epoch": 0.33051543970881764, + "grad_norm": 2.6053738594055176, + "learning_rate": 9.360173352012436e-05, + "loss": 0.0583, + "step": 5630 + }, + { + "epoch": 0.33110250088059173, + "grad_norm": 2.699401378631592, + "learning_rate": 9.357908130826243e-05, + "loss": 0.0466, + "step": 5640 + }, + { + "epoch": 0.3316895620523659, + "grad_norm": 3.195594549179077, + "learning_rate": 9.355639181966849e-05, + "loss": 0.0535, + "step": 
5650 + }, + { + "epoch": 0.33227662322414, + "grad_norm": 2.714934825897217, + "learning_rate": 9.353366507375072e-05, + "loss": 0.052, + "step": 5660 + }, + { + "epoch": 0.3328636843959141, + "grad_norm": 1.0885695219039917, + "learning_rate": 9.351090108994913e-05, + "loss": 0.0692, + "step": 5670 + }, + { + "epoch": 0.33345074556768817, + "grad_norm": 3.0591847896575928, + "learning_rate": 9.348809988773564e-05, + "loss": 0.05, + "step": 5680 + }, + { + "epoch": 0.33403780673946226, + "grad_norm": 2.081336259841919, + "learning_rate": 9.346526148661392e-05, + "loss": 0.048, + "step": 5690 + }, + { + "epoch": 0.33462486791123636, + "grad_norm": 1.2867697477340698, + "learning_rate": 9.344238590611955e-05, + "loss": 0.0555, + "step": 5700 + }, + { + "epoch": 0.33521192908301045, + "grad_norm": 3.112689733505249, + "learning_rate": 9.341947316581989e-05, + "loss": 0.0373, + "step": 5710 + }, + { + "epoch": 0.33579899025478455, + "grad_norm": 4.347721099853516, + "learning_rate": 9.339652328531403e-05, + "loss": 0.0418, + "step": 5720 + }, + { + "epoch": 0.33638605142655864, + "grad_norm": 2.7465453147888184, + "learning_rate": 9.337353628423288e-05, + "loss": 0.0381, + "step": 5730 + }, + { + "epoch": 0.33697311259833274, + "grad_norm": 1.6170954704284668, + "learning_rate": 9.335051218223912e-05, + "loss": 0.0579, + "step": 5740 + }, + { + "epoch": 0.33756017377010683, + "grad_norm": 5.724897384643555, + "learning_rate": 9.332745099902709e-05, + "loss": 0.0482, + "step": 5750 + }, + { + "epoch": 0.3381472349418809, + "grad_norm": 3.405139684677124, + "learning_rate": 9.330435275432293e-05, + "loss": 0.0331, + "step": 5760 + }, + { + "epoch": 0.338734296113655, + "grad_norm": 3.682948350906372, + "learning_rate": 9.328121746788444e-05, + "loss": 0.0878, + "step": 5770 + }, + { + "epoch": 0.3393213572854291, + "grad_norm": 4.555486679077148, + "learning_rate": 9.325804515950109e-05, + "loss": 0.0339, + "step": 5780 + }, + { + "epoch": 0.33990841845720327, + "grad_norm": 3.4030396938323975, + "learning_rate": 9.323483584899409e-05, + "loss": 0.0442, + "step": 5790 + }, + { + "epoch": 0.34049547962897736, + "grad_norm": 1.8589022159576416, + "learning_rate": 9.321158955621621e-05, + "loss": 0.0286, + "step": 5800 + }, + { + "epoch": 0.34108254080075145, + "grad_norm": 2.960686445236206, + "learning_rate": 9.318830630105188e-05, + "loss": 0.0522, + "step": 5810 + }, + { + "epoch": 0.34166960197252555, + "grad_norm": 3.721616744995117, + "learning_rate": 9.31649861034172e-05, + "loss": 0.0385, + "step": 5820 + }, + { + "epoch": 0.34225666314429964, + "grad_norm": 1.4774328470230103, + "learning_rate": 9.314162898325981e-05, + "loss": 0.058, + "step": 5830 + }, + { + "epoch": 0.34284372431607374, + "grad_norm": 1.2000024318695068, + "learning_rate": 9.311823496055896e-05, + "loss": 0.0348, + "step": 5840 + }, + { + "epoch": 0.34343078548784783, + "grad_norm": 2.8643956184387207, + "learning_rate": 9.309480405532547e-05, + "loss": 0.0294, + "step": 5850 + }, + { + "epoch": 0.34401784665962193, + "grad_norm": 1.839704990386963, + "learning_rate": 9.307133628760168e-05, + "loss": 0.0514, + "step": 5860 + }, + { + "epoch": 0.344604907831396, + "grad_norm": 1.7238141298294067, + "learning_rate": 9.30478316774615e-05, + "loss": 0.0361, + "step": 5870 + }, + { + "epoch": 0.3451919690031701, + "grad_norm": 1.615378975868225, + "learning_rate": 9.302429024501031e-05, + "loss": 0.0633, + "step": 5880 + }, + { + "epoch": 0.3457790301749442, + "grad_norm": 2.1228907108306885, + "learning_rate": 
9.300071201038503e-05, + "loss": 0.0517, + "step": 5890 + }, + { + "epoch": 0.3463660913467183, + "grad_norm": 3.311354637145996, + "learning_rate": 9.297709699375403e-05, + "loss": 0.0399, + "step": 5900 + }, + { + "epoch": 0.3469531525184924, + "grad_norm": 2.3087093830108643, + "learning_rate": 9.295344521531717e-05, + "loss": 0.0353, + "step": 5910 + }, + { + "epoch": 0.34754021369026655, + "grad_norm": 2.478135108947754, + "learning_rate": 9.292975669530573e-05, + "loss": 0.0482, + "step": 5920 + }, + { + "epoch": 0.34812727486204065, + "grad_norm": 1.8561735153198242, + "learning_rate": 9.290603145398243e-05, + "loss": 0.0275, + "step": 5930 + }, + { + "epoch": 0.34871433603381474, + "grad_norm": 0.9207778573036194, + "learning_rate": 9.288226951164138e-05, + "loss": 0.0306, + "step": 5940 + }, + { + "epoch": 0.34930139720558884, + "grad_norm": 1.6693058013916016, + "learning_rate": 9.285847088860813e-05, + "loss": 0.0386, + "step": 5950 + }, + { + "epoch": 0.34988845837736293, + "grad_norm": 2.181859254837036, + "learning_rate": 9.283463560523956e-05, + "loss": 0.0371, + "step": 5960 + }, + { + "epoch": 0.350475519549137, + "grad_norm": 1.718574047088623, + "learning_rate": 9.281076368192392e-05, + "loss": 0.067, + "step": 5970 + }, + { + "epoch": 0.3510625807209111, + "grad_norm": 2.0255677700042725, + "learning_rate": 9.278685513908083e-05, + "loss": 0.0326, + "step": 5980 + }, + { + "epoch": 0.3516496418926852, + "grad_norm": 0.9796348214149475, + "learning_rate": 9.276290999716119e-05, + "loss": 0.0487, + "step": 5990 + }, + { + "epoch": 0.3522367030644593, + "grad_norm": 1.1461082696914673, + "learning_rate": 9.273892827664725e-05, + "loss": 0.0438, + "step": 6000 + }, + { + "epoch": 0.3522367030644593, + "eval_loss": 0.44839778542518616, + "eval_runtime": 269.5068, + "eval_samples_per_second": 3.506, + "eval_steps_per_second": 3.506, + "step": 6000 + }, + { + "epoch": 0.3528237642362334, + "grad_norm": 5.4352521896362305, + "learning_rate": 9.27149099980525e-05, + "loss": 0.042, + "step": 6010 + }, + { + "epoch": 0.3534108254080075, + "grad_norm": 1.9308388233184814, + "learning_rate": 9.269085518192175e-05, + "loss": 0.0837, + "step": 6020 + }, + { + "epoch": 0.3539978865797816, + "grad_norm": 1.8994724750518799, + "learning_rate": 9.266676384883101e-05, + "loss": 0.0508, + "step": 6030 + }, + { + "epoch": 0.3545849477515557, + "grad_norm": 1.7783081531524658, + "learning_rate": 9.264263601938759e-05, + "loss": 0.0614, + "step": 6040 + }, + { + "epoch": 0.3551720089233298, + "grad_norm": 2.032818555831909, + "learning_rate": 9.261847171422996e-05, + "loss": 0.0293, + "step": 6050 + }, + { + "epoch": 0.35575907009510394, + "grad_norm": 3.544628381729126, + "learning_rate": 9.259427095402782e-05, + "loss": 0.069, + "step": 6060 + }, + { + "epoch": 0.35634613126687803, + "grad_norm": 3.1308882236480713, + "learning_rate": 9.257003375948207e-05, + "loss": 0.0309, + "step": 6070 + }, + { + "epoch": 0.3569331924386521, + "grad_norm": 1.8078138828277588, + "learning_rate": 9.254576015132473e-05, + "loss": 0.0441, + "step": 6080 + }, + { + "epoch": 0.3575202536104262, + "grad_norm": 4.272693157196045, + "learning_rate": 9.252145015031899e-05, + "loss": 0.0871, + "step": 6090 + }, + { + "epoch": 0.3581073147822003, + "grad_norm": 1.6641491651535034, + "learning_rate": 9.249710377725917e-05, + "loss": 0.0547, + "step": 6100 + }, + { + "epoch": 0.3586943759539744, + "grad_norm": 1.8841357231140137, + "learning_rate": 9.247272105297074e-05, + "loss": 0.0542, + "step": 6110 + }, + { + 
"epoch": 0.3592814371257485, + "grad_norm": 2.427034378051758, + "learning_rate": 9.244830199831016e-05, + "loss": 0.0411, + "step": 6120 + }, + { + "epoch": 0.3598684982975226, + "grad_norm": 3.289883613586426, + "learning_rate": 9.24238466341651e-05, + "loss": 0.0389, + "step": 6130 + }, + { + "epoch": 0.3604555594692967, + "grad_norm": 3.284317970275879, + "learning_rate": 9.239935498145418e-05, + "loss": 0.0446, + "step": 6140 + }, + { + "epoch": 0.3610426206410708, + "grad_norm": 2.9233837127685547, + "learning_rate": 9.237482706112712e-05, + "loss": 0.0341, + "step": 6150 + }, + { + "epoch": 0.3616296818128449, + "grad_norm": 2.011007308959961, + "learning_rate": 9.235026289416463e-05, + "loss": 0.0177, + "step": 6160 + }, + { + "epoch": 0.362216742984619, + "grad_norm": 1.3153204917907715, + "learning_rate": 9.232566250157845e-05, + "loss": 0.0493, + "step": 6170 + }, + { + "epoch": 0.3628038041563931, + "grad_norm": 1.6673120260238647, + "learning_rate": 9.23010259044113e-05, + "loss": 0.0307, + "step": 6180 + }, + { + "epoch": 0.36339086532816717, + "grad_norm": 2.577718734741211, + "learning_rate": 9.227635312373686e-05, + "loss": 0.0457, + "step": 6190 + }, + { + "epoch": 0.3639779264999413, + "grad_norm": 1.3342965841293335, + "learning_rate": 9.225164418065976e-05, + "loss": 0.0247, + "step": 6200 + }, + { + "epoch": 0.3645649876717154, + "grad_norm": 3.58207631111145, + "learning_rate": 9.222689909631557e-05, + "loss": 0.0608, + "step": 6210 + }, + { + "epoch": 0.3651520488434895, + "grad_norm": 5.798762798309326, + "learning_rate": 9.220211789187078e-05, + "loss": 0.047, + "step": 6220 + }, + { + "epoch": 0.3657391100152636, + "grad_norm": 2.2851603031158447, + "learning_rate": 9.217730058852276e-05, + "loss": 0.0401, + "step": 6230 + }, + { + "epoch": 0.3663261711870377, + "grad_norm": 1.917502999305725, + "learning_rate": 9.215244720749979e-05, + "loss": 0.0406, + "step": 6240 + }, + { + "epoch": 0.3669132323588118, + "grad_norm": 3.289642095565796, + "learning_rate": 9.212755777006097e-05, + "loss": 0.0531, + "step": 6250 + }, + { + "epoch": 0.3675002935305859, + "grad_norm": 2.104628801345825, + "learning_rate": 9.210263229749626e-05, + "loss": 0.0447, + "step": 6260 + }, + { + "epoch": 0.36808735470236, + "grad_norm": 2.4235751628875732, + "learning_rate": 9.207767081112642e-05, + "loss": 0.0956, + "step": 6270 + }, + { + "epoch": 0.3686744158741341, + "grad_norm": 2.3653531074523926, + "learning_rate": 9.20526733323031e-05, + "loss": 0.0709, + "step": 6280 + }, + { + "epoch": 0.36926147704590817, + "grad_norm": 2.998399257659912, + "learning_rate": 9.202763988240861e-05, + "loss": 0.0372, + "step": 6290 + }, + { + "epoch": 0.36984853821768227, + "grad_norm": 1.8805534839630127, + "learning_rate": 9.200257048285615e-05, + "loss": 0.0451, + "step": 6300 + }, + { + "epoch": 0.37043559938945636, + "grad_norm": 0.5904949903488159, + "learning_rate": 9.197746515508955e-05, + "loss": 0.0405, + "step": 6310 + }, + { + "epoch": 0.37102266056123046, + "grad_norm": 2.2496538162231445, + "learning_rate": 9.195232392058353e-05, + "loss": 0.0553, + "step": 6320 + }, + { + "epoch": 0.3716097217330046, + "grad_norm": 1.8346590995788574, + "learning_rate": 9.192714680084336e-05, + "loss": 0.0338, + "step": 6330 + }, + { + "epoch": 0.3721967829047787, + "grad_norm": 1.4481120109558105, + "learning_rate": 9.19019338174051e-05, + "loss": 0.0387, + "step": 6340 + }, + { + "epoch": 0.3727838440765528, + "grad_norm": 1.3911155462265015, + "learning_rate": 9.187668499183546e-05, + "loss": 
0.0384, + "step": 6350 + }, + { + "epoch": 0.3733709052483269, + "grad_norm": 4.437960624694824, + "learning_rate": 9.185140034573182e-05, + "loss": 0.0691, + "step": 6360 + }, + { + "epoch": 0.373957966420101, + "grad_norm": 1.329298734664917, + "learning_rate": 9.182607990072221e-05, + "loss": 0.054, + "step": 6370 + }, + { + "epoch": 0.3745450275918751, + "grad_norm": 1.1170673370361328, + "learning_rate": 9.180072367846523e-05, + "loss": 0.0598, + "step": 6380 + }, + { + "epoch": 0.3751320887636492, + "grad_norm": 2.182213068008423, + "learning_rate": 9.177533170065014e-05, + "loss": 0.0373, + "step": 6390 + }, + { + "epoch": 0.37571914993542327, + "grad_norm": 2.554008960723877, + "learning_rate": 9.174990398899677e-05, + "loss": 0.0936, + "step": 6400 + }, + { + "epoch": 0.37630621110719736, + "grad_norm": 1.768218994140625, + "learning_rate": 9.172444056525549e-05, + "loss": 0.0455, + "step": 6410 + }, + { + "epoch": 0.37689327227897146, + "grad_norm": 1.6359037160873413, + "learning_rate": 9.169894145120725e-05, + "loss": 0.0367, + "step": 6420 + }, + { + "epoch": 0.37748033345074555, + "grad_norm": 1.5694235563278198, + "learning_rate": 9.167340666866351e-05, + "loss": 0.0623, + "step": 6430 + }, + { + "epoch": 0.37806739462251965, + "grad_norm": 3.2600700855255127, + "learning_rate": 9.164783623946626e-05, + "loss": 0.0541, + "step": 6440 + }, + { + "epoch": 0.37865445579429374, + "grad_norm": 2.224238157272339, + "learning_rate": 9.162223018548795e-05, + "loss": 0.036, + "step": 6450 + }, + { + "epoch": 0.37924151696606784, + "grad_norm": 3.2793667316436768, + "learning_rate": 9.15965885286315e-05, + "loss": 0.0601, + "step": 6460 + }, + { + "epoch": 0.379828578137842, + "grad_norm": 0.3255864083766937, + "learning_rate": 9.157091129083037e-05, + "loss": 0.0454, + "step": 6470 + }, + { + "epoch": 0.3804156393096161, + "grad_norm": 4.192474842071533, + "learning_rate": 9.154519849404834e-05, + "loss": 0.0481, + "step": 6480 + }, + { + "epoch": 0.3810027004813902, + "grad_norm": 2.8192543983459473, + "learning_rate": 9.151945016027965e-05, + "loss": 0.0426, + "step": 6490 + }, + { + "epoch": 0.38158976165316427, + "grad_norm": 3.663104772567749, + "learning_rate": 9.149366631154899e-05, + "loss": 0.0485, + "step": 6500 + }, + { + "epoch": 0.38217682282493837, + "grad_norm": 2.2148454189300537, + "learning_rate": 9.146784696991132e-05, + "loss": 0.0432, + "step": 6510 + }, + { + "epoch": 0.38276388399671246, + "grad_norm": 0.6129756569862366, + "learning_rate": 9.144199215745206e-05, + "loss": 0.0525, + "step": 6520 + }, + { + "epoch": 0.38335094516848656, + "grad_norm": 3.0547399520874023, + "learning_rate": 9.141610189628695e-05, + "loss": 0.0611, + "step": 6530 + }, + { + "epoch": 0.38393800634026065, + "grad_norm": 1.918065071105957, + "learning_rate": 9.1390176208562e-05, + "loss": 0.0419, + "step": 6540 + }, + { + "epoch": 0.38452506751203475, + "grad_norm": 2.8584654331207275, + "learning_rate": 9.136421511645357e-05, + "loss": 0.0435, + "step": 6550 + }, + { + "epoch": 0.38511212868380884, + "grad_norm": 2.5423402786254883, + "learning_rate": 9.133821864216829e-05, + "loss": 0.0523, + "step": 6560 + }, + { + "epoch": 0.38569918985558294, + "grad_norm": 3.4923906326293945, + "learning_rate": 9.131218680794308e-05, + "loss": 0.045, + "step": 6570 + }, + { + "epoch": 0.38628625102735703, + "grad_norm": 1.9875587224960327, + "learning_rate": 9.128611963604507e-05, + "loss": 0.0555, + "step": 6580 + }, + { + "epoch": 0.3868733121991311, + "grad_norm": 2.863887071609497, + 
"learning_rate": 9.126001714877161e-05, + "loss": 0.0502, + "step": 6590 + }, + { + "epoch": 0.3874603733709053, + "grad_norm": 3.5061120986938477, + "learning_rate": 9.123387936845032e-05, + "loss": 0.0422, + "step": 6600 + }, + { + "epoch": 0.38804743454267937, + "grad_norm": 2.144218921661377, + "learning_rate": 9.120770631743894e-05, + "loss": 0.0693, + "step": 6610 + }, + { + "epoch": 0.38863449571445347, + "grad_norm": 2.683347225189209, + "learning_rate": 9.118149801812543e-05, + "loss": 0.038, + "step": 6620 + }, + { + "epoch": 0.38922155688622756, + "grad_norm": 0.9793893098831177, + "learning_rate": 9.115525449292786e-05, + "loss": 0.0708, + "step": 6630 + }, + { + "epoch": 0.38980861805800165, + "grad_norm": 2.2941360473632812, + "learning_rate": 9.112897576429446e-05, + "loss": 0.0376, + "step": 6640 + }, + { + "epoch": 0.39039567922977575, + "grad_norm": 2.4669976234436035, + "learning_rate": 9.110266185470358e-05, + "loss": 0.0649, + "step": 6650 + }, + { + "epoch": 0.39098274040154984, + "grad_norm": 1.426784634590149, + "learning_rate": 9.10763127866636e-05, + "loss": 0.0546, + "step": 6660 + }, + { + "epoch": 0.39156980157332394, + "grad_norm": 1.440475583076477, + "learning_rate": 9.104992858271307e-05, + "loss": 0.0359, + "step": 6670 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.6029365062713623, + "learning_rate": 9.102350926542051e-05, + "loss": 0.0472, + "step": 6680 + }, + { + "epoch": 0.39274392391687213, + "grad_norm": 2.413081645965576, + "learning_rate": 9.099705485738454e-05, + "loss": 0.0334, + "step": 6690 + }, + { + "epoch": 0.3933309850886462, + "grad_norm": 2.308234214782715, + "learning_rate": 9.097056538123376e-05, + "loss": 0.0515, + "step": 6700 + }, + { + "epoch": 0.3939180462604203, + "grad_norm": 1.7056807279586792, + "learning_rate": 9.094404085962676e-05, + "loss": 0.0325, + "step": 6710 + }, + { + "epoch": 0.3945051074321944, + "grad_norm": 2.2686026096343994, + "learning_rate": 9.091748131525212e-05, + "loss": 0.0264, + "step": 6720 + }, + { + "epoch": 0.3950921686039685, + "grad_norm": 3.3668978214263916, + "learning_rate": 9.089088677082838e-05, + "loss": 0.03, + "step": 6730 + }, + { + "epoch": 0.39567922977574266, + "grad_norm": 2.45164155960083, + "learning_rate": 9.086425724910403e-05, + "loss": 0.0306, + "step": 6740 + }, + { + "epoch": 0.39626629094751675, + "grad_norm": 1.7831369638442993, + "learning_rate": 9.083759277285745e-05, + "loss": 0.053, + "step": 6750 + }, + { + "epoch": 0.39685335211929085, + "grad_norm": 0.3770430386066437, + "learning_rate": 9.081089336489694e-05, + "loss": 0.0267, + "step": 6760 + }, + { + "epoch": 0.39744041329106494, + "grad_norm": 2.116936206817627, + "learning_rate": 9.078415904806068e-05, + "loss": 0.0644, + "step": 6770 + }, + { + "epoch": 0.39802747446283904, + "grad_norm": 2.9683051109313965, + "learning_rate": 9.07573898452167e-05, + "loss": 0.0488, + "step": 6780 + }, + { + "epoch": 0.39861453563461313, + "grad_norm": 1.8044674396514893, + "learning_rate": 9.073058577926287e-05, + "loss": 0.0438, + "step": 6790 + }, + { + "epoch": 0.3992015968063872, + "grad_norm": 1.5561860799789429, + "learning_rate": 9.070374687312689e-05, + "loss": 0.052, + "step": 6800 + }, + { + "epoch": 0.3997886579781613, + "grad_norm": 2.254608631134033, + "learning_rate": 9.067687314976627e-05, + "loss": 0.0403, + "step": 6810 + }, + { + "epoch": 0.4003757191499354, + "grad_norm": 2.078115463256836, + "learning_rate": 9.064996463216828e-05, + "loss": 0.0319, + "step": 6820 + }, + { + "epoch": 
0.4009627803217095, + "grad_norm": 2.6184616088867188, + "learning_rate": 9.062302134334998e-05, + "loss": 0.0419, + "step": 6830 + }, + { + "epoch": 0.4015498414934836, + "grad_norm": 1.869310975074768, + "learning_rate": 9.059604330635813e-05, + "loss": 0.0411, + "step": 6840 + }, + { + "epoch": 0.4021369026652577, + "grad_norm": 1.1796845197677612, + "learning_rate": 9.056903054426927e-05, + "loss": 0.0379, + "step": 6850 + }, + { + "epoch": 0.4027239638370318, + "grad_norm": 1.6515157222747803, + "learning_rate": 9.054198308018957e-05, + "loss": 0.0372, + "step": 6860 + }, + { + "epoch": 0.4033110250088059, + "grad_norm": 1.957228183746338, + "learning_rate": 9.051490093725494e-05, + "loss": 0.0495, + "step": 6870 + }, + { + "epoch": 0.40389808618058004, + "grad_norm": 1.7962557077407837, + "learning_rate": 9.048778413863097e-05, + "loss": 0.0314, + "step": 6880 + }, + { + "epoch": 0.40448514735235414, + "grad_norm": 2.7375149726867676, + "learning_rate": 9.046063270751283e-05, + "loss": 0.0635, + "step": 6890 + }, + { + "epoch": 0.40507220852412823, + "grad_norm": 2.7061636447906494, + "learning_rate": 9.043344666712537e-05, + "loss": 0.069, + "step": 6900 + }, + { + "epoch": 0.4056592696959023, + "grad_norm": 5.234796047210693, + "learning_rate": 9.0406226040723e-05, + "loss": 0.0421, + "step": 6910 + }, + { + "epoch": 0.4062463308676764, + "grad_norm": 3.509838581085205, + "learning_rate": 9.037897085158976e-05, + "loss": 0.0471, + "step": 6920 + }, + { + "epoch": 0.4068333920394505, + "grad_norm": 2.576282024383545, + "learning_rate": 9.03516811230392e-05, + "loss": 0.0873, + "step": 6930 + }, + { + "epoch": 0.4074204532112246, + "grad_norm": 2.2423412799835205, + "learning_rate": 9.032435687841445e-05, + "loss": 0.0394, + "step": 6940 + }, + { + "epoch": 0.4080075143829987, + "grad_norm": 3.413788318634033, + "learning_rate": 9.029699814108818e-05, + "loss": 0.0897, + "step": 6950 + }, + { + "epoch": 0.4085945755547728, + "grad_norm": 3.610041618347168, + "learning_rate": 9.026960493446252e-05, + "loss": 0.0648, + "step": 6960 + }, + { + "epoch": 0.4091816367265469, + "grad_norm": 3.931255340576172, + "learning_rate": 9.024217728196913e-05, + "loss": 0.0553, + "step": 6970 + }, + { + "epoch": 0.409768697898321, + "grad_norm": 1.4837568998336792, + "learning_rate": 9.02147152070691e-05, + "loss": 0.0418, + "step": 6980 + }, + { + "epoch": 0.4103557590700951, + "grad_norm": 4.453707695007324, + "learning_rate": 9.018721873325295e-05, + "loss": 0.0511, + "step": 6990 + }, + { + "epoch": 0.4109428202418692, + "grad_norm": 4.750669479370117, + "learning_rate": 9.015968788404069e-05, + "loss": 0.061, + "step": 7000 + }, + { + "epoch": 0.41152988141364333, + "grad_norm": 1.7518789768218994, + "learning_rate": 9.013212268298168e-05, + "loss": 0.0347, + "step": 7010 + }, + { + "epoch": 0.4121169425854174, + "grad_norm": 1.1140742301940918, + "learning_rate": 9.010452315365466e-05, + "loss": 0.0689, + "step": 7020 + }, + { + "epoch": 0.4127040037571915, + "grad_norm": 1.4189730882644653, + "learning_rate": 9.007688931966778e-05, + "loss": 0.0551, + "step": 7030 + }, + { + "epoch": 0.4132910649289656, + "grad_norm": 2.33185076713562, + "learning_rate": 9.004922120465849e-05, + "loss": 0.0578, + "step": 7040 + }, + { + "epoch": 0.4138781261007397, + "grad_norm": 2.9722962379455566, + "learning_rate": 9.00215188322936e-05, + "loss": 0.0406, + "step": 7050 + }, + { + "epoch": 0.4144651872725138, + "grad_norm": 3.8774149417877197, + "learning_rate": 8.999378222626915e-05, + "loss": 0.0435, + 
"step": 7060 + }, + { + "epoch": 0.4150522484442879, + "grad_norm": 1.6501243114471436, + "learning_rate": 8.996601141031056e-05, + "loss": 0.029, + "step": 7070 + }, + { + "epoch": 0.415639309616062, + "grad_norm": 4.021002769470215, + "learning_rate": 8.993820640817246e-05, + "loss": 0.07, + "step": 7080 + }, + { + "epoch": 0.4162263707878361, + "grad_norm": 3.8996949195861816, + "learning_rate": 8.991036724363872e-05, + "loss": 0.0368, + "step": 7090 + }, + { + "epoch": 0.4168134319596102, + "grad_norm": 0.3041614592075348, + "learning_rate": 8.988249394052247e-05, + "loss": 0.033, + "step": 7100 + }, + { + "epoch": 0.4174004931313843, + "grad_norm": 4.124356269836426, + "learning_rate": 8.985458652266595e-05, + "loss": 0.0487, + "step": 7110 + }, + { + "epoch": 0.41798755430315837, + "grad_norm": 3.1162426471710205, + "learning_rate": 8.98266450139407e-05, + "loss": 0.0765, + "step": 7120 + }, + { + "epoch": 0.41857461547493247, + "grad_norm": 1.3574343919754028, + "learning_rate": 8.979866943824735e-05, + "loss": 0.049, + "step": 7130 + }, + { + "epoch": 0.41916167664670656, + "grad_norm": 4.351559638977051, + "learning_rate": 8.977065981951566e-05, + "loss": 0.044, + "step": 7140 + }, + { + "epoch": 0.4197487378184807, + "grad_norm": 3.6652841567993164, + "learning_rate": 8.974261618170459e-05, + "loss": 0.0481, + "step": 7150 + }, + { + "epoch": 0.4203357989902548, + "grad_norm": 2.909571409225464, + "learning_rate": 8.97145385488021e-05, + "loss": 0.0596, + "step": 7160 + }, + { + "epoch": 0.4209228601620289, + "grad_norm": 2.387852668762207, + "learning_rate": 8.968642694482527e-05, + "loss": 0.0454, + "step": 7170 + }, + { + "epoch": 0.421509921333803, + "grad_norm": 3.3645102977752686, + "learning_rate": 8.965828139382026e-05, + "loss": 0.0509, + "step": 7180 + }, + { + "epoch": 0.4220969825055771, + "grad_norm": 3.189063787460327, + "learning_rate": 8.963010191986225e-05, + "loss": 0.0626, + "step": 7190 + }, + { + "epoch": 0.4226840436773512, + "grad_norm": 4.78650426864624, + "learning_rate": 8.960188854705543e-05, + "loss": 0.0828, + "step": 7200 + }, + { + "epoch": 0.4232711048491253, + "grad_norm": 3.422757625579834, + "learning_rate": 8.957364129953297e-05, + "loss": 0.0424, + "step": 7210 + }, + { + "epoch": 0.4238581660208994, + "grad_norm": 2.0187454223632812, + "learning_rate": 8.954536020145708e-05, + "loss": 0.04, + "step": 7220 + }, + { + "epoch": 0.42444522719267347, + "grad_norm": 1.3502254486083984, + "learning_rate": 8.951704527701883e-05, + "loss": 0.0317, + "step": 7230 + }, + { + "epoch": 0.42503228836444756, + "grad_norm": 1.648254156112671, + "learning_rate": 8.948869655043835e-05, + "loss": 0.0397, + "step": 7240 + }, + { + "epoch": 0.42561934953622166, + "grad_norm": 6.5923261642456055, + "learning_rate": 8.946031404596453e-05, + "loss": 0.0565, + "step": 7250 + }, + { + "epoch": 0.42620641070799575, + "grad_norm": 2.213615655899048, + "learning_rate": 8.943189778787528e-05, + "loss": 0.051, + "step": 7260 + }, + { + "epoch": 0.42679347187976985, + "grad_norm": 1.5767220258712769, + "learning_rate": 8.940344780047736e-05, + "loss": 0.0733, + "step": 7270 + }, + { + "epoch": 0.427380533051544, + "grad_norm": 2.043565273284912, + "learning_rate": 8.937496410810631e-05, + "loss": 0.0604, + "step": 7280 + }, + { + "epoch": 0.4279675942233181, + "grad_norm": 0.9879446625709534, + "learning_rate": 8.934644673512656e-05, + "loss": 0.0681, + "step": 7290 + }, + { + "epoch": 0.4285546553950922, + "grad_norm": 2.74680757522583, + "learning_rate": 
8.931789570593134e-05, + "loss": 0.0515, + "step": 7300 + }, + { + "epoch": 0.4291417165668663, + "grad_norm": 2.332568645477295, + "learning_rate": 8.928931104494267e-05, + "loss": 0.0681, + "step": 7310 + }, + { + "epoch": 0.4297287777386404, + "grad_norm": 1.6378368139266968, + "learning_rate": 8.926069277661134e-05, + "loss": 0.0606, + "step": 7320 + }, + { + "epoch": 0.43031583891041447, + "grad_norm": 3.3819258213043213, + "learning_rate": 8.923204092541688e-05, + "loss": 0.041, + "step": 7330 + }, + { + "epoch": 0.43090290008218857, + "grad_norm": 1.0741076469421387, + "learning_rate": 8.920335551586755e-05, + "loss": 0.054, + "step": 7340 + }, + { + "epoch": 0.43148996125396266, + "grad_norm": 2.399531602859497, + "learning_rate": 8.91746365725003e-05, + "loss": 0.0392, + "step": 7350 + }, + { + "epoch": 0.43207702242573676, + "grad_norm": 2.9005095958709717, + "learning_rate": 8.914588411988078e-05, + "loss": 0.0507, + "step": 7360 + }, + { + "epoch": 0.43266408359751085, + "grad_norm": 2.7736499309539795, + "learning_rate": 8.911709818260333e-05, + "loss": 0.0509, + "step": 7370 + }, + { + "epoch": 0.43325114476928495, + "grad_norm": 2.2151551246643066, + "learning_rate": 8.908827878529087e-05, + "loss": 0.0344, + "step": 7380 + }, + { + "epoch": 0.43383820594105904, + "grad_norm": 1.2291438579559326, + "learning_rate": 8.905942595259498e-05, + "loss": 0.0263, + "step": 7390 + }, + { + "epoch": 0.43442526711283314, + "grad_norm": 2.7212166786193848, + "learning_rate": 8.903053970919585e-05, + "loss": 0.0492, + "step": 7400 + }, + { + "epoch": 0.43501232828460723, + "grad_norm": 3.3400795459747314, + "learning_rate": 8.900162007980221e-05, + "loss": 0.0421, + "step": 7410 + }, + { + "epoch": 0.4355993894563814, + "grad_norm": 1.6024249792099, + "learning_rate": 8.897266708915139e-05, + "loss": 0.0447, + "step": 7420 + }, + { + "epoch": 0.4361864506281555, + "grad_norm": 1.3425102233886719, + "learning_rate": 8.894368076200923e-05, + "loss": 0.0282, + "step": 7430 + }, + { + "epoch": 0.43677351179992957, + "grad_norm": 1.875572919845581, + "learning_rate": 8.891466112317008e-05, + "loss": 0.0521, + "step": 7440 + }, + { + "epoch": 0.43736057297170366, + "grad_norm": 6.709931373596191, + "learning_rate": 8.888560819745682e-05, + "loss": 0.0498, + "step": 7450 + }, + { + "epoch": 0.43794763414347776, + "grad_norm": 2.244819402694702, + "learning_rate": 8.885652200972077e-05, + "loss": 0.0392, + "step": 7460 + }, + { + "epoch": 0.43853469531525185, + "grad_norm": 2.1222212314605713, + "learning_rate": 8.88274025848417e-05, + "loss": 0.0499, + "step": 7470 + }, + { + "epoch": 0.43912175648702595, + "grad_norm": 3.548957586288452, + "learning_rate": 8.879824994772785e-05, + "loss": 0.0544, + "step": 7480 + }, + { + "epoch": 0.43970881765880004, + "grad_norm": 1.4378647804260254, + "learning_rate": 8.876906412331582e-05, + "loss": 0.0385, + "step": 7490 + }, + { + "epoch": 0.44029587883057414, + "grad_norm": 2.783017158508301, + "learning_rate": 8.873984513657061e-05, + "loss": 0.0574, + "step": 7500 + }, + { + "epoch": 0.44088294000234823, + "grad_norm": 5.680621147155762, + "learning_rate": 8.871059301248563e-05, + "loss": 0.0567, + "step": 7510 + }, + { + "epoch": 0.44147000117412233, + "grad_norm": 2.7961585521698, + "learning_rate": 8.868130777608256e-05, + "loss": 0.0715, + "step": 7520 + }, + { + "epoch": 0.4420570623458964, + "grad_norm": 5.537487983703613, + "learning_rate": 8.865198945241147e-05, + "loss": 0.0381, + "step": 7530 + }, + { + "epoch": 0.4426441235176705, + 
"grad_norm": 1.4201655387878418, + "learning_rate": 8.86226380665507e-05, + "loss": 0.0256, + "step": 7540 + }, + { + "epoch": 0.4432311846894446, + "grad_norm": 1.8118871450424194, + "learning_rate": 8.859325364360687e-05, + "loss": 0.0401, + "step": 7550 + }, + { + "epoch": 0.44381824586121876, + "grad_norm": 1.7263232469558716, + "learning_rate": 8.856383620871489e-05, + "loss": 0.0249, + "step": 7560 + }, + { + "epoch": 0.44440530703299286, + "grad_norm": 1.8279962539672852, + "learning_rate": 8.853438578703786e-05, + "loss": 0.0715, + "step": 7570 + }, + { + "epoch": 0.44499236820476695, + "grad_norm": 1.603935956954956, + "learning_rate": 8.850490240376711e-05, + "loss": 0.0679, + "step": 7580 + }, + { + "epoch": 0.44557942937654105, + "grad_norm": 2.0516746044158936, + "learning_rate": 8.84753860841222e-05, + "loss": 0.0672, + "step": 7590 + }, + { + "epoch": 0.44616649054831514, + "grad_norm": 0.3904460668563843, + "learning_rate": 8.844583685335084e-05, + "loss": 0.0799, + "step": 7600 + }, + { + "epoch": 0.44675355172008924, + "grad_norm": 1.5879621505737305, + "learning_rate": 8.841625473672888e-05, + "loss": 0.053, + "step": 7610 + }, + { + "epoch": 0.44734061289186333, + "grad_norm": 1.8831861019134521, + "learning_rate": 8.838663975956031e-05, + "loss": 0.0579, + "step": 7620 + }, + { + "epoch": 0.4479276740636374, + "grad_norm": 2.4368581771850586, + "learning_rate": 8.835699194717724e-05, + "loss": 0.0558, + "step": 7630 + }, + { + "epoch": 0.4485147352354115, + "grad_norm": 1.7504124641418457, + "learning_rate": 8.832731132493982e-05, + "loss": 0.0529, + "step": 7640 + }, + { + "epoch": 0.4491017964071856, + "grad_norm": 2.665923595428467, + "learning_rate": 8.829759791823632e-05, + "loss": 0.0364, + "step": 7650 + }, + { + "epoch": 0.4496888575789597, + "grad_norm": 1.9041054248809814, + "learning_rate": 8.826785175248308e-05, + "loss": 0.0504, + "step": 7660 + }, + { + "epoch": 0.4502759187507338, + "grad_norm": 3.0372838973999023, + "learning_rate": 8.823807285312434e-05, + "loss": 0.0517, + "step": 7670 + }, + { + "epoch": 0.4508629799225079, + "grad_norm": 0.7600299119949341, + "learning_rate": 8.820826124563245e-05, + "loss": 0.052, + "step": 7680 + }, + { + "epoch": 0.45145004109428205, + "grad_norm": 2.599184274673462, + "learning_rate": 8.81784169555077e-05, + "loss": 0.0604, + "step": 7690 + }, + { + "epoch": 0.45203710226605615, + "grad_norm": 2.0229201316833496, + "learning_rate": 8.814854000827832e-05, + "loss": 0.0625, + "step": 7700 + }, + { + "epoch": 0.45262416343783024, + "grad_norm": 0.2576180100440979, + "learning_rate": 8.811863042950053e-05, + "loss": 0.0472, + "step": 7710 + }, + { + "epoch": 0.45321122460960434, + "grad_norm": 1.7950199842453003, + "learning_rate": 8.80886882447584e-05, + "loss": 0.0578, + "step": 7720 + }, + { + "epoch": 0.45379828578137843, + "grad_norm": 2.8848981857299805, + "learning_rate": 8.805871347966393e-05, + "loss": 0.0431, + "step": 7730 + }, + { + "epoch": 0.4543853469531525, + "grad_norm": 2.17931866645813, + "learning_rate": 8.802870615985694e-05, + "loss": 0.0449, + "step": 7740 + }, + { + "epoch": 0.4549724081249266, + "grad_norm": 1.8323543071746826, + "learning_rate": 8.799866631100516e-05, + "loss": 0.0474, + "step": 7750 + }, + { + "epoch": 0.4555594692967007, + "grad_norm": 3.3510382175445557, + "learning_rate": 8.79685939588041e-05, + "loss": 0.0513, + "step": 7760 + }, + { + "epoch": 0.4561465304684748, + "grad_norm": 1.8694850206375122, + "learning_rate": 8.79384891289771e-05, + "loss": 0.066, + "step": 
7770 + }, + { + "epoch": 0.4567335916402489, + "grad_norm": 3.482624053955078, + "learning_rate": 8.790835184727529e-05, + "loss": 0.0741, + "step": 7780 + }, + { + "epoch": 0.457320652812023, + "grad_norm": 3.115752935409546, + "learning_rate": 8.787818213947749e-05, + "loss": 0.036, + "step": 7790 + }, + { + "epoch": 0.4579077139837971, + "grad_norm": 1.0774741172790527, + "learning_rate": 8.784798003139034e-05, + "loss": 0.0691, + "step": 7800 + }, + { + "epoch": 0.4584947751555712, + "grad_norm": 0.8824712038040161, + "learning_rate": 8.781774554884814e-05, + "loss": 0.0548, + "step": 7810 + }, + { + "epoch": 0.4590818363273453, + "grad_norm": 0.9113776087760925, + "learning_rate": 8.778747871771292e-05, + "loss": 0.0275, + "step": 7820 + }, + { + "epoch": 0.45966889749911943, + "grad_norm": 1.0432971715927124, + "learning_rate": 8.775717956387434e-05, + "loss": 0.0255, + "step": 7830 + }, + { + "epoch": 0.46025595867089353, + "grad_norm": 3.4298136234283447, + "learning_rate": 8.772684811324975e-05, + "loss": 0.0499, + "step": 7840 + }, + { + "epoch": 0.4608430198426676, + "grad_norm": 5.1340837478637695, + "learning_rate": 8.76964843917841e-05, + "loss": 0.0538, + "step": 7850 + }, + { + "epoch": 0.4614300810144417, + "grad_norm": 1.2287262678146362, + "learning_rate": 8.766608842544993e-05, + "loss": 0.0414, + "step": 7860 + }, + { + "epoch": 0.4620171421862158, + "grad_norm": 1.3155686855316162, + "learning_rate": 8.763566024024741e-05, + "loss": 0.0185, + "step": 7870 + }, + { + "epoch": 0.4626042033579899, + "grad_norm": 1.028479814529419, + "learning_rate": 8.760519986220423e-05, + "loss": 0.0441, + "step": 7880 + }, + { + "epoch": 0.463191264529764, + "grad_norm": 1.9284944534301758, + "learning_rate": 8.757470731737562e-05, + "loss": 0.0283, + "step": 7890 + }, + { + "epoch": 0.4637783257015381, + "grad_norm": 0.6915614008903503, + "learning_rate": 8.754418263184437e-05, + "loss": 0.0613, + "step": 7900 + }, + { + "epoch": 0.4643653868733122, + "grad_norm": 1.5400358438491821, + "learning_rate": 8.751362583172068e-05, + "loss": 0.0352, + "step": 7910 + }, + { + "epoch": 0.4649524480450863, + "grad_norm": 1.5949668884277344, + "learning_rate": 8.748303694314227e-05, + "loss": 0.0478, + "step": 7920 + }, + { + "epoch": 0.4655395092168604, + "grad_norm": 2.550201416015625, + "learning_rate": 8.745241599227433e-05, + "loss": 0.0384, + "step": 7930 + }, + { + "epoch": 0.4661265703886345, + "grad_norm": 2.6728291511535645, + "learning_rate": 8.742176300530944e-05, + "loss": 0.0604, + "step": 7940 + }, + { + "epoch": 0.46671363156040857, + "grad_norm": 1.924499750137329, + "learning_rate": 8.739107800846757e-05, + "loss": 0.0594, + "step": 7950 + }, + { + "epoch": 0.4673006927321827, + "grad_norm": 0.8227196931838989, + "learning_rate": 8.736036102799614e-05, + "loss": 0.0378, + "step": 7960 + }, + { + "epoch": 0.4678877539039568, + "grad_norm": 2.2939882278442383, + "learning_rate": 8.732961209016983e-05, + "loss": 0.0438, + "step": 7970 + }, + { + "epoch": 0.4684748150757309, + "grad_norm": 1.3413143157958984, + "learning_rate": 8.729883122129075e-05, + "loss": 0.0418, + "step": 7980 + }, + { + "epoch": 0.469061876247505, + "grad_norm": 2.554335832595825, + "learning_rate": 8.726801844768825e-05, + "loss": 0.0261, + "step": 7990 + }, + { + "epoch": 0.4696489374192791, + "grad_norm": 4.016257286071777, + "learning_rate": 8.7237173795719e-05, + "loss": 0.037, + "step": 8000 + }, + { + "epoch": 0.4702359985910532, + "grad_norm": 1.072961688041687, + "learning_rate": 
8.720629729176697e-05, + "loss": 0.0335, + "step": 8010 + }, + { + "epoch": 0.4708230597628273, + "grad_norm": 2.696843385696411, + "learning_rate": 8.717538896224332e-05, + "loss": 0.0373, + "step": 8020 + }, + { + "epoch": 0.4714101209346014, + "grad_norm": 1.040705680847168, + "learning_rate": 8.714444883358646e-05, + "loss": 0.0493, + "step": 8030 + }, + { + "epoch": 0.4719971821063755, + "grad_norm": 1.6960396766662598, + "learning_rate": 8.711347693226201e-05, + "loss": 0.0435, + "step": 8040 + }, + { + "epoch": 0.4725842432781496, + "grad_norm": 1.921654224395752, + "learning_rate": 8.708247328476273e-05, + "loss": 0.0376, + "step": 8050 + }, + { + "epoch": 0.47317130444992367, + "grad_norm": 1.4431146383285522, + "learning_rate": 8.705143791760859e-05, + "loss": 0.0467, + "step": 8060 + }, + { + "epoch": 0.47375836562169776, + "grad_norm": 4.363770484924316, + "learning_rate": 8.702037085734664e-05, + "loss": 0.0633, + "step": 8070 + }, + { + "epoch": 0.47434542679347186, + "grad_norm": 0.7148435711860657, + "learning_rate": 8.698927213055107e-05, + "loss": 0.045, + "step": 8080 + }, + { + "epoch": 0.47493248796524595, + "grad_norm": 1.6291950941085815, + "learning_rate": 8.695814176382318e-05, + "loss": 0.0345, + "step": 8090 + }, + { + "epoch": 0.4755195491370201, + "grad_norm": 2.1824052333831787, + "learning_rate": 8.692697978379125e-05, + "loss": 0.05, + "step": 8100 + }, + { + "epoch": 0.4761066103087942, + "grad_norm": 0.47206181287765503, + "learning_rate": 8.68957862171107e-05, + "loss": 0.0892, + "step": 8110 + }, + { + "epoch": 0.4766936714805683, + "grad_norm": 2.351390838623047, + "learning_rate": 8.68645610904639e-05, + "loss": 0.0387, + "step": 8120 + }, + { + "epoch": 0.4772807326523424, + "grad_norm": 1.7469288110733032, + "learning_rate": 8.683330443056026e-05, + "loss": 0.0432, + "step": 8130 + }, + { + "epoch": 0.4778677938241165, + "grad_norm": 1.3828728199005127, + "learning_rate": 8.680201626413612e-05, + "loss": 0.0565, + "step": 8140 + }, + { + "epoch": 0.4784548549958906, + "grad_norm": 2.343574047088623, + "learning_rate": 8.677069661795479e-05, + "loss": 0.0431, + "step": 8150 + }, + { + "epoch": 0.47904191616766467, + "grad_norm": 0.9054991602897644, + "learning_rate": 8.673934551880654e-05, + "loss": 0.0439, + "step": 8160 + }, + { + "epoch": 0.47962897733943877, + "grad_norm": 1.9883683919906616, + "learning_rate": 8.67079629935085e-05, + "loss": 0.0344, + "step": 8170 + }, + { + "epoch": 0.48021603851121286, + "grad_norm": 4.34842586517334, + "learning_rate": 8.667654906890469e-05, + "loss": 0.0422, + "step": 8180 + }, + { + "epoch": 0.48080309968298696, + "grad_norm": 2.466914415359497, + "learning_rate": 8.664510377186599e-05, + "loss": 0.0428, + "step": 8190 + }, + { + "epoch": 0.48139016085476105, + "grad_norm": 3.4293391704559326, + "learning_rate": 8.661362712929013e-05, + "loss": 0.044, + "step": 8200 + }, + { + "epoch": 0.48197722202653515, + "grad_norm": 1.3960778713226318, + "learning_rate": 8.658211916810165e-05, + "loss": 0.0493, + "step": 8210 + }, + { + "epoch": 0.48256428319830924, + "grad_norm": 3.3009042739868164, + "learning_rate": 8.655057991525186e-05, + "loss": 0.0518, + "step": 8220 + }, + { + "epoch": 0.48315134437008334, + "grad_norm": 2.5035552978515625, + "learning_rate": 8.651900939771884e-05, + "loss": 0.0276, + "step": 8230 + }, + { + "epoch": 0.4837384055418575, + "grad_norm": 3.129765272140503, + "learning_rate": 8.648740764250745e-05, + "loss": 0.0557, + "step": 8240 + }, + { + "epoch": 0.4843254667136316, + 
"grad_norm": 3.4605374336242676, + "learning_rate": 8.645577467664919e-05, + "loss": 0.04, + "step": 8250 + }, + { + "epoch": 0.4849125278854057, + "grad_norm": 3.6204116344451904, + "learning_rate": 8.642411052720235e-05, + "loss": 0.0464, + "step": 8260 + }, + { + "epoch": 0.48549958905717977, + "grad_norm": 4.532052516937256, + "learning_rate": 8.639241522125185e-05, + "loss": 0.0689, + "step": 8270 + }, + { + "epoch": 0.48608665022895386, + "grad_norm": 1.2190687656402588, + "learning_rate": 8.636068878590924e-05, + "loss": 0.0387, + "step": 8280 + }, + { + "epoch": 0.48667371140072796, + "grad_norm": 2.1063272953033447, + "learning_rate": 8.632893124831273e-05, + "loss": 0.0677, + "step": 8290 + }, + { + "epoch": 0.48726077257250205, + "grad_norm": 1.011233925819397, + "learning_rate": 8.629714263562716e-05, + "loss": 0.0424, + "step": 8300 + }, + { + "epoch": 0.48784783374427615, + "grad_norm": 0.7251833081245422, + "learning_rate": 8.626532297504386e-05, + "loss": 0.0513, + "step": 8310 + }, + { + "epoch": 0.48843489491605024, + "grad_norm": 1.6562292575836182, + "learning_rate": 8.62334722937808e-05, + "loss": 0.0686, + "step": 8320 + }, + { + "epoch": 0.48902195608782434, + "grad_norm": 1.2332732677459717, + "learning_rate": 8.620159061908245e-05, + "loss": 0.0625, + "step": 8330 + }, + { + "epoch": 0.48960901725959843, + "grad_norm": 1.7628116607666016, + "learning_rate": 8.61696779782198e-05, + "loss": 0.02, + "step": 8340 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 2.3033394813537598, + "learning_rate": 8.613773439849034e-05, + "loss": 0.0586, + "step": 8350 + }, + { + "epoch": 0.4907831396031466, + "grad_norm": 0.787381112575531, + "learning_rate": 8.610575990721799e-05, + "loss": 0.0489, + "step": 8360 + }, + { + "epoch": 0.4913702007749208, + "grad_norm": 4.453810691833496, + "learning_rate": 8.607375453175316e-05, + "loss": 0.0537, + "step": 8370 + }, + { + "epoch": 0.49195726194669487, + "grad_norm": 1.4812476634979248, + "learning_rate": 8.604171829947263e-05, + "loss": 0.0489, + "step": 8380 + }, + { + "epoch": 0.49254432311846896, + "grad_norm": 1.9119701385498047, + "learning_rate": 8.600965123777957e-05, + "loss": 0.0457, + "step": 8390 + }, + { + "epoch": 0.49313138429024306, + "grad_norm": 1.4203485250473022, + "learning_rate": 8.59775533741036e-05, + "loss": 0.0472, + "step": 8400 + }, + { + "epoch": 0.49371844546201715, + "grad_norm": 1.6807453632354736, + "learning_rate": 8.594542473590062e-05, + "loss": 0.0393, + "step": 8410 + }, + { + "epoch": 0.49430550663379125, + "grad_norm": 1.0950032472610474, + "learning_rate": 8.591326535065283e-05, + "loss": 0.0455, + "step": 8420 + }, + { + "epoch": 0.49489256780556534, + "grad_norm": 2.4319326877593994, + "learning_rate": 8.58810752458688e-05, + "loss": 0.0638, + "step": 8430 + }, + { + "epoch": 0.49547962897733944, + "grad_norm": 1.7978637218475342, + "learning_rate": 8.584885444908333e-05, + "loss": 0.0314, + "step": 8440 + }, + { + "epoch": 0.49606669014911353, + "grad_norm": 2.297290802001953, + "learning_rate": 8.58166029878575e-05, + "loss": 0.0723, + "step": 8450 + }, + { + "epoch": 0.4966537513208876, + "grad_norm": 0.6186020374298096, + "learning_rate": 8.578432088977859e-05, + "loss": 0.061, + "step": 8460 + }, + { + "epoch": 0.4972408124926617, + "grad_norm": 0.46362197399139404, + "learning_rate": 8.575200818246012e-05, + "loss": 0.0401, + "step": 8470 + }, + { + "epoch": 0.4978278736644358, + "grad_norm": 2.507780075073242, + "learning_rate": 8.571966489354178e-05, + "loss": 0.0735, + "step": 
8480 + }, + { + "epoch": 0.4984149348362099, + "grad_norm": 1.0863014459609985, + "learning_rate": 8.568729105068939e-05, + "loss": 0.0345, + "step": 8490 + }, + { + "epoch": 0.499001996007984, + "grad_norm": 2.749016523361206, + "learning_rate": 8.565488668159496e-05, + "loss": 0.0429, + "step": 8500 + }, + { + "epoch": 0.49958905717975816, + "grad_norm": 3.3777594566345215, + "learning_rate": 8.562245181397655e-05, + "loss": 0.0377, + "step": 8510 + }, + { + "epoch": 0.5001761183515322, + "grad_norm": 2.7208309173583984, + "learning_rate": 8.558998647557837e-05, + "loss": 0.035, + "step": 8520 + }, + { + "epoch": 0.5007631795233063, + "grad_norm": 1.5163689851760864, + "learning_rate": 8.555749069417065e-05, + "loss": 0.0461, + "step": 8530 + }, + { + "epoch": 0.5013502406950804, + "grad_norm": 1.4222984313964844, + "learning_rate": 8.552496449754967e-05, + "loss": 0.0585, + "step": 8540 + }, + { + "epoch": 0.5019373018668545, + "grad_norm": 2.8832669258117676, + "learning_rate": 8.549240791353775e-05, + "loss": 0.0655, + "step": 8550 + }, + { + "epoch": 0.5025243630386286, + "grad_norm": 2.0506932735443115, + "learning_rate": 8.545982096998315e-05, + "loss": 0.0449, + "step": 8560 + }, + { + "epoch": 0.5031114242104027, + "grad_norm": 6.83305025100708, + "learning_rate": 8.542720369476016e-05, + "loss": 0.0413, + "step": 8570 + }, + { + "epoch": 0.5036984853821769, + "grad_norm": 1.7522202730178833, + "learning_rate": 8.539455611576898e-05, + "loss": 0.0411, + "step": 8580 + }, + { + "epoch": 0.504285546553951, + "grad_norm": 0.4490013122558594, + "learning_rate": 8.536187826093576e-05, + "loss": 0.0509, + "step": 8590 + }, + { + "epoch": 0.5048726077257251, + "grad_norm": 1.3018290996551514, + "learning_rate": 8.53291701582125e-05, + "loss": 0.0633, + "step": 8600 + }, + { + "epoch": 0.5054596688974992, + "grad_norm": 3.179797887802124, + "learning_rate": 8.529643183557708e-05, + "loss": 0.0498, + "step": 8610 + }, + { + "epoch": 0.5060467300692733, + "grad_norm": 4.824548721313477, + "learning_rate": 8.52636633210333e-05, + "loss": 0.0539, + "step": 8620 + }, + { + "epoch": 0.5066337912410473, + "grad_norm": 2.7731845378875732, + "learning_rate": 8.52308646426107e-05, + "loss": 0.0432, + "step": 8630 + }, + { + "epoch": 0.5072208524128214, + "grad_norm": 2.331210136413574, + "learning_rate": 8.519803582836467e-05, + "loss": 0.0688, + "step": 8640 + }, + { + "epoch": 0.5078079135845955, + "grad_norm": 2.3192410469055176, + "learning_rate": 8.516517690637638e-05, + "loss": 0.0453, + "step": 8650 + }, + { + "epoch": 0.5083949747563696, + "grad_norm": 3.357856273651123, + "learning_rate": 8.513228790475269e-05, + "loss": 0.0641, + "step": 8660 + }, + { + "epoch": 0.5089820359281437, + "grad_norm": 2.9671857357025146, + "learning_rate": 8.509936885162629e-05, + "loss": 0.0401, + "step": 8670 + }, + { + "epoch": 0.5095690970999178, + "grad_norm": 2.087934732437134, + "learning_rate": 8.50664197751555e-05, + "loss": 0.0291, + "step": 8680 + }, + { + "epoch": 0.5101561582716919, + "grad_norm": 0.8386021256446838, + "learning_rate": 8.503344070352434e-05, + "loss": 0.068, + "step": 8690 + }, + { + "epoch": 0.510743219443466, + "grad_norm": 2.6947085857391357, + "learning_rate": 8.50004316649425e-05, + "loss": 0.0363, + "step": 8700 + }, + { + "epoch": 0.5113302806152401, + "grad_norm": 2.5281970500946045, + "learning_rate": 8.496739268764529e-05, + "loss": 0.0473, + "step": 8710 + }, + { + "epoch": 0.5119173417870142, + "grad_norm": 1.7762385606765747, + "learning_rate": 8.493432379989365e-05, 
+ "loss": 0.0427, + "step": 8720 + }, + { + "epoch": 0.5125044029587883, + "grad_norm": 3.140371799468994, + "learning_rate": 8.490122502997406e-05, + "loss": 0.0347, + "step": 8730 + }, + { + "epoch": 0.5130914641305624, + "grad_norm": 1.5524842739105225, + "learning_rate": 8.486809640619859e-05, + "loss": 0.0272, + "step": 8740 + }, + { + "epoch": 0.5136785253023365, + "grad_norm": 1.11879563331604, + "learning_rate": 8.483493795690489e-05, + "loss": 0.0409, + "step": 8750 + }, + { + "epoch": 0.5142655864741106, + "grad_norm": 1.88312566280365, + "learning_rate": 8.480174971045603e-05, + "loss": 0.0711, + "step": 8760 + }, + { + "epoch": 0.5148526476458847, + "grad_norm": 2.326946496963501, + "learning_rate": 8.476853169524065e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.5154397088176588, + "grad_norm": 3.046353578567505, + "learning_rate": 8.473528393967278e-05, + "loss": 0.0588, + "step": 8780 + }, + { + "epoch": 0.5160267699894329, + "grad_norm": 0.6722098588943481, + "learning_rate": 8.470200647219198e-05, + "loss": 0.0395, + "step": 8790 + }, + { + "epoch": 0.516613831161207, + "grad_norm": 0.5529523491859436, + "learning_rate": 8.466869932126314e-05, + "loss": 0.0637, + "step": 8800 + }, + { + "epoch": 0.517200892332981, + "grad_norm": 2.5950729846954346, + "learning_rate": 8.463536251537656e-05, + "loss": 0.0444, + "step": 8810 + }, + { + "epoch": 0.5177879535047551, + "grad_norm": 4.526224136352539, + "learning_rate": 8.460199608304797e-05, + "loss": 0.0414, + "step": 8820 + }, + { + "epoch": 0.5183750146765292, + "grad_norm": 2.620432138442993, + "learning_rate": 8.456860005281835e-05, + "loss": 0.0439, + "step": 8830 + }, + { + "epoch": 0.5189620758483033, + "grad_norm": 2.836040496826172, + "learning_rate": 8.453517445325405e-05, + "loss": 0.0615, + "step": 8840 + }, + { + "epoch": 0.5195491370200775, + "grad_norm": 2.2003190517425537, + "learning_rate": 8.450171931294673e-05, + "loss": 0.0298, + "step": 8850 + }, + { + "epoch": 0.5201361981918516, + "grad_norm": 2.7201812267303467, + "learning_rate": 8.446823466051326e-05, + "loss": 0.0331, + "step": 8860 + }, + { + "epoch": 0.5207232593636257, + "grad_norm": 2.7622556686401367, + "learning_rate": 8.44347205245958e-05, + "loss": 0.0555, + "step": 8870 + }, + { + "epoch": 0.5213103205353998, + "grad_norm": 2.5372259616851807, + "learning_rate": 8.440117693386171e-05, + "loss": 0.0417, + "step": 8880 + }, + { + "epoch": 0.5218973817071739, + "grad_norm": 4.218869209289551, + "learning_rate": 8.436760391700355e-05, + "loss": 0.0486, + "step": 8890 + }, + { + "epoch": 0.522484442878948, + "grad_norm": 0.7023425102233887, + "learning_rate": 8.433400150273906e-05, + "loss": 0.0334, + "step": 8900 + }, + { + "epoch": 0.5230715040507221, + "grad_norm": 1.4348467588424683, + "learning_rate": 8.430036971981112e-05, + "loss": 0.0404, + "step": 8910 + }, + { + "epoch": 0.5236585652224962, + "grad_norm": 2.7175183296203613, + "learning_rate": 8.426670859698771e-05, + "loss": 0.0634, + "step": 8920 + }, + { + "epoch": 0.5242456263942703, + "grad_norm": 2.485643148422241, + "learning_rate": 8.423301816306193e-05, + "loss": 0.0328, + "step": 8930 + }, + { + "epoch": 0.5248326875660444, + "grad_norm": 1.6880441904067993, + "learning_rate": 8.419929844685197e-05, + "loss": 0.056, + "step": 8940 + }, + { + "epoch": 0.5254197487378185, + "grad_norm": 1.1850073337554932, + "learning_rate": 8.416554947720104e-05, + "loss": 0.0263, + "step": 8950 + }, + { + "epoch": 0.5260068099095926, + "grad_norm": 2.0261285305023193, + 
"learning_rate": 8.413177128297734e-05, + "loss": 0.0743, + "step": 8960 + }, + { + "epoch": 0.5265938710813667, + "grad_norm": 1.2257660627365112, + "learning_rate": 8.409796389307417e-05, + "loss": 0.0352, + "step": 8970 + }, + { + "epoch": 0.5271809322531408, + "grad_norm": 3.7465314865112305, + "learning_rate": 8.406412733640967e-05, + "loss": 0.0339, + "step": 8980 + }, + { + "epoch": 0.5277679934249149, + "grad_norm": 1.9524476528167725, + "learning_rate": 8.403026164192704e-05, + "loss": 0.0683, + "step": 8990 + }, + { + "epoch": 0.528355054596689, + "grad_norm": 3.1063032150268555, + "learning_rate": 8.399636683859437e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.528355054596689, + "eval_loss": 0.44510969519615173, + "eval_runtime": 269.6504, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 3.505, + "step": 9000 + }, + { + "epoch": 0.5289421157684631, + "grad_norm": 1.7905889749526978, + "learning_rate": 8.396244295540462e-05, + "loss": 0.0762, + "step": 9010 + }, + { + "epoch": 0.5295291769402372, + "grad_norm": 0.9049012660980225, + "learning_rate": 8.392849002137566e-05, + "loss": 0.0471, + "step": 9020 + }, + { + "epoch": 0.5301162381120113, + "grad_norm": 1.638697624206543, + "learning_rate": 8.389450806555017e-05, + "loss": 0.0526, + "step": 9030 + }, + { + "epoch": 0.5307032992837853, + "grad_norm": 1.720030426979065, + "learning_rate": 8.386049711699571e-05, + "loss": 0.0532, + "step": 9040 + }, + { + "epoch": 0.5312903604555594, + "grad_norm": 2.473928689956665, + "learning_rate": 8.38264572048046e-05, + "loss": 0.0332, + "step": 9050 + }, + { + "epoch": 0.5318774216273335, + "grad_norm": 2.829211950302124, + "learning_rate": 8.379238835809393e-05, + "loss": 0.0586, + "step": 9060 + }, + { + "epoch": 0.5324644827991076, + "grad_norm": 1.035154104232788, + "learning_rate": 8.37582906060056e-05, + "loss": 0.065, + "step": 9070 + }, + { + "epoch": 0.5330515439708817, + "grad_norm": 1.434273362159729, + "learning_rate": 8.372416397770613e-05, + "loss": 0.0353, + "step": 9080 + }, + { + "epoch": 0.5336386051426558, + "grad_norm": 0.6541376113891602, + "learning_rate": 8.369000850238683e-05, + "loss": 0.047, + "step": 9090 + }, + { + "epoch": 0.5342256663144299, + "grad_norm": 2.1451282501220703, + "learning_rate": 8.365582420926366e-05, + "loss": 0.0411, + "step": 9100 + }, + { + "epoch": 0.534812727486204, + "grad_norm": 3.246711492538452, + "learning_rate": 8.362161112757723e-05, + "loss": 0.0321, + "step": 9110 + }, + { + "epoch": 0.5353997886579782, + "grad_norm": 3.385925054550171, + "learning_rate": 8.358736928659274e-05, + "loss": 0.0315, + "step": 9120 + }, + { + "epoch": 0.5359868498297523, + "grad_norm": 2.667985200881958, + "learning_rate": 8.355309871560006e-05, + "loss": 0.0525, + "step": 9130 + }, + { + "epoch": 0.5365739110015264, + "grad_norm": 0.6501687169075012, + "learning_rate": 8.351879944391357e-05, + "loss": 0.0386, + "step": 9140 + }, + { + "epoch": 0.5371609721733005, + "grad_norm": 1.7853676080703735, + "learning_rate": 8.348447150087223e-05, + "loss": 0.0481, + "step": 9150 + }, + { + "epoch": 0.5377480333450746, + "grad_norm": 0.8585736751556396, + "learning_rate": 8.345011491583954e-05, + "loss": 0.0327, + "step": 9160 + }, + { + "epoch": 0.5383350945168487, + "grad_norm": 1.4739607572555542, + "learning_rate": 8.341572971820344e-05, + "loss": 0.0356, + "step": 9170 + }, + { + "epoch": 0.5389221556886228, + "grad_norm": 0.4403301775455475, + "learning_rate": 8.338131593737643e-05, + "loss": 0.0458, + "step": 9180 + }, + { 
+ "epoch": 0.5395092168603969, + "grad_norm": 2.5945067405700684, + "learning_rate": 8.33468736027954e-05, + "loss": 0.0429, + "step": 9190 + }, + { + "epoch": 0.540096278032171, + "grad_norm": 2.551771640777588, + "learning_rate": 8.331240274392167e-05, + "loss": 0.0286, + "step": 9200 + }, + { + "epoch": 0.5406833392039451, + "grad_norm": 0.41668054461479187, + "learning_rate": 8.327790339024097e-05, + "loss": 0.0459, + "step": 9210 + }, + { + "epoch": 0.5412704003757192, + "grad_norm": 3.3726375102996826, + "learning_rate": 8.324337557126342e-05, + "loss": 0.0612, + "step": 9220 + }, + { + "epoch": 0.5418574615474933, + "grad_norm": 2.2610208988189697, + "learning_rate": 8.320881931652347e-05, + "loss": 0.0482, + "step": 9230 + }, + { + "epoch": 0.5424445227192674, + "grad_norm": 3.177828311920166, + "learning_rate": 8.317423465557987e-05, + "loss": 0.051, + "step": 9240 + }, + { + "epoch": 0.5430315838910414, + "grad_norm": 0.9072557091712952, + "learning_rate": 8.313962161801569e-05, + "loss": 0.0479, + "step": 9250 + }, + { + "epoch": 0.5436186450628155, + "grad_norm": 0.45060959458351135, + "learning_rate": 8.310498023343832e-05, + "loss": 0.0389, + "step": 9260 + }, + { + "epoch": 0.5442057062345896, + "grad_norm": 3.351407527923584, + "learning_rate": 8.307031053147932e-05, + "loss": 0.0598, + "step": 9270 + }, + { + "epoch": 0.5447927674063637, + "grad_norm": 2.226046562194824, + "learning_rate": 8.30356125417945e-05, + "loss": 0.0416, + "step": 9280 + }, + { + "epoch": 0.5453798285781378, + "grad_norm": 2.5164947509765625, + "learning_rate": 8.300088629406391e-05, + "loss": 0.0609, + "step": 9290 + }, + { + "epoch": 0.5459668897499119, + "grad_norm": 2.2050812244415283, + "learning_rate": 8.296613181799168e-05, + "loss": 0.0544, + "step": 9300 + }, + { + "epoch": 0.546553950921686, + "grad_norm": 1.6396516561508179, + "learning_rate": 8.293134914330618e-05, + "loss": 0.0577, + "step": 9310 + }, + { + "epoch": 0.5471410120934601, + "grad_norm": 0.5308670997619629, + "learning_rate": 8.289653829975983e-05, + "loss": 0.0249, + "step": 9320 + }, + { + "epoch": 0.5477280732652342, + "grad_norm": 2.1728813648223877, + "learning_rate": 8.286169931712921e-05, + "loss": 0.0487, + "step": 9330 + }, + { + "epoch": 0.5483151344370083, + "grad_norm": 1.4126052856445312, + "learning_rate": 8.28268322252149e-05, + "loss": 0.0242, + "step": 9340 + }, + { + "epoch": 0.5489021956087824, + "grad_norm": 1.1930028200149536, + "learning_rate": 8.279193705384159e-05, + "loss": 0.0456, + "step": 9350 + }, + { + "epoch": 0.5494892567805565, + "grad_norm": 2.592273235321045, + "learning_rate": 8.275701383285795e-05, + "loss": 0.0603, + "step": 9360 + }, + { + "epoch": 0.5500763179523306, + "grad_norm": 4.0102128982543945, + "learning_rate": 8.272206259213662e-05, + "loss": 0.0591, + "step": 9370 + }, + { + "epoch": 0.5506633791241047, + "grad_norm": 3.490880012512207, + "learning_rate": 8.268708336157428e-05, + "loss": 0.0384, + "step": 9380 + }, + { + "epoch": 0.5512504402958789, + "grad_norm": 3.501849889755249, + "learning_rate": 8.265207617109148e-05, + "loss": 0.0397, + "step": 9390 + }, + { + "epoch": 0.551837501467653, + "grad_norm": 1.536889910697937, + "learning_rate": 8.261704105063275e-05, + "loss": 0.0364, + "step": 9400 + }, + { + "epoch": 0.5524245626394271, + "grad_norm": 1.7452349662780762, + "learning_rate": 8.258197803016646e-05, + "loss": 0.0466, + "step": 9410 + }, + { + "epoch": 0.5530116238112012, + "grad_norm": 4.080146789550781, + "learning_rate": 8.254688713968484e-05, + "loss": 
0.0566, + "step": 9420 + }, + { + "epoch": 0.5535986849829753, + "grad_norm": 1.8155280351638794, + "learning_rate": 8.2511768409204e-05, + "loss": 0.0258, + "step": 9430 + }, + { + "epoch": 0.5541857461547494, + "grad_norm": 0.7214668989181519, + "learning_rate": 8.247662186876386e-05, + "loss": 0.0609, + "step": 9440 + }, + { + "epoch": 0.5547728073265235, + "grad_norm": 2.605099678039551, + "learning_rate": 8.244144754842809e-05, + "loss": 0.0304, + "step": 9450 + }, + { + "epoch": 0.5553598684982975, + "grad_norm": 2.348184823989868, + "learning_rate": 8.240624547828417e-05, + "loss": 0.0601, + "step": 9460 + }, + { + "epoch": 0.5559469296700716, + "grad_norm": 1.3189982175827026, + "learning_rate": 8.237101568844328e-05, + "loss": 0.0498, + "step": 9470 + }, + { + "epoch": 0.5565339908418457, + "grad_norm": 2.5813090801239014, + "learning_rate": 8.233575820904032e-05, + "loss": 0.0587, + "step": 9480 + }, + { + "epoch": 0.5571210520136198, + "grad_norm": 3.1107876300811768, + "learning_rate": 8.23004730702339e-05, + "loss": 0.0665, + "step": 9490 + }, + { + "epoch": 0.5577081131853939, + "grad_norm": 0.7311292290687561, + "learning_rate": 8.226516030220623e-05, + "loss": 0.0451, + "step": 9500 + }, + { + "epoch": 0.558295174357168, + "grad_norm": 1.5079210996627808, + "learning_rate": 8.222981993516324e-05, + "loss": 0.0621, + "step": 9510 + }, + { + "epoch": 0.5588822355289421, + "grad_norm": 2.4844210147857666, + "learning_rate": 8.219445199933437e-05, + "loss": 0.0425, + "step": 9520 + }, + { + "epoch": 0.5594692967007162, + "grad_norm": 1.9239799976348877, + "learning_rate": 8.215905652497273e-05, + "loss": 0.0581, + "step": 9530 + }, + { + "epoch": 0.5600563578724903, + "grad_norm": 1.2899315357208252, + "learning_rate": 8.212363354235494e-05, + "loss": 0.0308, + "step": 9540 + }, + { + "epoch": 0.5606434190442644, + "grad_norm": 1.7348955869674683, + "learning_rate": 8.208818308178114e-05, + "loss": 0.0491, + "step": 9550 + }, + { + "epoch": 0.5612304802160385, + "grad_norm": 1.6830767393112183, + "learning_rate": 8.205270517357502e-05, + "loss": 0.0495, + "step": 9560 + }, + { + "epoch": 0.5618175413878126, + "grad_norm": 2.3700168132781982, + "learning_rate": 8.201719984808369e-05, + "loss": 0.0423, + "step": 9570 + }, + { + "epoch": 0.5624046025595867, + "grad_norm": 3.7524573802948, + "learning_rate": 8.198166713567777e-05, + "loss": 0.0618, + "step": 9580 + }, + { + "epoch": 0.5629916637313608, + "grad_norm": 0.6316405534744263, + "learning_rate": 8.194610706675125e-05, + "loss": 0.0315, + "step": 9590 + }, + { + "epoch": 0.5635787249031349, + "grad_norm": 1.4591659307479858, + "learning_rate": 8.191051967172157e-05, + "loss": 0.0439, + "step": 9600 + }, + { + "epoch": 0.564165786074909, + "grad_norm": 0.6856974363327026, + "learning_rate": 8.18749049810295e-05, + "loss": 0.0483, + "step": 9610 + }, + { + "epoch": 0.5647528472466831, + "grad_norm": 3.4433891773223877, + "learning_rate": 8.183926302513923e-05, + "loss": 0.0445, + "step": 9620 + }, + { + "epoch": 0.5653399084184572, + "grad_norm": 1.6874566078186035, + "learning_rate": 8.180359383453815e-05, + "loss": 0.0468, + "step": 9630 + }, + { + "epoch": 0.5659269695902313, + "grad_norm": 2.5820140838623047, + "learning_rate": 8.176789743973707e-05, + "loss": 0.0367, + "step": 9640 + }, + { + "epoch": 0.5665140307620053, + "grad_norm": 3.7962241172790527, + "learning_rate": 8.173217387127004e-05, + "loss": 0.0392, + "step": 9650 + }, + { + "epoch": 0.5671010919337796, + "grad_norm": 2.679847240447998, + "learning_rate": 
8.169642315969427e-05, + "loss": 0.0253, + "step": 9660 + }, + { + "epoch": 0.5676881531055537, + "grad_norm": 2.028122901916504, + "learning_rate": 8.166064533559028e-05, + "loss": 0.0734, + "step": 9670 + }, + { + "epoch": 0.5682752142773277, + "grad_norm": 2.9593400955200195, + "learning_rate": 8.162484042956178e-05, + "loss": 0.0219, + "step": 9680 + }, + { + "epoch": 0.5688622754491018, + "grad_norm": 1.2236310243606567, + "learning_rate": 8.158900847223556e-05, + "loss": 0.0289, + "step": 9690 + }, + { + "epoch": 0.5694493366208759, + "grad_norm": 1.4059587717056274, + "learning_rate": 8.155314949426167e-05, + "loss": 0.0306, + "step": 9700 + }, + { + "epoch": 0.57003639779265, + "grad_norm": 4.031301975250244, + "learning_rate": 8.151726352631316e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5706234589644241, + "grad_norm": 3.6776328086853027, + "learning_rate": 8.148135059908624e-05, + "loss": 0.0351, + "step": 9720 + }, + { + "epoch": 0.5712105201361982, + "grad_norm": 0.8490133285522461, + "learning_rate": 8.144541074330015e-05, + "loss": 0.0398, + "step": 9730 + }, + { + "epoch": 0.5717975813079723, + "grad_norm": 1.5910861492156982, + "learning_rate": 8.140944398969717e-05, + "loss": 0.0396, + "step": 9740 + }, + { + "epoch": 0.5723846424797464, + "grad_norm": 0.7195848822593689, + "learning_rate": 8.13734503690426e-05, + "loss": 0.0429, + "step": 9750 + }, + { + "epoch": 0.5729717036515205, + "grad_norm": 1.655400037765503, + "learning_rate": 8.13374299121247e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.5735587648232946, + "grad_norm": 0.9556081891059875, + "learning_rate": 8.130138264975471e-05, + "loss": 0.0505, + "step": 9770 + }, + { + "epoch": 0.5741458259950687, + "grad_norm": 1.830175757408142, + "learning_rate": 8.126530861276677e-05, + "loss": 0.0361, + "step": 9780 + }, + { + "epoch": 0.5747328871668428, + "grad_norm": 3.475700855255127, + "learning_rate": 8.122920783201793e-05, + "loss": 0.0206, + "step": 9790 + }, + { + "epoch": 0.5753199483386169, + "grad_norm": 1.2503710985183716, + "learning_rate": 8.119308033838814e-05, + "loss": 0.0685, + "step": 9800 + }, + { + "epoch": 0.575907009510391, + "grad_norm": 5.604869842529297, + "learning_rate": 8.115692616278018e-05, + "loss": 0.0688, + "step": 9810 + }, + { + "epoch": 0.5764940706821651, + "grad_norm": 2.3936383724212646, + "learning_rate": 8.112074533611967e-05, + "loss": 0.0639, + "step": 9820 + }, + { + "epoch": 0.5770811318539392, + "grad_norm": 3.6807427406311035, + "learning_rate": 8.108453788935498e-05, + "loss": 0.0428, + "step": 9830 + }, + { + "epoch": 0.5776681930257133, + "grad_norm": 1.175811767578125, + "learning_rate": 8.10483038534573e-05, + "loss": 0.0579, + "step": 9840 + }, + { + "epoch": 0.5782552541974874, + "grad_norm": 3.283430337905884, + "learning_rate": 8.101204325942056e-05, + "loss": 0.0532, + "step": 9850 + }, + { + "epoch": 0.5788423153692615, + "grad_norm": 4.026200771331787, + "learning_rate": 8.097575613826136e-05, + "loss": 0.0328, + "step": 9860 + }, + { + "epoch": 0.5794293765410355, + "grad_norm": 2.251068353652954, + "learning_rate": 8.093944252101907e-05, + "loss": 0.0543, + "step": 9870 + }, + { + "epoch": 0.5800164377128096, + "grad_norm": 3.2121071815490723, + "learning_rate": 8.090310243875565e-05, + "loss": 0.0619, + "step": 9880 + }, + { + "epoch": 0.5806034988845837, + "grad_norm": 1.9805493354797363, + "learning_rate": 8.086673592255573e-05, + "loss": 0.0463, + "step": 9890 + }, + { + "epoch": 0.5811905600563578, + "grad_norm": 
2.9728682041168213, + "learning_rate": 8.083034300352657e-05, + "loss": 0.0575, + "step": 9900 + }, + { + "epoch": 0.5817776212281319, + "grad_norm": 1.6561614274978638, + "learning_rate": 8.079392371279797e-05, + "loss": 0.0478, + "step": 9910 + }, + { + "epoch": 0.582364682399906, + "grad_norm": 2.3436312675476074, + "learning_rate": 8.075747808152231e-05, + "loss": 0.0742, + "step": 9920 + }, + { + "epoch": 0.5829517435716801, + "grad_norm": 2.0365006923675537, + "learning_rate": 8.072100614087453e-05, + "loss": 0.0355, + "step": 9930 + }, + { + "epoch": 0.5835388047434543, + "grad_norm": 2.1946043968200684, + "learning_rate": 8.068450792205202e-05, + "loss": 0.022, + "step": 9940 + }, + { + "epoch": 0.5841258659152284, + "grad_norm": 3.2829816341400146, + "learning_rate": 8.064798345627468e-05, + "loss": 0.045, + "step": 9950 + }, + { + "epoch": 0.5847129270870025, + "grad_norm": 1.0017696619033813, + "learning_rate": 8.061143277478486e-05, + "loss": 0.0498, + "step": 9960 + }, + { + "epoch": 0.5852999882587766, + "grad_norm": 1.9448410272598267, + "learning_rate": 8.057485590884733e-05, + "loss": 0.0428, + "step": 9970 + }, + { + "epoch": 0.5858870494305507, + "grad_norm": 1.5764429569244385, + "learning_rate": 8.053825288974924e-05, + "loss": 0.058, + "step": 9980 + }, + { + "epoch": 0.5864741106023248, + "grad_norm": 3.9513072967529297, + "learning_rate": 8.050162374880015e-05, + "loss": 0.0606, + "step": 9990 + }, + { + "epoch": 0.5870611717740989, + "grad_norm": 2.3251681327819824, + "learning_rate": 8.046496851733193e-05, + "loss": 0.0294, + "step": 10000 + }, + { + "epoch": 0.587648232945873, + "grad_norm": 1.103040099143982, + "learning_rate": 8.042828722669882e-05, + "loss": 0.0439, + "step": 10010 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 5.0034685134887695, + "learning_rate": 8.039157990827727e-05, + "loss": 0.0339, + "step": 10020 + }, + { + "epoch": 0.5888223552894212, + "grad_norm": 1.0279072523117065, + "learning_rate": 8.0354846593466e-05, + "loss": 0.0549, + "step": 10030 + }, + { + "epoch": 0.5894094164611953, + "grad_norm": 3.0541296005249023, + "learning_rate": 8.031808731368608e-05, + "loss": 0.0656, + "step": 10040 + }, + { + "epoch": 0.5899964776329694, + "grad_norm": 1.1822483539581299, + "learning_rate": 8.028130210038067e-05, + "loss": 0.0358, + "step": 10050 + }, + { + "epoch": 0.5905835388047435, + "grad_norm": 1.4830501079559326, + "learning_rate": 8.024449098501514e-05, + "loss": 0.0472, + "step": 10060 + }, + { + "epoch": 0.5911705999765176, + "grad_norm": 0.9620664715766907, + "learning_rate": 8.020765399907707e-05, + "loss": 0.0366, + "step": 10070 + }, + { + "epoch": 0.5917576611482916, + "grad_norm": 2.6279451847076416, + "learning_rate": 8.017079117407611e-05, + "loss": 0.031, + "step": 10080 + }, + { + "epoch": 0.5923447223200657, + "grad_norm": 1.0248329639434814, + "learning_rate": 8.013390254154402e-05, + "loss": 0.0302, + "step": 10090 + }, + { + "epoch": 0.5929317834918398, + "grad_norm": 4.71444034576416, + "learning_rate": 8.009698813303465e-05, + "loss": 0.0609, + "step": 10100 + }, + { + "epoch": 0.5935188446636139, + "grad_norm": 1.409204363822937, + "learning_rate": 8.006004798012393e-05, + "loss": 0.0267, + "step": 10110 + }, + { + "epoch": 0.594105905835388, + "grad_norm": 5.488358974456787, + "learning_rate": 8.002308211440974e-05, + "loss": 0.0621, + "step": 10120 + }, + { + "epoch": 0.5946929670071621, + "grad_norm": 1.6564887762069702, + "learning_rate": 7.998609056751199e-05, + "loss": 0.0386, + "step": 10130 + }, + { 
+ "epoch": 0.5952800281789362, + "grad_norm": 2.0165855884552, + "learning_rate": 7.994907337107258e-05, + "loss": 0.0417, + "step": 10140 + }, + { + "epoch": 0.5958670893507103, + "grad_norm": 1.646742343902588, + "learning_rate": 7.991203055675532e-05, + "loss": 0.0542, + "step": 10150 + }, + { + "epoch": 0.5964541505224844, + "grad_norm": 2.7396490573883057, + "learning_rate": 7.987496215624593e-05, + "loss": 0.0617, + "step": 10160 + }, + { + "epoch": 0.5970412116942585, + "grad_norm": 4.674464702606201, + "learning_rate": 7.983786820125204e-05, + "loss": 0.0545, + "step": 10170 + }, + { + "epoch": 0.5976282728660326, + "grad_norm": 1.177596092224121, + "learning_rate": 7.980074872350312e-05, + "loss": 0.0369, + "step": 10180 + }, + { + "epoch": 0.5982153340378067, + "grad_norm": 1.4684791564941406, + "learning_rate": 7.976360375475047e-05, + "loss": 0.04, + "step": 10190 + }, + { + "epoch": 0.5988023952095808, + "grad_norm": 4.104825496673584, + "learning_rate": 7.972643332676723e-05, + "loss": 0.0356, + "step": 10200 + }, + { + "epoch": 0.599389456381355, + "grad_norm": 1.0452766418457031, + "learning_rate": 7.968923747134825e-05, + "loss": 0.037, + "step": 10210 + }, + { + "epoch": 0.5999765175531291, + "grad_norm": 1.7485004663467407, + "learning_rate": 7.965201622031021e-05, + "loss": 0.033, + "step": 10220 + }, + { + "epoch": 0.6005635787249032, + "grad_norm": 1.3193644285202026, + "learning_rate": 7.961476960549145e-05, + "loss": 0.0455, + "step": 10230 + }, + { + "epoch": 0.6011506398966773, + "grad_norm": 1.8149995803833008, + "learning_rate": 7.957749765875204e-05, + "loss": 0.0353, + "step": 10240 + }, + { + "epoch": 0.6017377010684514, + "grad_norm": 3.0644371509552, + "learning_rate": 7.954020041197369e-05, + "loss": 0.0354, + "step": 10250 + }, + { + "epoch": 0.6023247622402255, + "grad_norm": 2.376058340072632, + "learning_rate": 7.950287789705977e-05, + "loss": 0.0554, + "step": 10260 + }, + { + "epoch": 0.6029118234119996, + "grad_norm": 2.0536141395568848, + "learning_rate": 7.94655301459353e-05, + "loss": 0.05, + "step": 10270 + }, + { + "epoch": 0.6034988845837737, + "grad_norm": 1.821018934249878, + "learning_rate": 7.942815719054679e-05, + "loss": 0.0471, + "step": 10280 + }, + { + "epoch": 0.6040859457555477, + "grad_norm": 1.5516284704208374, + "learning_rate": 7.939075906286241e-05, + "loss": 0.0455, + "step": 10290 + }, + { + "epoch": 0.6046730069273218, + "grad_norm": 4.660377025604248, + "learning_rate": 7.935333579487179e-05, + "loss": 0.0454, + "step": 10300 + }, + { + "epoch": 0.6052600680990959, + "grad_norm": 1.256773591041565, + "learning_rate": 7.931588741858612e-05, + "loss": 0.0406, + "step": 10310 + }, + { + "epoch": 0.60584712927087, + "grad_norm": 4.528842926025391, + "learning_rate": 7.927841396603804e-05, + "loss": 0.0528, + "step": 10320 + }, + { + "epoch": 0.6064341904426441, + "grad_norm": 1.1366862058639526, + "learning_rate": 7.924091546928163e-05, + "loss": 0.038, + "step": 10330 + }, + { + "epoch": 0.6070212516144182, + "grad_norm": 1.158761978149414, + "learning_rate": 7.920339196039239e-05, + "loss": 0.0514, + "step": 10340 + }, + { + "epoch": 0.6076083127861923, + "grad_norm": 2.8276240825653076, + "learning_rate": 7.916584347146728e-05, + "loss": 0.0446, + "step": 10350 + }, + { + "epoch": 0.6081953739579664, + "grad_norm": 1.485958218574524, + "learning_rate": 7.912827003462451e-05, + "loss": 0.048, + "step": 10360 + }, + { + "epoch": 0.6087824351297405, + "grad_norm": 2.347153425216675, + "learning_rate": 7.909067168200375e-05, + 
"loss": 0.0504, + "step": 10370 + }, + { + "epoch": 0.6093694963015146, + "grad_norm": 2.4298806190490723, + "learning_rate": 7.905304844576589e-05, + "loss": 0.0303, + "step": 10380 + }, + { + "epoch": 0.6099565574732887, + "grad_norm": 1.4049865007400513, + "learning_rate": 7.901540035809316e-05, + "loss": 0.0595, + "step": 10390 + }, + { + "epoch": 0.6105436186450628, + "grad_norm": 1.0412266254425049, + "learning_rate": 7.897772745118903e-05, + "loss": 0.0341, + "step": 10400 + }, + { + "epoch": 0.6111306798168369, + "grad_norm": 4.007213592529297, + "learning_rate": 7.89400297572782e-05, + "loss": 0.0698, + "step": 10410 + }, + { + "epoch": 0.611717740988611, + "grad_norm": 2.98750376701355, + "learning_rate": 7.890230730860657e-05, + "loss": 0.0546, + "step": 10420 + }, + { + "epoch": 0.6123048021603851, + "grad_norm": 4.189070224761963, + "learning_rate": 7.886456013744124e-05, + "loss": 0.0641, + "step": 10430 + }, + { + "epoch": 0.6128918633321592, + "grad_norm": 3.3929412364959717, + "learning_rate": 7.88267882760704e-05, + "loss": 0.0735, + "step": 10440 + }, + { + "epoch": 0.6134789245039333, + "grad_norm": 1.317287564277649, + "learning_rate": 7.878899175680341e-05, + "loss": 0.0211, + "step": 10450 + }, + { + "epoch": 0.6140659856757074, + "grad_norm": 0.4969087839126587, + "learning_rate": 7.875117061197071e-05, + "loss": 0.0454, + "step": 10460 + }, + { + "epoch": 0.6146530468474815, + "grad_norm": 4.775953769683838, + "learning_rate": 7.87133248739238e-05, + "loss": 0.0457, + "step": 10470 + }, + { + "epoch": 0.6152401080192557, + "grad_norm": 2.372785806655884, + "learning_rate": 7.867545457503521e-05, + "loss": 0.0398, + "step": 10480 + }, + { + "epoch": 0.6158271691910298, + "grad_norm": 2.851694107055664, + "learning_rate": 7.863755974769851e-05, + "loss": 0.0601, + "step": 10490 + }, + { + "epoch": 0.6164142303628038, + "grad_norm": 1.553295612335205, + "learning_rate": 7.859964042432819e-05, + "loss": 0.0742, + "step": 10500 + }, + { + "epoch": 0.617001291534578, + "grad_norm": 2.219014883041382, + "learning_rate": 7.856169663735975e-05, + "loss": 0.0496, + "step": 10510 + }, + { + "epoch": 0.617588352706352, + "grad_norm": 3.396904230117798, + "learning_rate": 7.852372841924961e-05, + "loss": 0.0375, + "step": 10520 + }, + { + "epoch": 0.6181754138781261, + "grad_norm": 1.845604658126831, + "learning_rate": 7.848573580247505e-05, + "loss": 0.0423, + "step": 10530 + }, + { + "epoch": 0.6187624750499002, + "grad_norm": 2.3468329906463623, + "learning_rate": 7.844771881953425e-05, + "loss": 0.0428, + "step": 10540 + }, + { + "epoch": 0.6193495362216743, + "grad_norm": 3.744361162185669, + "learning_rate": 7.840967750294626e-05, + "loss": 0.0537, + "step": 10550 + }, + { + "epoch": 0.6199365973934484, + "grad_norm": 2.348733425140381, + "learning_rate": 7.837161188525087e-05, + "loss": 0.0297, + "step": 10560 + }, + { + "epoch": 0.6205236585652225, + "grad_norm": 1.5336097478866577, + "learning_rate": 7.83335219990087e-05, + "loss": 0.0325, + "step": 10570 + }, + { + "epoch": 0.6211107197369966, + "grad_norm": 2.7004475593566895, + "learning_rate": 7.829540787680114e-05, + "loss": 0.0458, + "step": 10580 + }, + { + "epoch": 0.6216977809087707, + "grad_norm": 0.5332560539245605, + "learning_rate": 7.82572695512303e-05, + "loss": 0.0283, + "step": 10590 + }, + { + "epoch": 0.6222848420805448, + "grad_norm": 3.654900550842285, + "learning_rate": 7.8219107054919e-05, + "loss": 0.0369, + "step": 10600 + }, + { + "epoch": 0.6228719032523189, + "grad_norm": 2.0149872303009033, 
+ "learning_rate": 7.818092042051071e-05, + "loss": 0.0563, + "step": 10610 + }, + { + "epoch": 0.623458964424093, + "grad_norm": 1.153555154800415, + "learning_rate": 7.814270968066956e-05, + "loss": 0.0538, + "step": 10620 + }, + { + "epoch": 0.6240460255958671, + "grad_norm": 4.492584705352783, + "learning_rate": 7.810447486808032e-05, + "loss": 0.0557, + "step": 10630 + }, + { + "epoch": 0.6246330867676412, + "grad_norm": 4.326468467712402, + "learning_rate": 7.806621601544832e-05, + "loss": 0.0405, + "step": 10640 + }, + { + "epoch": 0.6252201479394153, + "grad_norm": 2.076035499572754, + "learning_rate": 7.802793315549948e-05, + "loss": 0.0468, + "step": 10650 + }, + { + "epoch": 0.6258072091111894, + "grad_norm": 3.4883980751037598, + "learning_rate": 7.798962632098024e-05, + "loss": 0.0497, + "step": 10660 + }, + { + "epoch": 0.6263942702829635, + "grad_norm": 2.0908708572387695, + "learning_rate": 7.795129554465754e-05, + "loss": 0.0551, + "step": 10670 + }, + { + "epoch": 0.6269813314547376, + "grad_norm": 3.256268262863159, + "learning_rate": 7.791294085931882e-05, + "loss": 0.0359, + "step": 10680 + }, + { + "epoch": 0.6275683926265117, + "grad_norm": 2.078904867172241, + "learning_rate": 7.787456229777196e-05, + "loss": 0.059, + "step": 10690 + }, + { + "epoch": 0.6281554537982857, + "grad_norm": 0.706121563911438, + "learning_rate": 7.783615989284527e-05, + "loss": 0.0184, + "step": 10700 + }, + { + "epoch": 0.6287425149700598, + "grad_norm": 2.253751754760742, + "learning_rate": 7.779773367738743e-05, + "loss": 0.0463, + "step": 10710 + }, + { + "epoch": 0.6293295761418339, + "grad_norm": 1.3410460948944092, + "learning_rate": 7.775928368426751e-05, + "loss": 0.0303, + "step": 10720 + }, + { + "epoch": 0.629916637313608, + "grad_norm": 2.1979212760925293, + "learning_rate": 7.772080994637494e-05, + "loss": 0.0611, + "step": 10730 + }, + { + "epoch": 0.6305036984853821, + "grad_norm": 2.3805341720581055, + "learning_rate": 7.768231249661942e-05, + "loss": 0.0401, + "step": 10740 + }, + { + "epoch": 0.6310907596571563, + "grad_norm": 1.889790654182434, + "learning_rate": 7.764379136793096e-05, + "loss": 0.0368, + "step": 10750 + }, + { + "epoch": 0.6316778208289304, + "grad_norm": 2.839601516723633, + "learning_rate": 7.760524659325981e-05, + "loss": 0.0513, + "step": 10760 + }, + { + "epoch": 0.6322648820007045, + "grad_norm": 0.5804122686386108, + "learning_rate": 7.756667820557644e-05, + "loss": 0.0489, + "step": 10770 + }, + { + "epoch": 0.6328519431724786, + "grad_norm": 0.8815114498138428, + "learning_rate": 7.752808623787152e-05, + "loss": 0.0525, + "step": 10780 + }, + { + "epoch": 0.6334390043442527, + "grad_norm": 1.7328076362609863, + "learning_rate": 7.748947072315592e-05, + "loss": 0.0329, + "step": 10790 + }, + { + "epoch": 0.6340260655160268, + "grad_norm": 1.2176601886749268, + "learning_rate": 7.745083169446064e-05, + "loss": 0.0308, + "step": 10800 + }, + { + "epoch": 0.6346131266878009, + "grad_norm": 4.364448547363281, + "learning_rate": 7.741216918483674e-05, + "loss": 0.0378, + "step": 10810 + }, + { + "epoch": 0.635200187859575, + "grad_norm": 3.1577255725860596, + "learning_rate": 7.737348322735545e-05, + "loss": 0.0526, + "step": 10820 + }, + { + "epoch": 0.6357872490313491, + "grad_norm": 1.5735069513320923, + "learning_rate": 7.7334773855108e-05, + "loss": 0.0534, + "step": 10830 + }, + { + "epoch": 0.6363743102031232, + "grad_norm": 1.8642213344573975, + "learning_rate": 7.729604110120564e-05, + "loss": 0.0411, + "step": 10840 + }, + { + "epoch": 
0.6369613713748973, + "grad_norm": 2.4640445709228516, + "learning_rate": 7.725728499877967e-05, + "loss": 0.046, + "step": 10850 + }, + { + "epoch": 0.6375484325466714, + "grad_norm": 2.915814161300659, + "learning_rate": 7.721850558098136e-05, + "loss": 0.0391, + "step": 10860 + }, + { + "epoch": 0.6381354937184455, + "grad_norm": 3.9695358276367188, + "learning_rate": 7.717970288098184e-05, + "loss": 0.0569, + "step": 10870 + }, + { + "epoch": 0.6387225548902196, + "grad_norm": 1.0098638534545898, + "learning_rate": 7.714087693197227e-05, + "loss": 0.0359, + "step": 10880 + }, + { + "epoch": 0.6393096160619937, + "grad_norm": 2.7655649185180664, + "learning_rate": 7.710202776716362e-05, + "loss": 0.0463, + "step": 10890 + }, + { + "epoch": 0.6398966772337678, + "grad_norm": 2.5925843715667725, + "learning_rate": 7.706315541978673e-05, + "loss": 0.0474, + "step": 10900 + }, + { + "epoch": 0.6404837384055418, + "grad_norm": 1.8666685819625854, + "learning_rate": 7.702425992309229e-05, + "loss": 0.0515, + "step": 10910 + }, + { + "epoch": 0.6410707995773159, + "grad_norm": 0.22611987590789795, + "learning_rate": 7.698534131035077e-05, + "loss": 0.0389, + "step": 10920 + }, + { + "epoch": 0.64165786074909, + "grad_norm": 1.4322317838668823, + "learning_rate": 7.694639961485246e-05, + "loss": 0.0311, + "step": 10930 + }, + { + "epoch": 0.6422449219208641, + "grad_norm": 1.4518626928329468, + "learning_rate": 7.69074348699073e-05, + "loss": 0.0391, + "step": 10940 + }, + { + "epoch": 0.6428319830926382, + "grad_norm": 2.698389768600464, + "learning_rate": 7.686844710884506e-05, + "loss": 0.0352, + "step": 10950 + }, + { + "epoch": 0.6434190442644123, + "grad_norm": 1.6068532466888428, + "learning_rate": 7.682943636501512e-05, + "loss": 0.0476, + "step": 10960 + }, + { + "epoch": 0.6440061054361864, + "grad_norm": 1.8866565227508545, + "learning_rate": 7.679040267178653e-05, + "loss": 0.0215, + "step": 10970 + }, + { + "epoch": 0.6445931666079605, + "grad_norm": 2.0535993576049805, + "learning_rate": 7.675134606254799e-05, + "loss": 0.0689, + "step": 10980 + }, + { + "epoch": 0.6451802277797346, + "grad_norm": 0.8703666925430298, + "learning_rate": 7.67122665707078e-05, + "loss": 0.0336, + "step": 10990 + }, + { + "epoch": 0.6457672889515087, + "grad_norm": 3.0260536670684814, + "learning_rate": 7.667316422969383e-05, + "loss": 0.053, + "step": 11000 + }, + { + "epoch": 0.6463543501232828, + "grad_norm": 0.925990641117096, + "learning_rate": 7.663403907295348e-05, + "loss": 0.0223, + "step": 11010 + }, + { + "epoch": 0.6469414112950569, + "grad_norm": 2.501262664794922, + "learning_rate": 7.65948911339537e-05, + "loss": 0.0484, + "step": 11020 + }, + { + "epoch": 0.6475284724668311, + "grad_norm": 2.3232693672180176, + "learning_rate": 7.655572044618086e-05, + "loss": 0.0504, + "step": 11030 + }, + { + "epoch": 0.6481155336386052, + "grad_norm": 3.446190357208252, + "learning_rate": 7.65165270431409e-05, + "loss": 0.0275, + "step": 11040 + }, + { + "epoch": 0.6487025948103793, + "grad_norm": 2.269209861755371, + "learning_rate": 7.647731095835906e-05, + "loss": 0.0401, + "step": 11050 + }, + { + "epoch": 0.6492896559821534, + "grad_norm": 1.462114930152893, + "learning_rate": 7.64380722253801e-05, + "loss": 0.0355, + "step": 11060 + }, + { + "epoch": 0.6498767171539275, + "grad_norm": 2.42257022857666, + "learning_rate": 7.639881087776807e-05, + "loss": 0.0463, + "step": 11070 + }, + { + "epoch": 0.6504637783257016, + "grad_norm": 5.560107231140137, + "learning_rate": 7.635952694910637e-05, + 
"loss": 0.035, + "step": 11080 + }, + { + "epoch": 0.6510508394974757, + "grad_norm": 2.6349921226501465, + "learning_rate": 7.632022047299781e-05, + "loss": 0.0316, + "step": 11090 + }, + { + "epoch": 0.6516379006692498, + "grad_norm": 2.049649715423584, + "learning_rate": 7.628089148306434e-05, + "loss": 0.0467, + "step": 11100 + }, + { + "epoch": 0.6522249618410239, + "grad_norm": 1.0041853189468384, + "learning_rate": 7.624154001294729e-05, + "loss": 0.0445, + "step": 11110 + }, + { + "epoch": 0.652812023012798, + "grad_norm": 0.8886867165565491, + "learning_rate": 7.620216609630715e-05, + "loss": 0.0431, + "step": 11120 + }, + { + "epoch": 0.653399084184572, + "grad_norm": 0.880000114440918, + "learning_rate": 7.616276976682365e-05, + "loss": 0.0664, + "step": 11130 + }, + { + "epoch": 0.6539861453563461, + "grad_norm": 1.4624862670898438, + "learning_rate": 7.612335105819565e-05, + "loss": 0.0207, + "step": 11140 + }, + { + "epoch": 0.6545732065281202, + "grad_norm": 1.1052141189575195, + "learning_rate": 7.608391000414118e-05, + "loss": 0.0447, + "step": 11150 + }, + { + "epoch": 0.6551602676998943, + "grad_norm": 2.376422166824341, + "learning_rate": 7.604444663839743e-05, + "loss": 0.0379, + "step": 11160 + }, + { + "epoch": 0.6557473288716684, + "grad_norm": 1.6930925846099854, + "learning_rate": 7.600496099472057e-05, + "loss": 0.0634, + "step": 11170 + }, + { + "epoch": 0.6563343900434425, + "grad_norm": 0.8461235165596008, + "learning_rate": 7.59654531068859e-05, + "loss": 0.0459, + "step": 11180 + }, + { + "epoch": 0.6569214512152166, + "grad_norm": 1.6959190368652344, + "learning_rate": 7.592592300868774e-05, + "loss": 0.0364, + "step": 11190 + }, + { + "epoch": 0.6575085123869907, + "grad_norm": 3.24035906791687, + "learning_rate": 7.588637073393935e-05, + "loss": 0.0803, + "step": 11200 + }, + { + "epoch": 0.6580955735587648, + "grad_norm": 1.1434074640274048, + "learning_rate": 7.58467963164731e-05, + "loss": 0.0604, + "step": 11210 + }, + { + "epoch": 0.6586826347305389, + "grad_norm": 3.274583578109741, + "learning_rate": 7.580719979014012e-05, + "loss": 0.0367, + "step": 11220 + }, + { + "epoch": 0.659269695902313, + "grad_norm": 2.5319130420684814, + "learning_rate": 7.576758118881056e-05, + "loss": 0.0811, + "step": 11230 + }, + { + "epoch": 0.6598567570740871, + "grad_norm": 3.002586603164673, + "learning_rate": 7.572794054637347e-05, + "loss": 0.0592, + "step": 11240 + }, + { + "epoch": 0.6604438182458612, + "grad_norm": 2.6163065433502197, + "learning_rate": 7.568827789673665e-05, + "loss": 0.046, + "step": 11250 + }, + { + "epoch": 0.6610308794176353, + "grad_norm": 3.5569369792938232, + "learning_rate": 7.564859327382685e-05, + "loss": 0.0442, + "step": 11260 + }, + { + "epoch": 0.6616179405894094, + "grad_norm": 2.348433017730713, + "learning_rate": 7.560888671158953e-05, + "loss": 0.0534, + "step": 11270 + }, + { + "epoch": 0.6622050017611835, + "grad_norm": 2.007448196411133, + "learning_rate": 7.556915824398894e-05, + "loss": 0.0336, + "step": 11280 + }, + { + "epoch": 0.6627920629329576, + "grad_norm": 1.9800559282302856, + "learning_rate": 7.552940790500806e-05, + "loss": 0.0425, + "step": 11290 + }, + { + "epoch": 0.6633791241047318, + "grad_norm": 1.264578104019165, + "learning_rate": 7.54896357286486e-05, + "loss": 0.0261, + "step": 11300 + }, + { + "epoch": 0.6639661852765059, + "grad_norm": 0.8466734290122986, + "learning_rate": 7.544984174893095e-05, + "loss": 0.04, + "step": 11310 + }, + { + "epoch": 0.66455324644828, + "grad_norm": 
1.571954369544983, + "learning_rate": 7.54100259998941e-05, + "loss": 0.0585, + "step": 11320 + }, + { + "epoch": 0.665140307620054, + "grad_norm": 2.239985227584839, + "learning_rate": 7.537018851559576e-05, + "loss": 0.0498, + "step": 11330 + }, + { + "epoch": 0.6657273687918281, + "grad_norm": 1.2417446374893188, + "learning_rate": 7.533032933011209e-05, + "loss": 0.0339, + "step": 11340 + }, + { + "epoch": 0.6663144299636022, + "grad_norm": 2.6729910373687744, + "learning_rate": 7.529044847753795e-05, + "loss": 0.0284, + "step": 11350 + }, + { + "epoch": 0.6669014911353763, + "grad_norm": 2.4202144145965576, + "learning_rate": 7.525054599198666e-05, + "loss": 0.0509, + "step": 11360 + }, + { + "epoch": 0.6674885523071504, + "grad_norm": 1.1169862747192383, + "learning_rate": 7.521062190759005e-05, + "loss": 0.0324, + "step": 11370 + }, + { + "epoch": 0.6680756134789245, + "grad_norm": 1.7308237552642822, + "learning_rate": 7.517067625849846e-05, + "loss": 0.0264, + "step": 11380 + }, + { + "epoch": 0.6686626746506986, + "grad_norm": 3.76955509185791, + "learning_rate": 7.513070907888065e-05, + "loss": 0.0498, + "step": 11390 + }, + { + "epoch": 0.6692497358224727, + "grad_norm": 0.26449015736579895, + "learning_rate": 7.509072040292376e-05, + "loss": 0.0293, + "step": 11400 + }, + { + "epoch": 0.6698367969942468, + "grad_norm": 1.2897920608520508, + "learning_rate": 7.505071026483337e-05, + "loss": 0.0454, + "step": 11410 + }, + { + "epoch": 0.6704238581660209, + "grad_norm": 2.9363067150115967, + "learning_rate": 7.501067869883344e-05, + "loss": 0.0548, + "step": 11420 + }, + { + "epoch": 0.671010919337795, + "grad_norm": 3.353597640991211, + "learning_rate": 7.49706257391662e-05, + "loss": 0.041, + "step": 11430 + }, + { + "epoch": 0.6715979805095691, + "grad_norm": 0.747282087802887, + "learning_rate": 7.49305514200922e-05, + "loss": 0.0512, + "step": 11440 + }, + { + "epoch": 0.6721850416813432, + "grad_norm": 5.312231063842773, + "learning_rate": 7.489045577589026e-05, + "loss": 0.0655, + "step": 11450 + }, + { + "epoch": 0.6727721028531173, + "grad_norm": 1.659082293510437, + "learning_rate": 7.485033884085746e-05, + "loss": 0.042, + "step": 11460 + }, + { + "epoch": 0.6733591640248914, + "grad_norm": 2.6412506103515625, + "learning_rate": 7.481020064930908e-05, + "loss": 0.0392, + "step": 11470 + }, + { + "epoch": 0.6739462251966655, + "grad_norm": 1.4856112003326416, + "learning_rate": 7.477004123557855e-05, + "loss": 0.0382, + "step": 11480 + }, + { + "epoch": 0.6745332863684396, + "grad_norm": 1.4802236557006836, + "learning_rate": 7.472986063401751e-05, + "loss": 0.0414, + "step": 11490 + }, + { + "epoch": 0.6751203475402137, + "grad_norm": 2.9699227809906006, + "learning_rate": 7.46896588789957e-05, + "loss": 0.0329, + "step": 11500 + }, + { + "epoch": 0.6757074087119878, + "grad_norm": 1.5497137308120728, + "learning_rate": 7.464943600490094e-05, + "loss": 0.0432, + "step": 11510 + }, + { + "epoch": 0.6762944698837619, + "grad_norm": 0.049564044922590256, + "learning_rate": 7.46091920461391e-05, + "loss": 0.0538, + "step": 11520 + }, + { + "epoch": 0.676881531055536, + "grad_norm": 0.6027920246124268, + "learning_rate": 7.456892703713415e-05, + "loss": 0.0335, + "step": 11530 + }, + { + "epoch": 0.67746859222731, + "grad_norm": 2.308122158050537, + "learning_rate": 7.452864101232798e-05, + "loss": 0.0637, + "step": 11540 + }, + { + "epoch": 0.6780556533990841, + "grad_norm": 1.7618777751922607, + "learning_rate": 7.448833400618055e-05, + "loss": 0.0603, + "step": 11550 + 
}, + { + "epoch": 0.6786427145708582, + "grad_norm": 0.8958557844161987, + "learning_rate": 7.44480060531697e-05, + "loss": 0.0281, + "step": 11560 + }, + { + "epoch": 0.6792297757426324, + "grad_norm": 1.6657615900039673, + "learning_rate": 7.440765718779124e-05, + "loss": 0.0419, + "step": 11570 + }, + { + "epoch": 0.6798168369144065, + "grad_norm": 0.36698830127716064, + "learning_rate": 7.436728744455877e-05, + "loss": 0.0448, + "step": 11580 + }, + { + "epoch": 0.6804038980861806, + "grad_norm": 1.7312005758285522, + "learning_rate": 7.432689685800386e-05, + "loss": 0.0454, + "step": 11590 + }, + { + "epoch": 0.6809909592579547, + "grad_norm": 1.5802057981491089, + "learning_rate": 7.428648546267586e-05, + "loss": 0.0315, + "step": 11600 + }, + { + "epoch": 0.6815780204297288, + "grad_norm": 0.3204239308834076, + "learning_rate": 7.42460532931419e-05, + "loss": 0.0267, + "step": 11610 + }, + { + "epoch": 0.6821650816015029, + "grad_norm": 1.8985480070114136, + "learning_rate": 7.420560038398694e-05, + "loss": 0.0274, + "step": 11620 + }, + { + "epoch": 0.682752142773277, + "grad_norm": 2.6278510093688965, + "learning_rate": 7.416512676981359e-05, + "loss": 0.0369, + "step": 11630 + }, + { + "epoch": 0.6833392039450511, + "grad_norm": 4.143795490264893, + "learning_rate": 7.412463248524229e-05, + "loss": 0.0383, + "step": 11640 + }, + { + "epoch": 0.6839262651168252, + "grad_norm": 1.462797999382019, + "learning_rate": 7.408411756491104e-05, + "loss": 0.0269, + "step": 11650 + }, + { + "epoch": 0.6845133262885993, + "grad_norm": 3.339895725250244, + "learning_rate": 7.404358204347557e-05, + "loss": 0.0673, + "step": 11660 + }, + { + "epoch": 0.6851003874603734, + "grad_norm": 3.2953529357910156, + "learning_rate": 7.400302595560919e-05, + "loss": 0.0327, + "step": 11670 + }, + { + "epoch": 0.6856874486321475, + "grad_norm": 2.8305249214172363, + "learning_rate": 7.396244933600285e-05, + "loss": 0.0425, + "step": 11680 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 2.100759983062744, + "learning_rate": 7.3921852219365e-05, + "loss": 0.0421, + "step": 11690 + }, + { + "epoch": 0.6868615709756957, + "grad_norm": 0.9254026412963867, + "learning_rate": 7.388123464042167e-05, + "loss": 0.0458, + "step": 11700 + }, + { + "epoch": 0.6874486321474698, + "grad_norm": 2.050107955932617, + "learning_rate": 7.38405966339164e-05, + "loss": 0.036, + "step": 11710 + }, + { + "epoch": 0.6880356933192439, + "grad_norm": 3.631403923034668, + "learning_rate": 7.379993823461014e-05, + "loss": 0.052, + "step": 11720 + }, + { + "epoch": 0.688622754491018, + "grad_norm": 1.1665562391281128, + "learning_rate": 7.375925947728135e-05, + "loss": 0.043, + "step": 11730 + }, + { + "epoch": 0.689209815662792, + "grad_norm": 0.6579986214637756, + "learning_rate": 7.371856039672586e-05, + "loss": 0.0312, + "step": 11740 + }, + { + "epoch": 0.6897968768345661, + "grad_norm": 2.9885151386260986, + "learning_rate": 7.367784102775694e-05, + "loss": 0.0826, + "step": 11750 + }, + { + "epoch": 0.6903839380063402, + "grad_norm": 1.8481273651123047, + "learning_rate": 7.363710140520514e-05, + "loss": 0.024, + "step": 11760 + }, + { + "epoch": 0.6909709991781143, + "grad_norm": 2.1871728897094727, + "learning_rate": 7.35963415639184e-05, + "loss": 0.0322, + "step": 11770 + }, + { + "epoch": 0.6915580603498884, + "grad_norm": 3.232172727584839, + "learning_rate": 7.35555615387619e-05, + "loss": 0.0445, + "step": 11780 + }, + { + "epoch": 0.6921451215216625, + "grad_norm": 0.968634843826294, + "learning_rate": 
7.351476136461814e-05, + "loss": 0.0682, + "step": 11790 + }, + { + "epoch": 0.6927321826934366, + "grad_norm": 0.8169238567352295, + "learning_rate": 7.34739410763868e-05, + "loss": 0.0445, + "step": 11800 + }, + { + "epoch": 0.6933192438652107, + "grad_norm": 1.6446173191070557, + "learning_rate": 7.34331007089848e-05, + "loss": 0.0387, + "step": 11810 + }, + { + "epoch": 0.6939063050369848, + "grad_norm": 1.1081836223602295, + "learning_rate": 7.339224029734623e-05, + "loss": 0.0327, + "step": 11820 + }, + { + "epoch": 0.6944933662087589, + "grad_norm": 2.7306418418884277, + "learning_rate": 7.335135987642233e-05, + "loss": 0.029, + "step": 11830 + }, + { + "epoch": 0.6950804273805331, + "grad_norm": 1.2643860578536987, + "learning_rate": 7.331045948118144e-05, + "loss": 0.0325, + "step": 11840 + }, + { + "epoch": 0.6956674885523072, + "grad_norm": 3.4605374336242676, + "learning_rate": 7.3269539146609e-05, + "loss": 0.0443, + "step": 11850 + }, + { + "epoch": 0.6962545497240813, + "grad_norm": 2.64870548248291, + "learning_rate": 7.32285989077075e-05, + "loss": 0.0234, + "step": 11860 + }, + { + "epoch": 0.6968416108958554, + "grad_norm": 1.5398218631744385, + "learning_rate": 7.318763879949644e-05, + "loss": 0.025, + "step": 11870 + }, + { + "epoch": 0.6974286720676295, + "grad_norm": 3.69907546043396, + "learning_rate": 7.314665885701234e-05, + "loss": 0.0346, + "step": 11880 + }, + { + "epoch": 0.6980157332394036, + "grad_norm": 1.5674400329589844, + "learning_rate": 7.310565911530869e-05, + "loss": 0.04, + "step": 11890 + }, + { + "epoch": 0.6986027944111777, + "grad_norm": 4.008990287780762, + "learning_rate": 7.30646396094559e-05, + "loss": 0.0437, + "step": 11900 + }, + { + "epoch": 0.6991898555829518, + "grad_norm": 1.03583824634552, + "learning_rate": 7.302360037454128e-05, + "loss": 0.0426, + "step": 11910 + }, + { + "epoch": 0.6997769167547259, + "grad_norm": 1.5053350925445557, + "learning_rate": 7.298254144566901e-05, + "loss": 0.0393, + "step": 11920 + }, + { + "epoch": 0.7003639779265, + "grad_norm": 2.7704758644104004, + "learning_rate": 7.294146285796015e-05, + "loss": 0.0271, + "step": 11930 + }, + { + "epoch": 0.700951039098274, + "grad_norm": 1.0304514169692993, + "learning_rate": 7.290036464655257e-05, + "loss": 0.0513, + "step": 11940 + }, + { + "epoch": 0.7015381002700481, + "grad_norm": 2.5655267238616943, + "learning_rate": 7.285924684660089e-05, + "loss": 0.032, + "step": 11950 + }, + { + "epoch": 0.7021251614418222, + "grad_norm": 1.0380624532699585, + "learning_rate": 7.281810949327651e-05, + "loss": 0.0326, + "step": 11960 + }, + { + "epoch": 0.7027122226135963, + "grad_norm": 3.0011343955993652, + "learning_rate": 7.277695262176756e-05, + "loss": 0.0214, + "step": 11970 + }, + { + "epoch": 0.7032992837853704, + "grad_norm": 0.4370494782924652, + "learning_rate": 7.273577626727884e-05, + "loss": 0.0373, + "step": 11980 + }, + { + "epoch": 0.7038863449571445, + "grad_norm": 3.3337936401367188, + "learning_rate": 7.269458046503187e-05, + "loss": 0.0315, + "step": 11990 + }, + { + "epoch": 0.7044734061289186, + "grad_norm": 3.230440378189087, + "learning_rate": 7.265336525026476e-05, + "loss": 0.03, + "step": 12000 + }, + { + "epoch": 0.7044734061289186, + "eval_loss": 0.44979673624038696, + "eval_runtime": 269.5683, + "eval_samples_per_second": 3.506, + "eval_steps_per_second": 3.506, + "step": 12000 + }, + { + "epoch": 0.7050604673006927, + "grad_norm": 1.5325340032577515, + "learning_rate": 7.26121306582322e-05, + "loss": 0.0634, + "step": 12010 + }, + { + 
"epoch": 0.7056475284724668, + "grad_norm": 0.4213601350784302, + "learning_rate": 7.257087672420553e-05, + "loss": 0.0311, + "step": 12020 + }, + { + "epoch": 0.7062345896442409, + "grad_norm": 1.2271337509155273, + "learning_rate": 7.252960348347258e-05, + "loss": 0.0705, + "step": 12030 + }, + { + "epoch": 0.706821650816015, + "grad_norm": 1.3212566375732422, + "learning_rate": 7.24883109713377e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7074087119877891, + "grad_norm": 1.8340243101119995, + "learning_rate": 7.244699922312176e-05, + "loss": 0.0405, + "step": 12050 + }, + { + "epoch": 0.7079957731595632, + "grad_norm": 0.9552738666534424, + "learning_rate": 7.240566827416204e-05, + "loss": 0.0315, + "step": 12060 + }, + { + "epoch": 0.7085828343313373, + "grad_norm": 2.8367209434509277, + "learning_rate": 7.236431815981223e-05, + "loss": 0.061, + "step": 12070 + }, + { + "epoch": 0.7091698955031114, + "grad_norm": 4.737795352935791, + "learning_rate": 7.23229489154425e-05, + "loss": 0.0753, + "step": 12080 + }, + { + "epoch": 0.7097569566748855, + "grad_norm": 2.4975411891937256, + "learning_rate": 7.22815605764393e-05, + "loss": 0.0332, + "step": 12090 + }, + { + "epoch": 0.7103440178466596, + "grad_norm": 1.2418122291564941, + "learning_rate": 7.224015317820544e-05, + "loss": 0.0576, + "step": 12100 + }, + { + "epoch": 0.7109310790184338, + "grad_norm": 1.065011978149414, + "learning_rate": 7.219872675616006e-05, + "loss": 0.0344, + "step": 12110 + }, + { + "epoch": 0.7115181401902079, + "grad_norm": 4.0738067626953125, + "learning_rate": 7.215728134573852e-05, + "loss": 0.0333, + "step": 12120 + }, + { + "epoch": 0.712105201361982, + "grad_norm": 1.6276997327804565, + "learning_rate": 7.211581698239245e-05, + "loss": 0.0349, + "step": 12130 + }, + { + "epoch": 0.7126922625337561, + "grad_norm": 4.080755233764648, + "learning_rate": 7.207433370158972e-05, + "loss": 0.0611, + "step": 12140 + }, + { + "epoch": 0.7132793237055302, + "grad_norm": 1.5337743759155273, + "learning_rate": 7.203283153881432e-05, + "loss": 0.0603, + "step": 12150 + }, + { + "epoch": 0.7138663848773042, + "grad_norm": 5.253373146057129, + "learning_rate": 7.199131052956644e-05, + "loss": 0.055, + "step": 12160 + }, + { + "epoch": 0.7144534460490783, + "grad_norm": 1.5085833072662354, + "learning_rate": 7.194977070936239e-05, + "loss": 0.0312, + "step": 12170 + }, + { + "epoch": 0.7150405072208524, + "grad_norm": 0.32057294249534607, + "learning_rate": 7.190821211373453e-05, + "loss": 0.0339, + "step": 12180 + }, + { + "epoch": 0.7156275683926265, + "grad_norm": 1.0474879741668701, + "learning_rate": 7.18666347782313e-05, + "loss": 0.0597, + "step": 12190 + }, + { + "epoch": 0.7162146295644006, + "grad_norm": 2.558286666870117, + "learning_rate": 7.182503873841722e-05, + "loss": 0.0569, + "step": 12200 + }, + { + "epoch": 0.7168016907361747, + "grad_norm": 2.7508225440979004, + "learning_rate": 7.178342402987272e-05, + "loss": 0.0298, + "step": 12210 + }, + { + "epoch": 0.7173887519079488, + "grad_norm": 0.7895553112030029, + "learning_rate": 7.174179068819428e-05, + "loss": 0.0598, + "step": 12220 + }, + { + "epoch": 0.7179758130797229, + "grad_norm": 3.0784595012664795, + "learning_rate": 7.170013874899426e-05, + "loss": 0.05, + "step": 12230 + }, + { + "epoch": 0.718562874251497, + "grad_norm": 0.4165332317352295, + "learning_rate": 7.165846824790095e-05, + "loss": 0.0266, + "step": 12240 + }, + { + "epoch": 0.7191499354232711, + "grad_norm": 3.0013482570648193, + "learning_rate": 
7.161677922055853e-05, + "loss": 0.0255, + "step": 12250 + }, + { + "epoch": 0.7197369965950452, + "grad_norm": 1.8503293991088867, + "learning_rate": 7.157507170262701e-05, + "loss": 0.0402, + "step": 12260 + }, + { + "epoch": 0.7203240577668193, + "grad_norm": 3.0309407711029053, + "learning_rate": 7.153334572978221e-05, + "loss": 0.0534, + "step": 12270 + }, + { + "epoch": 0.7209111189385934, + "grad_norm": 2.4800071716308594, + "learning_rate": 7.149160133771577e-05, + "loss": 0.0406, + "step": 12280 + }, + { + "epoch": 0.7214981801103675, + "grad_norm": 0.8500409126281738, + "learning_rate": 7.144983856213507e-05, + "loss": 0.0289, + "step": 12290 + }, + { + "epoch": 0.7220852412821416, + "grad_norm": 1.53009831905365, + "learning_rate": 7.140805743876317e-05, + "loss": 0.0235, + "step": 12300 + }, + { + "epoch": 0.7226723024539157, + "grad_norm": 2.7796554565429688, + "learning_rate": 7.136625800333887e-05, + "loss": 0.0256, + "step": 12310 + }, + { + "epoch": 0.7232593636256898, + "grad_norm": 1.4432597160339355, + "learning_rate": 7.132444029161667e-05, + "loss": 0.0232, + "step": 12320 + }, + { + "epoch": 0.7238464247974639, + "grad_norm": 1.8305975198745728, + "learning_rate": 7.12826043393666e-05, + "loss": 0.062, + "step": 12330 + }, + { + "epoch": 0.724433485969238, + "grad_norm": 1.6505351066589355, + "learning_rate": 7.12407501823744e-05, + "loss": 0.0345, + "step": 12340 + }, + { + "epoch": 0.725020547141012, + "grad_norm": 4.550604820251465, + "learning_rate": 7.11988778564413e-05, + "loss": 0.0348, + "step": 12350 + }, + { + "epoch": 0.7256076083127861, + "grad_norm": 0.8290696740150452, + "learning_rate": 7.115698739738412e-05, + "loss": 0.0171, + "step": 12360 + }, + { + "epoch": 0.7261946694845602, + "grad_norm": 2.516306161880493, + "learning_rate": 7.111507884103518e-05, + "loss": 0.0363, + "step": 12370 + }, + { + "epoch": 0.7267817306563343, + "grad_norm": 1.364721655845642, + "learning_rate": 7.107315222324227e-05, + "loss": 0.0453, + "step": 12380 + }, + { + "epoch": 0.7273687918281085, + "grad_norm": 1.1648774147033691, + "learning_rate": 7.103120757986864e-05, + "loss": 0.0161, + "step": 12390 + }, + { + "epoch": 0.7279558529998826, + "grad_norm": 2.6282575130462646, + "learning_rate": 7.098924494679295e-05, + "loss": 0.0609, + "step": 12400 + }, + { + "epoch": 0.7285429141716567, + "grad_norm": 1.9648375511169434, + "learning_rate": 7.094726435990926e-05, + "loss": 0.0375, + "step": 12410 + }, + { + "epoch": 0.7291299753434308, + "grad_norm": 1.3704135417938232, + "learning_rate": 7.090526585512696e-05, + "loss": 0.0608, + "step": 12420 + }, + { + "epoch": 0.7297170365152049, + "grad_norm": 0.3298032581806183, + "learning_rate": 7.086324946837081e-05, + "loss": 0.0334, + "step": 12430 + }, + { + "epoch": 0.730304097686979, + "grad_norm": 0.2856060564517975, + "learning_rate": 7.082121523558083e-05, + "loss": 0.0272, + "step": 12440 + }, + { + "epoch": 0.7308911588587531, + "grad_norm": 3.429799795150757, + "learning_rate": 7.077916319271232e-05, + "loss": 0.0704, + "step": 12450 + }, + { + "epoch": 0.7314782200305272, + "grad_norm": 1.299608826637268, + "learning_rate": 7.073709337573581e-05, + "loss": 0.0311, + "step": 12460 + }, + { + "epoch": 0.7320652812023013, + "grad_norm": 1.4074573516845703, + "learning_rate": 7.069500582063702e-05, + "loss": 0.0386, + "step": 12470 + }, + { + "epoch": 0.7326523423740754, + "grad_norm": 2.08955979347229, + "learning_rate": 7.06529005634169e-05, + "loss": 0.0289, + "step": 12480 + }, + { + "epoch": 0.7332394035458495, + 
"grad_norm": 3.107790946960449, + "learning_rate": 7.061077764009147e-05, + "loss": 0.0339, + "step": 12490 + }, + { + "epoch": 0.7338264647176236, + "grad_norm": 1.7810473442077637, + "learning_rate": 7.05686370866919e-05, + "loss": 0.0369, + "step": 12500 + }, + { + "epoch": 0.7344135258893977, + "grad_norm": 2.4785356521606445, + "learning_rate": 7.052647893926442e-05, + "loss": 0.0699, + "step": 12510 + }, + { + "epoch": 0.7350005870611718, + "grad_norm": 5.314114570617676, + "learning_rate": 7.048430323387034e-05, + "loss": 0.0359, + "step": 12520 + }, + { + "epoch": 0.7355876482329459, + "grad_norm": 1.8828675746917725, + "learning_rate": 7.044211000658595e-05, + "loss": 0.0446, + "step": 12530 + }, + { + "epoch": 0.73617470940472, + "grad_norm": 1.7500027418136597, + "learning_rate": 7.039989929350257e-05, + "loss": 0.0357, + "step": 12540 + }, + { + "epoch": 0.7367617705764941, + "grad_norm": 0.32012036442756653, + "learning_rate": 7.035767113072645e-05, + "loss": 0.022, + "step": 12550 + }, + { + "epoch": 0.7373488317482682, + "grad_norm": 3.6539933681488037, + "learning_rate": 7.031542555437876e-05, + "loss": 0.0401, + "step": 12560 + }, + { + "epoch": 0.7379358929200422, + "grad_norm": 2.1202316284179688, + "learning_rate": 7.027316260059558e-05, + "loss": 0.0385, + "step": 12570 + }, + { + "epoch": 0.7385229540918163, + "grad_norm": 1.3387161493301392, + "learning_rate": 7.023088230552787e-05, + "loss": 0.0468, + "step": 12580 + }, + { + "epoch": 0.7391100152635904, + "grad_norm": 1.752645492553711, + "learning_rate": 7.018858470534138e-05, + "loss": 0.0502, + "step": 12590 + }, + { + "epoch": 0.7396970764353645, + "grad_norm": 0.23137418925762177, + "learning_rate": 7.014626983621669e-05, + "loss": 0.0514, + "step": 12600 + }, + { + "epoch": 0.7402841376071386, + "grad_norm": 2.1703407764434814, + "learning_rate": 7.010393773434917e-05, + "loss": 0.0334, + "step": 12610 + }, + { + "epoch": 0.7408711987789127, + "grad_norm": 3.549792528152466, + "learning_rate": 7.006158843594887e-05, + "loss": 0.0662, + "step": 12620 + }, + { + "epoch": 0.7414582599506868, + "grad_norm": 2.0263309478759766, + "learning_rate": 7.001922197724063e-05, + "loss": 0.0223, + "step": 12630 + }, + { + "epoch": 0.7420453211224609, + "grad_norm": 2.800609588623047, + "learning_rate": 6.997683839446392e-05, + "loss": 0.0482, + "step": 12640 + }, + { + "epoch": 0.742632382294235, + "grad_norm": 2.8997108936309814, + "learning_rate": 6.993443772387284e-05, + "loss": 0.0386, + "step": 12650 + }, + { + "epoch": 0.7432194434660092, + "grad_norm": 6.5187153816223145, + "learning_rate": 6.989202000173614e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.7438065046377833, + "grad_norm": 2.201885223388672, + "learning_rate": 6.984958526433716e-05, + "loss": 0.0508, + "step": 12670 + }, + { + "epoch": 0.7443935658095574, + "grad_norm": 1.613524317741394, + "learning_rate": 6.980713354797376e-05, + "loss": 0.0248, + "step": 12680 + }, + { + "epoch": 0.7449806269813315, + "grad_norm": 0.9656441807746887, + "learning_rate": 6.97646648889584e-05, + "loss": 0.0284, + "step": 12690 + }, + { + "epoch": 0.7455676881531056, + "grad_norm": 0.754646897315979, + "learning_rate": 6.972217932361792e-05, + "loss": 0.0527, + "step": 12700 + }, + { + "epoch": 0.7461547493248797, + "grad_norm": 3.3350586891174316, + "learning_rate": 6.967967688829369e-05, + "loss": 0.0664, + "step": 12710 + }, + { + "epoch": 0.7467418104966538, + "grad_norm": 1.6783524751663208, + "learning_rate": 6.963715761934151e-05, + "loss": 0.0622, 
+ "step": 12720 + }, + { + "epoch": 0.7473288716684279, + "grad_norm": 1.228952407836914, + "learning_rate": 6.959462155313155e-05, + "loss": 0.0325, + "step": 12730 + }, + { + "epoch": 0.747915932840202, + "grad_norm": 0.7153427004814148, + "learning_rate": 6.955206872604839e-05, + "loss": 0.0251, + "step": 12740 + }, + { + "epoch": 0.7485029940119761, + "grad_norm": 0.10894730687141418, + "learning_rate": 6.950949917449093e-05, + "loss": 0.04, + "step": 12750 + }, + { + "epoch": 0.7490900551837502, + "grad_norm": 1.343906044960022, + "learning_rate": 6.946691293487233e-05, + "loss": 0.0341, + "step": 12760 + }, + { + "epoch": 0.7496771163555243, + "grad_norm": 1.4371585845947266, + "learning_rate": 6.94243100436201e-05, + "loss": 0.0346, + "step": 12770 + }, + { + "epoch": 0.7502641775272983, + "grad_norm": 1.6158758401870728, + "learning_rate": 6.938169053717593e-05, + "loss": 0.0272, + "step": 12780 + }, + { + "epoch": 0.7508512386990724, + "grad_norm": 2.5721545219421387, + "learning_rate": 6.933905445199578e-05, + "loss": 0.055, + "step": 12790 + }, + { + "epoch": 0.7514382998708465, + "grad_norm": 1.3039095401763916, + "learning_rate": 6.929640182454973e-05, + "loss": 0.0466, + "step": 12800 + }, + { + "epoch": 0.7520253610426206, + "grad_norm": 1.7179279327392578, + "learning_rate": 6.925373269132207e-05, + "loss": 0.0486, + "step": 12810 + }, + { + "epoch": 0.7526124222143947, + "grad_norm": 2.3523173332214355, + "learning_rate": 6.921104708881115e-05, + "loss": 0.023, + "step": 12820 + }, + { + "epoch": 0.7531994833861688, + "grad_norm": 1.761407732963562, + "learning_rate": 6.916834505352945e-05, + "loss": 0.055, + "step": 12830 + }, + { + "epoch": 0.7537865445579429, + "grad_norm": 3.2720420360565186, + "learning_rate": 6.91256266220035e-05, + "loss": 0.0669, + "step": 12840 + }, + { + "epoch": 0.754373605729717, + "grad_norm": 1.0557044744491577, + "learning_rate": 6.908289183077385e-05, + "loss": 0.0441, + "step": 12850 + }, + { + "epoch": 0.7549606669014911, + "grad_norm": 2.7804226875305176, + "learning_rate": 6.904014071639503e-05, + "loss": 0.033, + "step": 12860 + }, + { + "epoch": 0.7555477280732652, + "grad_norm": 1.608482837677002, + "learning_rate": 6.899737331543555e-05, + "loss": 0.0374, + "step": 12870 + }, + { + "epoch": 0.7561347892450393, + "grad_norm": 3.039224624633789, + "learning_rate": 6.895458966447784e-05, + "loss": 0.0286, + "step": 12880 + }, + { + "epoch": 0.7567218504168134, + "grad_norm": 0.5459122657775879, + "learning_rate": 6.891178980011826e-05, + "loss": 0.0312, + "step": 12890 + }, + { + "epoch": 0.7573089115885875, + "grad_norm": 1.7158164978027344, + "learning_rate": 6.886897375896697e-05, + "loss": 0.045, + "step": 12900 + }, + { + "epoch": 0.7578959727603616, + "grad_norm": 1.8184658288955688, + "learning_rate": 6.882614157764804e-05, + "loss": 0.0411, + "step": 12910 + }, + { + "epoch": 0.7584830339321357, + "grad_norm": 0.8906010985374451, + "learning_rate": 6.878329329279933e-05, + "loss": 0.0486, + "step": 12920 + }, + { + "epoch": 0.7590700951039099, + "grad_norm": 1.9838579893112183, + "learning_rate": 6.874042894107245e-05, + "loss": 0.0673, + "step": 12930 + }, + { + "epoch": 0.759657156275684, + "grad_norm": 0.6923233270645142, + "learning_rate": 6.869754855913273e-05, + "loss": 0.0211, + "step": 12940 + }, + { + "epoch": 0.7602442174474581, + "grad_norm": 2.112013578414917, + "learning_rate": 6.86546521836593e-05, + "loss": 0.0315, + "step": 12950 + }, + { + "epoch": 0.7608312786192322, + "grad_norm": 1.9117103815078735, + 
"learning_rate": 6.86117398513449e-05, + "loss": 0.0326, + "step": 12960 + }, + { + "epoch": 0.7614183397910063, + "grad_norm": 2.3542299270629883, + "learning_rate": 6.856881159889593e-05, + "loss": 0.0689, + "step": 12970 + }, + { + "epoch": 0.7620054009627804, + "grad_norm": 3.010636806488037, + "learning_rate": 6.852586746303243e-05, + "loss": 0.0496, + "step": 12980 + }, + { + "epoch": 0.7625924621345544, + "grad_norm": 5.607231140136719, + "learning_rate": 6.848290748048801e-05, + "loss": 0.0321, + "step": 12990 + }, + { + "epoch": 0.7631795233063285, + "grad_norm": 0.8411577343940735, + "learning_rate": 6.843993168800982e-05, + "loss": 0.0493, + "step": 13000 + }, + { + "epoch": 0.7637665844781026, + "grad_norm": 3.7046091556549072, + "learning_rate": 6.839694012235856e-05, + "loss": 0.0399, + "step": 13010 + }, + { + "epoch": 0.7643536456498767, + "grad_norm": 1.6446720361709595, + "learning_rate": 6.835393282030841e-05, + "loss": 0.0472, + "step": 13020 + }, + { + "epoch": 0.7649407068216508, + "grad_norm": 1.5894074440002441, + "learning_rate": 6.8310909818647e-05, + "loss": 0.0415, + "step": 13030 + }, + { + "epoch": 0.7655277679934249, + "grad_norm": 1.3392444849014282, + "learning_rate": 6.826787115417544e-05, + "loss": 0.0532, + "step": 13040 + }, + { + "epoch": 0.766114829165199, + "grad_norm": 2.6800129413604736, + "learning_rate": 6.822481686370815e-05, + "loss": 0.0462, + "step": 13050 + }, + { + "epoch": 0.7667018903369731, + "grad_norm": 1.4358599185943604, + "learning_rate": 6.818174698407302e-05, + "loss": 0.05, + "step": 13060 + }, + { + "epoch": 0.7672889515087472, + "grad_norm": 1.4766582250595093, + "learning_rate": 6.813866155211118e-05, + "loss": 0.0421, + "step": 13070 + }, + { + "epoch": 0.7678760126805213, + "grad_norm": 1.8147733211517334, + "learning_rate": 6.80955606046771e-05, + "loss": 0.0508, + "step": 13080 + }, + { + "epoch": 0.7684630738522954, + "grad_norm": 1.277007818222046, + "learning_rate": 6.805244417863854e-05, + "loss": 0.0344, + "step": 13090 + }, + { + "epoch": 0.7690501350240695, + "grad_norm": 1.3057037591934204, + "learning_rate": 6.80093123108765e-05, + "loss": 0.0383, + "step": 13100 + }, + { + "epoch": 0.7696371961958436, + "grad_norm": 1.6316132545471191, + "learning_rate": 6.796616503828515e-05, + "loss": 0.0333, + "step": 13110 + }, + { + "epoch": 0.7702242573676177, + "grad_norm": 3.621436595916748, + "learning_rate": 6.79230023977719e-05, + "loss": 0.0521, + "step": 13120 + }, + { + "epoch": 0.7708113185393918, + "grad_norm": 0.9022402167320251, + "learning_rate": 6.787982442625721e-05, + "loss": 0.0237, + "step": 13130 + }, + { + "epoch": 0.7713983797111659, + "grad_norm": 1.7554409503936768, + "learning_rate": 6.783663116067473e-05, + "loss": 0.0364, + "step": 13140 + }, + { + "epoch": 0.77198544088294, + "grad_norm": 4.405052661895752, + "learning_rate": 6.779342263797119e-05, + "loss": 0.0514, + "step": 13150 + }, + { + "epoch": 0.7725725020547141, + "grad_norm": 3.3128652572631836, + "learning_rate": 6.775019889510635e-05, + "loss": 0.0453, + "step": 13160 + }, + { + "epoch": 0.7731595632264882, + "grad_norm": 0.12712036073207855, + "learning_rate": 6.770695996905297e-05, + "loss": 0.0302, + "step": 13170 + }, + { + "epoch": 0.7737466243982623, + "grad_norm": 2.2140140533447266, + "learning_rate": 6.766370589679685e-05, + "loss": 0.0427, + "step": 13180 + }, + { + "epoch": 0.7743336855700363, + "grad_norm": 0.7173200845718384, + "learning_rate": 6.762043671533668e-05, + "loss": 0.0274, + "step": 13190 + }, + { + "epoch": 
0.7749207467418106, + "grad_norm": 5.099856853485107, + "learning_rate": 6.757715246168414e-05, + "loss": 0.0405, + "step": 13200 + }, + { + "epoch": 0.7755078079135846, + "grad_norm": 0.9245136380195618, + "learning_rate": 6.753385317286377e-05, + "loss": 0.0514, + "step": 13210 + }, + { + "epoch": 0.7760948690853587, + "grad_norm": 2.752617597579956, + "learning_rate": 6.749053888591295e-05, + "loss": 0.0358, + "step": 13220 + }, + { + "epoch": 0.7766819302571328, + "grad_norm": 1.602553367614746, + "learning_rate": 6.744720963788193e-05, + "loss": 0.0397, + "step": 13230 + }, + { + "epoch": 0.7772689914289069, + "grad_norm": 2.3663432598114014, + "learning_rate": 6.740386546583373e-05, + "loss": 0.0375, + "step": 13240 + }, + { + "epoch": 0.777856052600681, + "grad_norm": 0.5158788561820984, + "learning_rate": 6.736050640684416e-05, + "loss": 0.0359, + "step": 13250 + }, + { + "epoch": 0.7784431137724551, + "grad_norm": 1.9949820041656494, + "learning_rate": 6.731713249800173e-05, + "loss": 0.0454, + "step": 13260 + }, + { + "epoch": 0.7790301749442292, + "grad_norm": 2.5277843475341797, + "learning_rate": 6.727374377640768e-05, + "loss": 0.0381, + "step": 13270 + }, + { + "epoch": 0.7796172361160033, + "grad_norm": 1.7521203756332397, + "learning_rate": 6.723034027917592e-05, + "loss": 0.0267, + "step": 13280 + }, + { + "epoch": 0.7802042972877774, + "grad_norm": 0.489425390958786, + "learning_rate": 6.718692204343298e-05, + "loss": 0.0451, + "step": 13290 + }, + { + "epoch": 0.7807913584595515, + "grad_norm": 1.8275169134140015, + "learning_rate": 6.7143489106318e-05, + "loss": 0.0503, + "step": 13300 + }, + { + "epoch": 0.7813784196313256, + "grad_norm": 4.13484001159668, + "learning_rate": 6.710004150498271e-05, + "loss": 0.0627, + "step": 13310 + }, + { + "epoch": 0.7819654808030997, + "grad_norm": 2.0780868530273438, + "learning_rate": 6.70565792765914e-05, + "loss": 0.0244, + "step": 13320 + }, + { + "epoch": 0.7825525419748738, + "grad_norm": 2.7539408206939697, + "learning_rate": 6.701310245832082e-05, + "loss": 0.0354, + "step": 13330 + }, + { + "epoch": 0.7831396031466479, + "grad_norm": 1.69619619846344, + "learning_rate": 6.696961108736024e-05, + "loss": 0.0504, + "step": 13340 + }, + { + "epoch": 0.783726664318422, + "grad_norm": 1.7762585878372192, + "learning_rate": 6.692610520091137e-05, + "loss": 0.0439, + "step": 13350 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.22033926844596863, + "learning_rate": 6.68825848361883e-05, + "loss": 0.0481, + "step": 13360 + }, + { + "epoch": 0.7849007866619702, + "grad_norm": 0.562747597694397, + "learning_rate": 6.683905003041757e-05, + "loss": 0.0161, + "step": 13370 + }, + { + "epoch": 0.7854878478337443, + "grad_norm": 4.002884387969971, + "learning_rate": 6.679550082083803e-05, + "loss": 0.0419, + "step": 13380 + }, + { + "epoch": 0.7860749090055184, + "grad_norm": 2.3743350505828857, + "learning_rate": 6.675193724470087e-05, + "loss": 0.0293, + "step": 13390 + }, + { + "epoch": 0.7866619701772924, + "grad_norm": 1.1087560653686523, + "learning_rate": 6.670835933926955e-05, + "loss": 0.0451, + "step": 13400 + }, + { + "epoch": 0.7872490313490665, + "grad_norm": 1.0285279750823975, + "learning_rate": 6.666476714181979e-05, + "loss": 0.0304, + "step": 13410 + }, + { + "epoch": 0.7878360925208406, + "grad_norm": 0.6177504658699036, + "learning_rate": 6.662116068963954e-05, + "loss": 0.0242, + "step": 13420 + }, + { + "epoch": 0.7884231536926147, + "grad_norm": 2.7186331748962402, + "learning_rate": 6.657754002002898e-05, 
+ "loss": 0.0391, + "step": 13430 + }, + { + "epoch": 0.7890102148643888, + "grad_norm": 1.8687515258789062, + "learning_rate": 6.653390517030038e-05, + "loss": 0.0358, + "step": 13440 + }, + { + "epoch": 0.7895972760361629, + "grad_norm": 0.6915972232818604, + "learning_rate": 6.649025617777818e-05, + "loss": 0.0238, + "step": 13450 + }, + { + "epoch": 0.790184337207937, + "grad_norm": 1.9850742816925049, + "learning_rate": 6.64465930797989e-05, + "loss": 0.0592, + "step": 13460 + }, + { + "epoch": 0.7907713983797112, + "grad_norm": 2.312723159790039, + "learning_rate": 6.640291591371117e-05, + "loss": 0.037, + "step": 13470 + }, + { + "epoch": 0.7913584595514853, + "grad_norm": 1.8763644695281982, + "learning_rate": 6.635922471687561e-05, + "loss": 0.034, + "step": 13480 + }, + { + "epoch": 0.7919455207232594, + "grad_norm": 0.05788072198629379, + "learning_rate": 6.631551952666484e-05, + "loss": 0.0253, + "step": 13490 + }, + { + "epoch": 0.7925325818950335, + "grad_norm": 2.2860443592071533, + "learning_rate": 6.627180038046347e-05, + "loss": 0.0405, + "step": 13500 + }, + { + "epoch": 0.7931196430668076, + "grad_norm": 1.2735824584960938, + "learning_rate": 6.622806731566807e-05, + "loss": 0.0343, + "step": 13510 + }, + { + "epoch": 0.7937067042385817, + "grad_norm": 2.2331037521362305, + "learning_rate": 6.618432036968705e-05, + "loss": 0.0216, + "step": 13520 + }, + { + "epoch": 0.7942937654103558, + "grad_norm": 2.2226316928863525, + "learning_rate": 6.614055957994075e-05, + "loss": 0.0382, + "step": 13530 + }, + { + "epoch": 0.7948808265821299, + "grad_norm": 2.079427480697632, + "learning_rate": 6.60967849838613e-05, + "loss": 0.0363, + "step": 13540 + }, + { + "epoch": 0.795467887753904, + "grad_norm": 2.138134241104126, + "learning_rate": 6.60529966188927e-05, + "loss": 0.028, + "step": 13550 + }, + { + "epoch": 0.7960549489256781, + "grad_norm": 0.8080530166625977, + "learning_rate": 6.60091945224907e-05, + "loss": 0.0157, + "step": 13560 + }, + { + "epoch": 0.7966420100974522, + "grad_norm": 2.0846288204193115, + "learning_rate": 6.596537873212281e-05, + "loss": 0.041, + "step": 13570 + }, + { + "epoch": 0.7972290712692263, + "grad_norm": 2.216050148010254, + "learning_rate": 6.592154928526818e-05, + "loss": 0.0312, + "step": 13580 + }, + { + "epoch": 0.7978161324410004, + "grad_norm": 1.5155831575393677, + "learning_rate": 6.587770621941776e-05, + "loss": 0.052, + "step": 13590 + }, + { + "epoch": 0.7984031936127745, + "grad_norm": 2.175265312194824, + "learning_rate": 6.583384957207406e-05, + "loss": 0.0444, + "step": 13600 + }, + { + "epoch": 0.7989902547845485, + "grad_norm": 2.381720542907715, + "learning_rate": 6.578997938075125e-05, + "loss": 0.0364, + "step": 13610 + }, + { + "epoch": 0.7995773159563226, + "grad_norm": 1.3376045227050781, + "learning_rate": 6.574609568297507e-05, + "loss": 0.0365, + "step": 13620 + }, + { + "epoch": 0.8001643771280967, + "grad_norm": 3.4189717769622803, + "learning_rate": 6.57021985162828e-05, + "loss": 0.0478, + "step": 13630 + }, + { + "epoch": 0.8007514382998708, + "grad_norm": 0.809241771697998, + "learning_rate": 6.565828791822327e-05, + "loss": 0.0358, + "step": 13640 + }, + { + "epoch": 0.8013384994716449, + "grad_norm": 1.9690725803375244, + "learning_rate": 6.56143639263568e-05, + "loss": 0.0489, + "step": 13650 + }, + { + "epoch": 0.801925560643419, + "grad_norm": 3.3723227977752686, + "learning_rate": 6.557042657825511e-05, + "loss": 0.052, + "step": 13660 + }, + { + "epoch": 0.8025126218151931, + "grad_norm": 
2.127072811126709, + "learning_rate": 6.552647591150143e-05, + "loss": 0.0386, + "step": 13670 + }, + { + "epoch": 0.8030996829869672, + "grad_norm": 1.5079299211502075, + "learning_rate": 6.548251196369031e-05, + "loss": 0.0251, + "step": 13680 + }, + { + "epoch": 0.8036867441587413, + "grad_norm": 0.4091831147670746, + "learning_rate": 6.54385347724277e-05, + "loss": 0.021, + "step": 13690 + }, + { + "epoch": 0.8042738053305154, + "grad_norm": 2.712334156036377, + "learning_rate": 6.539454437533088e-05, + "loss": 0.0523, + "step": 13700 + }, + { + "epoch": 0.8048608665022895, + "grad_norm": 3.098822593688965, + "learning_rate": 6.535054081002841e-05, + "loss": 0.045, + "step": 13710 + }, + { + "epoch": 0.8054479276740636, + "grad_norm": 2.6743454933166504, + "learning_rate": 6.530652411416007e-05, + "loss": 0.056, + "step": 13720 + }, + { + "epoch": 0.8060349888458377, + "grad_norm": 0.3644692003726959, + "learning_rate": 6.5262494325377e-05, + "loss": 0.0292, + "step": 13730 + }, + { + "epoch": 0.8066220500176118, + "grad_norm": 2.4092676639556885, + "learning_rate": 6.52184514813414e-05, + "loss": 0.0463, + "step": 13740 + }, + { + "epoch": 0.807209111189386, + "grad_norm": 0.7954262495040894, + "learning_rate": 6.517439561972671e-05, + "loss": 0.0514, + "step": 13750 + }, + { + "epoch": 0.8077961723611601, + "grad_norm": 1.604623794555664, + "learning_rate": 6.513032677821752e-05, + "loss": 0.0261, + "step": 13760 + }, + { + "epoch": 0.8083832335329342, + "grad_norm": 2.0205390453338623, + "learning_rate": 6.508624499450944e-05, + "loss": 0.0497, + "step": 13770 + }, + { + "epoch": 0.8089702947047083, + "grad_norm": 1.3266966342926025, + "learning_rate": 6.504215030630925e-05, + "loss": 0.0378, + "step": 13780 + }, + { + "epoch": 0.8095573558764824, + "grad_norm": 0.645121157169342, + "learning_rate": 6.49980427513347e-05, + "loss": 0.026, + "step": 13790 + }, + { + "epoch": 0.8101444170482565, + "grad_norm": 3.148008346557617, + "learning_rate": 6.495392236731458e-05, + "loss": 0.0506, + "step": 13800 + }, + { + "epoch": 0.8107314782200306, + "grad_norm": 1.275623083114624, + "learning_rate": 6.490978919198863e-05, + "loss": 0.0382, + "step": 13810 + }, + { + "epoch": 0.8113185393918046, + "grad_norm": 0.44733619689941406, + "learning_rate": 6.486564326310754e-05, + "loss": 0.0347, + "step": 13820 + }, + { + "epoch": 0.8119056005635787, + "grad_norm": 1.5372415781021118, + "learning_rate": 6.482148461843294e-05, + "loss": 0.031, + "step": 13830 + }, + { + "epoch": 0.8124926617353528, + "grad_norm": 3.134258508682251, + "learning_rate": 6.477731329573729e-05, + "loss": 0.0398, + "step": 13840 + }, + { + "epoch": 0.8130797229071269, + "grad_norm": 3.234987735748291, + "learning_rate": 6.473312933280391e-05, + "loss": 0.0726, + "step": 13850 + }, + { + "epoch": 0.813666784078901, + "grad_norm": 1.5991384983062744, + "learning_rate": 6.468893276742695e-05, + "loss": 0.0229, + "step": 13860 + }, + { + "epoch": 0.8142538452506751, + "grad_norm": 1.283582329750061, + "learning_rate": 6.464472363741132e-05, + "loss": 0.0415, + "step": 13870 + }, + { + "epoch": 0.8148409064224492, + "grad_norm": 1.0967763662338257, + "learning_rate": 6.460050198057268e-05, + "loss": 0.0353, + "step": 13880 + }, + { + "epoch": 0.8154279675942233, + "grad_norm": 1.3352218866348267, + "learning_rate": 6.45562678347374e-05, + "loss": 0.0305, + "step": 13890 + }, + { + "epoch": 0.8160150287659974, + "grad_norm": 0.7509410977363586, + "learning_rate": 6.451202123774258e-05, + "loss": 0.0478, + "step": 13900 + }, + 
{ + "epoch": 0.8166020899377715, + "grad_norm": 1.6695328950881958, + "learning_rate": 6.446776222743589e-05, + "loss": 0.0248, + "step": 13910 + }, + { + "epoch": 0.8171891511095456, + "grad_norm": 0.5678431987762451, + "learning_rate": 6.442349084167568e-05, + "loss": 0.0352, + "step": 13920 + }, + { + "epoch": 0.8177762122813197, + "grad_norm": 2.2780115604400635, + "learning_rate": 6.437920711833086e-05, + "loss": 0.0507, + "step": 13930 + }, + { + "epoch": 0.8183632734530938, + "grad_norm": 1.653802752494812, + "learning_rate": 6.433491109528091e-05, + "loss": 0.0339, + "step": 13940 + }, + { + "epoch": 0.8189503346248679, + "grad_norm": 3.931762218475342, + "learning_rate": 6.429060281041581e-05, + "loss": 0.0672, + "step": 13950 + }, + { + "epoch": 0.819537395796642, + "grad_norm": 1.88666832447052, + "learning_rate": 6.424628230163606e-05, + "loss": 0.0328, + "step": 13960 + }, + { + "epoch": 0.8201244569684161, + "grad_norm": 2.9051437377929688, + "learning_rate": 6.420194960685255e-05, + "loss": 0.0275, + "step": 13970 + }, + { + "epoch": 0.8207115181401902, + "grad_norm": 2.4517440795898438, + "learning_rate": 6.41576047639867e-05, + "loss": 0.0327, + "step": 13980 + }, + { + "epoch": 0.8212985793119643, + "grad_norm": 1.2846941947937012, + "learning_rate": 6.41132478109702e-05, + "loss": 0.0511, + "step": 13990 + }, + { + "epoch": 0.8218856404837384, + "grad_norm": 0.5321528911590576, + "learning_rate": 6.406887878574519e-05, + "loss": 0.0709, + "step": 14000 + }, + { + "epoch": 0.8224727016555125, + "grad_norm": 1.4567052125930786, + "learning_rate": 6.402449772626412e-05, + "loss": 0.0367, + "step": 14010 + }, + { + "epoch": 0.8230597628272867, + "grad_norm": 0.714896023273468, + "learning_rate": 6.398010467048968e-05, + "loss": 0.0317, + "step": 14020 + }, + { + "epoch": 0.8236468239990608, + "grad_norm": 0.9767372608184814, + "learning_rate": 6.39356996563949e-05, + "loss": 0.0279, + "step": 14030 + }, + { + "epoch": 0.8242338851708348, + "grad_norm": 2.217620611190796, + "learning_rate": 6.389128272196296e-05, + "loss": 0.0564, + "step": 14040 + }, + { + "epoch": 0.8248209463426089, + "grad_norm": 0.9955437779426575, + "learning_rate": 6.38468539051873e-05, + "loss": 0.0323, + "step": 14050 + }, + { + "epoch": 0.825408007514383, + "grad_norm": 2.635800838470459, + "learning_rate": 6.38024132440715e-05, + "loss": 0.057, + "step": 14060 + }, + { + "epoch": 0.8259950686861571, + "grad_norm": 1.7019506692886353, + "learning_rate": 6.375796077662928e-05, + "loss": 0.0404, + "step": 14070 + }, + { + "epoch": 0.8265821298579312, + "grad_norm": 0.9067420959472656, + "learning_rate": 6.371349654088442e-05, + "loss": 0.031, + "step": 14080 + }, + { + "epoch": 0.8271691910297053, + "grad_norm": 0.594925582408905, + "learning_rate": 6.366902057487083e-05, + "loss": 0.0822, + "step": 14090 + }, + { + "epoch": 0.8277562522014794, + "grad_norm": 2.47068452835083, + "learning_rate": 6.36245329166324e-05, + "loss": 0.0256, + "step": 14100 + }, + { + "epoch": 0.8283433133732535, + "grad_norm": 2.7607803344726562, + "learning_rate": 6.358003360422304e-05, + "loss": 0.0359, + "step": 14110 + }, + { + "epoch": 0.8289303745450276, + "grad_norm": 1.8484448194503784, + "learning_rate": 6.353552267570666e-05, + "loss": 0.0425, + "step": 14120 + }, + { + "epoch": 0.8295174357168017, + "grad_norm": 3.1313066482543945, + "learning_rate": 6.349100016915703e-05, + "loss": 0.0467, + "step": 14130 + }, + { + "epoch": 0.8301044968885758, + "grad_norm": 2.5662596225738525, + "learning_rate": 
6.34464661226579e-05, + "loss": 0.0448, + "step": 14140 + }, + { + "epoch": 0.8306915580603499, + "grad_norm": 0.4227585792541504, + "learning_rate": 6.340192057430286e-05, + "loss": 0.034, + "step": 14150 + }, + { + "epoch": 0.831278619232124, + "grad_norm": 0.47378337383270264, + "learning_rate": 6.335736356219533e-05, + "loss": 0.0234, + "step": 14160 + }, + { + "epoch": 0.8318656804038981, + "grad_norm": 2.1253409385681152, + "learning_rate": 6.331279512444855e-05, + "loss": 0.0378, + "step": 14170 + }, + { + "epoch": 0.8324527415756722, + "grad_norm": 2.14623761177063, + "learning_rate": 6.326821529918553e-05, + "loss": 0.0536, + "step": 14180 + }, + { + "epoch": 0.8330398027474463, + "grad_norm": 1.6848366260528564, + "learning_rate": 6.322362412453903e-05, + "loss": 0.041, + "step": 14190 + }, + { + "epoch": 0.8336268639192204, + "grad_norm": 3.853180408477783, + "learning_rate": 6.31790216386515e-05, + "loss": 0.0439, + "step": 14200 + }, + { + "epoch": 0.8342139250909945, + "grad_norm": 1.1675387620925903, + "learning_rate": 6.313440787967506e-05, + "loss": 0.0475, + "step": 14210 + }, + { + "epoch": 0.8348009862627686, + "grad_norm": 2.579005718231201, + "learning_rate": 6.30897828857715e-05, + "loss": 0.052, + "step": 14220 + }, + { + "epoch": 0.8353880474345426, + "grad_norm": 0.5900012850761414, + "learning_rate": 6.30451466951122e-05, + "loss": 0.0197, + "step": 14230 + }, + { + "epoch": 0.8359751086063167, + "grad_norm": 2.4965133666992188, + "learning_rate": 6.300049934587812e-05, + "loss": 0.0379, + "step": 14240 + }, + { + "epoch": 0.8365621697780908, + "grad_norm": 2.155104637145996, + "learning_rate": 6.295584087625979e-05, + "loss": 0.0317, + "step": 14250 + }, + { + "epoch": 0.8371492309498649, + "grad_norm": 0.8006505966186523, + "learning_rate": 6.291117132445722e-05, + "loss": 0.033, + "step": 14260 + }, + { + "epoch": 0.837736292121639, + "grad_norm": 3.3270461559295654, + "learning_rate": 6.286649072867988e-05, + "loss": 0.047, + "step": 14270 + }, + { + "epoch": 0.8383233532934131, + "grad_norm": 3.7353594303131104, + "learning_rate": 6.282179912714677e-05, + "loss": 0.0266, + "step": 14280 + }, + { + "epoch": 0.8389104144651873, + "grad_norm": 2.0424246788024902, + "learning_rate": 6.277709655808622e-05, + "loss": 0.0282, + "step": 14290 + }, + { + "epoch": 0.8394974756369614, + "grad_norm": 2.0882630348205566, + "learning_rate": 6.273238305973596e-05, + "loss": 0.0401, + "step": 14300 + }, + { + "epoch": 0.8400845368087355, + "grad_norm": 1.401748776435852, + "learning_rate": 6.268765867034311e-05, + "loss": 0.0357, + "step": 14310 + }, + { + "epoch": 0.8406715979805096, + "grad_norm": 0.7414683103561401, + "learning_rate": 6.264292342816407e-05, + "loss": 0.0339, + "step": 14320 + }, + { + "epoch": 0.8412586591522837, + "grad_norm": 0.1633201390504837, + "learning_rate": 6.25981773714645e-05, + "loss": 0.04, + "step": 14330 + }, + { + "epoch": 0.8418457203240578, + "grad_norm": 1.5481064319610596, + "learning_rate": 6.255342053851938e-05, + "loss": 0.0412, + "step": 14340 + }, + { + "epoch": 0.8424327814958319, + "grad_norm": 1.6022753715515137, + "learning_rate": 6.250865296761286e-05, + "loss": 0.029, + "step": 14350 + }, + { + "epoch": 0.843019842667606, + "grad_norm": 2.521550416946411, + "learning_rate": 6.246387469703826e-05, + "loss": 0.0211, + "step": 14360 + }, + { + "epoch": 0.8436069038393801, + "grad_norm": 4.940478801727295, + "learning_rate": 6.241908576509812e-05, + "loss": 0.0318, + "step": 14370 + }, + { + "epoch": 0.8441939650111542, + 
"grad_norm": 2.262380838394165, + "learning_rate": 6.237428621010402e-05, + "loss": 0.0287, + "step": 14380 + }, + { + "epoch": 0.8447810261829283, + "grad_norm": 0.9917835593223572, + "learning_rate": 6.232947607037666e-05, + "loss": 0.0334, + "step": 14390 + }, + { + "epoch": 0.8453680873547024, + "grad_norm": 2.2069954872131348, + "learning_rate": 6.228465538424583e-05, + "loss": 0.0256, + "step": 14400 + }, + { + "epoch": 0.8459551485264765, + "grad_norm": 3.83937406539917, + "learning_rate": 6.223982419005027e-05, + "loss": 0.034, + "step": 14410 + }, + { + "epoch": 0.8465422096982506, + "grad_norm": 3.3192458152770996, + "learning_rate": 6.219498252613777e-05, + "loss": 0.0314, + "step": 14420 + }, + { + "epoch": 0.8471292708700247, + "grad_norm": 2.14279842376709, + "learning_rate": 6.215013043086504e-05, + "loss": 0.0436, + "step": 14430 + }, + { + "epoch": 0.8477163320417987, + "grad_norm": 1.3427451848983765, + "learning_rate": 6.210526794259772e-05, + "loss": 0.0212, + "step": 14440 + }, + { + "epoch": 0.8483033932135728, + "grad_norm": 6.0217790603637695, + "learning_rate": 6.206039509971038e-05, + "loss": 0.0338, + "step": 14450 + }, + { + "epoch": 0.8488904543853469, + "grad_norm": 4.078960418701172, + "learning_rate": 6.201551194058637e-05, + "loss": 0.042, + "step": 14460 + }, + { + "epoch": 0.849477515557121, + "grad_norm": 0.48227864503860474, + "learning_rate": 6.19706185036179e-05, + "loss": 0.0181, + "step": 14470 + }, + { + "epoch": 0.8500645767288951, + "grad_norm": 1.5753501653671265, + "learning_rate": 6.192571482720601e-05, + "loss": 0.0308, + "step": 14480 + }, + { + "epoch": 0.8506516379006692, + "grad_norm": 4.5797438621521, + "learning_rate": 6.188080094976046e-05, + "loss": 0.0435, + "step": 14490 + }, + { + "epoch": 0.8512386990724433, + "grad_norm": 0.8167878985404968, + "learning_rate": 6.183587690969974e-05, + "loss": 0.0209, + "step": 14500 + }, + { + "epoch": 0.8518257602442174, + "grad_norm": 0.5837923288345337, + "learning_rate": 6.179094274545102e-05, + "loss": 0.0251, + "step": 14510 + }, + { + "epoch": 0.8524128214159915, + "grad_norm": 1.6540395021438599, + "learning_rate": 6.174599849545015e-05, + "loss": 0.0329, + "step": 14520 + }, + { + "epoch": 0.8529998825877656, + "grad_norm": 3.803781747817993, + "learning_rate": 6.170104419814162e-05, + "loss": 0.0556, + "step": 14530 + }, + { + "epoch": 0.8535869437595397, + "grad_norm": 2.439143657684326, + "learning_rate": 6.165607989197847e-05, + "loss": 0.0221, + "step": 14540 + }, + { + "epoch": 0.8541740049313138, + "grad_norm": 2.360718011856079, + "learning_rate": 6.161110561542235e-05, + "loss": 0.03, + "step": 14550 + }, + { + "epoch": 0.854761066103088, + "grad_norm": 4.080716133117676, + "learning_rate": 6.156612140694339e-05, + "loss": 0.059, + "step": 14560 + }, + { + "epoch": 0.8553481272748621, + "grad_norm": 1.6676440238952637, + "learning_rate": 6.152112730502027e-05, + "loss": 0.0214, + "step": 14570 + }, + { + "epoch": 0.8559351884466362, + "grad_norm": 1.4322593212127686, + "learning_rate": 6.147612334814008e-05, + "loss": 0.06, + "step": 14580 + }, + { + "epoch": 0.8565222496184103, + "grad_norm": 2.004303455352783, + "learning_rate": 6.143110957479839e-05, + "loss": 0.0544, + "step": 14590 + }, + { + "epoch": 0.8571093107901844, + "grad_norm": 1.1425613164901733, + "learning_rate": 6.138608602349911e-05, + "loss": 0.0443, + "step": 14600 + }, + { + "epoch": 0.8576963719619585, + "grad_norm": 2.01226544380188, + "learning_rate": 6.134105273275457e-05, + "loss": 0.0273, + "step": 
14610 + }, + { + "epoch": 0.8582834331337326, + "grad_norm": 2.711271286010742, + "learning_rate": 6.129600974108538e-05, + "loss": 0.0527, + "step": 14620 + }, + { + "epoch": 0.8588704943055067, + "grad_norm": 2.209378719329834, + "learning_rate": 6.125095708702052e-05, + "loss": 0.0438, + "step": 14630 + }, + { + "epoch": 0.8594575554772808, + "grad_norm": 0.835532546043396, + "learning_rate": 6.120589480909715e-05, + "loss": 0.0151, + "step": 14640 + }, + { + "epoch": 0.8600446166490548, + "grad_norm": 2.318243980407715, + "learning_rate": 6.116082294586068e-05, + "loss": 0.0286, + "step": 14650 + }, + { + "epoch": 0.8606316778208289, + "grad_norm": 3.605604648590088, + "learning_rate": 6.11157415358648e-05, + "loss": 0.0468, + "step": 14660 + }, + { + "epoch": 0.861218738992603, + "grad_norm": 0.7488554120063782, + "learning_rate": 6.107065061767127e-05, + "loss": 0.0264, + "step": 14670 + }, + { + "epoch": 0.8618058001643771, + "grad_norm": 6.506105899810791, + "learning_rate": 6.1025550229850004e-05, + "loss": 0.037, + "step": 14680 + }, + { + "epoch": 0.8623928613361512, + "grad_norm": 4.626204490661621, + "learning_rate": 6.098044041097907e-05, + "loss": 0.039, + "step": 14690 + }, + { + "epoch": 0.8629799225079253, + "grad_norm": 2.7244555950164795, + "learning_rate": 6.0935321199644544e-05, + "loss": 0.0304, + "step": 14700 + }, + { + "epoch": 0.8635669836796994, + "grad_norm": 2.2430191040039062, + "learning_rate": 6.0890192634440546e-05, + "loss": 0.0363, + "step": 14710 + }, + { + "epoch": 0.8641540448514735, + "grad_norm": 2.716965913772583, + "learning_rate": 6.084505475396923e-05, + "loss": 0.0303, + "step": 14720 + }, + { + "epoch": 0.8647411060232476, + "grad_norm": 1.0491752624511719, + "learning_rate": 6.079990759684068e-05, + "loss": 0.0364, + "step": 14730 + }, + { + "epoch": 0.8653281671950217, + "grad_norm": 1.1998114585876465, + "learning_rate": 6.075475120167293e-05, + "loss": 0.0413, + "step": 14740 + }, + { + "epoch": 0.8659152283667958, + "grad_norm": 1.2094826698303223, + "learning_rate": 6.070958560709194e-05, + "loss": 0.041, + "step": 14750 + }, + { + "epoch": 0.8665022895385699, + "grad_norm": 3.386530637741089, + "learning_rate": 6.066441085173149e-05, + "loss": 0.0251, + "step": 14760 + }, + { + "epoch": 0.867089350710344, + "grad_norm": 1.256798505783081, + "learning_rate": 6.061922697423322e-05, + "loss": 0.0336, + "step": 14770 + }, + { + "epoch": 0.8676764118821181, + "grad_norm": 1.8139320611953735, + "learning_rate": 6.057403401324659e-05, + "loss": 0.0225, + "step": 14780 + }, + { + "epoch": 0.8682634730538922, + "grad_norm": 1.8259650468826294, + "learning_rate": 6.052883200742883e-05, + "loss": 0.0399, + "step": 14790 + }, + { + "epoch": 0.8688505342256663, + "grad_norm": 2.8148298263549805, + "learning_rate": 6.0483620995444835e-05, + "loss": 0.0458, + "step": 14800 + }, + { + "epoch": 0.8694375953974404, + "grad_norm": 2.914865493774414, + "learning_rate": 6.043840101596731e-05, + "loss": 0.0484, + "step": 14810 + }, + { + "epoch": 0.8700246565692145, + "grad_norm": 0.6122520565986633, + "learning_rate": 6.0393172107676576e-05, + "loss": 0.0472, + "step": 14820 + }, + { + "epoch": 0.8706117177409887, + "grad_norm": 0.9464847445487976, + "learning_rate": 6.034793430926058e-05, + "loss": 0.0557, + "step": 14830 + }, + { + "epoch": 0.8711987789127628, + "grad_norm": 2.066427707672119, + "learning_rate": 6.0302687659414904e-05, + "loss": 0.0414, + "step": 14840 + }, + { + "epoch": 0.8717858400845369, + "grad_norm": 1.8651105165481567, + 
"learning_rate": 6.025743219684267e-05, + "loss": 0.0374, + "step": 14850 + }, + { + "epoch": 0.872372901256311, + "grad_norm": 0.6826112270355225, + "learning_rate": 6.021216796025456e-05, + "loss": 0.0384, + "step": 14860 + }, + { + "epoch": 0.872959962428085, + "grad_norm": 2.0334994792938232, + "learning_rate": 6.016689498836877e-05, + "loss": 0.026, + "step": 14870 + }, + { + "epoch": 0.8735470235998591, + "grad_norm": 1.9022555351257324, + "learning_rate": 6.012161331991093e-05, + "loss": 0.0486, + "step": 14880 + }, + { + "epoch": 0.8741340847716332, + "grad_norm": 0.45631638169288635, + "learning_rate": 6.007632299361417e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.8747211459434073, + "grad_norm": 2.617318630218506, + "learning_rate": 6.003102404821895e-05, + "loss": 0.0273, + "step": 14900 + }, + { + "epoch": 0.8753082071151814, + "grad_norm": 2.712918519973755, + "learning_rate": 5.9985716522473166e-05, + "loss": 0.0337, + "step": 14910 + }, + { + "epoch": 0.8758952682869555, + "grad_norm": 0.9799612164497375, + "learning_rate": 5.9940400455132025e-05, + "loss": 0.0613, + "step": 14920 + }, + { + "epoch": 0.8764823294587296, + "grad_norm": 2.20308780670166, + "learning_rate": 5.989507588495804e-05, + "loss": 0.043, + "step": 14930 + }, + { + "epoch": 0.8770693906305037, + "grad_norm": 2.622640371322632, + "learning_rate": 5.984974285072099e-05, + "loss": 0.0376, + "step": 14940 + }, + { + "epoch": 0.8776564518022778, + "grad_norm": 2.3511288166046143, + "learning_rate": 5.980440139119794e-05, + "loss": 0.0397, + "step": 14950 + }, + { + "epoch": 0.8782435129740519, + "grad_norm": 1.1025187969207764, + "learning_rate": 5.975905154517309e-05, + "loss": 0.0589, + "step": 14960 + }, + { + "epoch": 0.878830574145826, + "grad_norm": 1.562609314918518, + "learning_rate": 5.971369335143787e-05, + "loss": 0.0357, + "step": 14970 + }, + { + "epoch": 0.8794176353176001, + "grad_norm": 1.2567226886749268, + "learning_rate": 5.966832684879084e-05, + "loss": 0.0828, + "step": 14980 + }, + { + "epoch": 0.8800046964893742, + "grad_norm": 3.298917055130005, + "learning_rate": 5.962295207603764e-05, + "loss": 0.0225, + "step": 14990 + }, + { + "epoch": 0.8805917576611483, + "grad_norm": 3.532365560531616, + "learning_rate": 5.9577569071991e-05, + "loss": 0.0286, + "step": 15000 + }, + { + "epoch": 0.8805917576611483, + "eval_loss": 0.45234647393226624, + "eval_runtime": 269.6106, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 3.505, + "step": 15000 + }, + { + "epoch": 0.8811788188329224, + "grad_norm": 1.6025224924087524, + "learning_rate": 5.953217787547072e-05, + "loss": 0.0241, + "step": 15010 + }, + { + "epoch": 0.8817658800046965, + "grad_norm": 1.0338939428329468, + "learning_rate": 5.9486778525303556e-05, + "loss": 0.0221, + "step": 15020 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 2.463163137435913, + "learning_rate": 5.944137106032327e-05, + "loss": 0.0434, + "step": 15030 + }, + { + "epoch": 0.8829400023482447, + "grad_norm": 2.0624144077301025, + "learning_rate": 5.939595551937057e-05, + "loss": 0.0414, + "step": 15040 + }, + { + "epoch": 0.8835270635200188, + "grad_norm": 2.9850263595581055, + "learning_rate": 5.9350531941293056e-05, + "loss": 0.0569, + "step": 15050 + }, + { + "epoch": 0.8841141246917928, + "grad_norm": 1.0960736274719238, + "learning_rate": 5.93051003649452e-05, + "loss": 0.0256, + "step": 15060 + }, + { + "epoch": 0.8847011858635669, + "grad_norm": 5.331958293914795, + "learning_rate": 5.9259660829188337e-05, + "loss": 
0.0284, + "step": 15070 + }, + { + "epoch": 0.885288247035341, + "grad_norm": 0.8948186039924622, + "learning_rate": 5.921421337289059e-05, + "loss": 0.0318, + "step": 15080 + }, + { + "epoch": 0.8858753082071151, + "grad_norm": 1.8375474214553833, + "learning_rate": 5.9168758034926855e-05, + "loss": 0.0333, + "step": 15090 + }, + { + "epoch": 0.8864623693788892, + "grad_norm": 1.2407913208007812, + "learning_rate": 5.912329485417879e-05, + "loss": 0.0458, + "step": 15100 + }, + { + "epoch": 0.8870494305506634, + "grad_norm": 2.3570520877838135, + "learning_rate": 5.9077823869534745e-05, + "loss": 0.0365, + "step": 15110 + }, + { + "epoch": 0.8876364917224375, + "grad_norm": 2.152998447418213, + "learning_rate": 5.9032345119889765e-05, + "loss": 0.0382, + "step": 15120 + }, + { + "epoch": 0.8882235528942116, + "grad_norm": 3.138258218765259, + "learning_rate": 5.8986858644145505e-05, + "loss": 0.0367, + "step": 15130 + }, + { + "epoch": 0.8888106140659857, + "grad_norm": 2.184736967086792, + "learning_rate": 5.8941364481210245e-05, + "loss": 0.0264, + "step": 15140 + }, + { + "epoch": 0.8893976752377598, + "grad_norm": 3.6055047512054443, + "learning_rate": 5.889586266999887e-05, + "loss": 0.0331, + "step": 15150 + }, + { + "epoch": 0.8899847364095339, + "grad_norm": 4.376768112182617, + "learning_rate": 5.8850353249432744e-05, + "loss": 0.0703, + "step": 15160 + }, + { + "epoch": 0.890571797581308, + "grad_norm": 1.2885960340499878, + "learning_rate": 5.880483625843979e-05, + "loss": 0.0532, + "step": 15170 + }, + { + "epoch": 0.8911588587530821, + "grad_norm": 0.7660097479820251, + "learning_rate": 5.8759311735954404e-05, + "loss": 0.0279, + "step": 15180 + }, + { + "epoch": 0.8917459199248562, + "grad_norm": 0.7138040661811829, + "learning_rate": 5.8713779720917395e-05, + "loss": 0.0542, + "step": 15190 + }, + { + "epoch": 0.8923329810966303, + "grad_norm": 0.8894451856613159, + "learning_rate": 5.8668240252275995e-05, + "loss": 0.0277, + "step": 15200 + }, + { + "epoch": 0.8929200422684044, + "grad_norm": 2.522827625274658, + "learning_rate": 5.8622693368983847e-05, + "loss": 0.053, + "step": 15210 + }, + { + "epoch": 0.8935071034401785, + "grad_norm": 1.9021809101104736, + "learning_rate": 5.857713911000087e-05, + "loss": 0.0349, + "step": 15220 + }, + { + "epoch": 0.8940941646119526, + "grad_norm": 0.3319094181060791, + "learning_rate": 5.8531577514293324e-05, + "loss": 0.0354, + "step": 15230 + }, + { + "epoch": 0.8946812257837267, + "grad_norm": 2.2337393760681152, + "learning_rate": 5.848600862083378e-05, + "loss": 0.042, + "step": 15240 + }, + { + "epoch": 0.8952682869555008, + "grad_norm": 2.5856029987335205, + "learning_rate": 5.844043246860098e-05, + "loss": 0.0369, + "step": 15250 + }, + { + "epoch": 0.8958553481272749, + "grad_norm": 1.4416180849075317, + "learning_rate": 5.839484909657993e-05, + "loss": 0.0401, + "step": 15260 + }, + { + "epoch": 0.896442409299049, + "grad_norm": 2.852252721786499, + "learning_rate": 5.834925854376181e-05, + "loss": 0.0373, + "step": 15270 + }, + { + "epoch": 0.897029470470823, + "grad_norm": 1.1797451972961426, + "learning_rate": 5.83036608491439e-05, + "loss": 0.0283, + "step": 15280 + }, + { + "epoch": 0.8976165316425971, + "grad_norm": 2.4308016300201416, + "learning_rate": 5.8258056051729634e-05, + "loss": 0.0159, + "step": 15290 + }, + { + "epoch": 0.8982035928143712, + "grad_norm": 1.619468331336975, + "learning_rate": 5.821244419052849e-05, + "loss": 0.0385, + "step": 15300 + }, + { + "epoch": 0.8987906539861453, + "grad_norm": 
1.6315438747406006, + "learning_rate": 5.816682530455602e-05, + "loss": 0.0422, + "step": 15310 + }, + { + "epoch": 0.8993777151579194, + "grad_norm": 1.3823318481445312, + "learning_rate": 5.8121199432833754e-05, + "loss": 0.0248, + "step": 15320 + }, + { + "epoch": 0.8999647763296935, + "grad_norm": 1.8854079246520996, + "learning_rate": 5.807556661438922e-05, + "loss": 0.028, + "step": 15330 + }, + { + "epoch": 0.9005518375014676, + "grad_norm": 3.2190797328948975, + "learning_rate": 5.802992688825587e-05, + "loss": 0.0582, + "step": 15340 + }, + { + "epoch": 0.9011388986732417, + "grad_norm": 2.959430694580078, + "learning_rate": 5.798428029347306e-05, + "loss": 0.0194, + "step": 15350 + }, + { + "epoch": 0.9017259598450158, + "grad_norm": 1.8747074604034424, + "learning_rate": 5.7938626869086066e-05, + "loss": 0.0285, + "step": 15360 + }, + { + "epoch": 0.9023130210167899, + "grad_norm": 1.2437909841537476, + "learning_rate": 5.7892966654145944e-05, + "loss": 0.0316, + "step": 15370 + }, + { + "epoch": 0.9029000821885641, + "grad_norm": 2.640223264694214, + "learning_rate": 5.784729968770961e-05, + "loss": 0.0302, + "step": 15380 + }, + { + "epoch": 0.9034871433603382, + "grad_norm": 1.4964799880981445, + "learning_rate": 5.780162600883974e-05, + "loss": 0.0348, + "step": 15390 + }, + { + "epoch": 0.9040742045321123, + "grad_norm": 1.8708909749984741, + "learning_rate": 5.775594565660472e-05, + "loss": 0.0275, + "step": 15400 + }, + { + "epoch": 0.9046612657038864, + "grad_norm": 1.6548596620559692, + "learning_rate": 5.771025867007868e-05, + "loss": 0.0307, + "step": 15410 + }, + { + "epoch": 0.9052483268756605, + "grad_norm": 1.6874966621398926, + "learning_rate": 5.766456508834142e-05, + "loss": 0.0279, + "step": 15420 + }, + { + "epoch": 0.9058353880474346, + "grad_norm": 2.4529452323913574, + "learning_rate": 5.761886495047837e-05, + "loss": 0.0344, + "step": 15430 + }, + { + "epoch": 0.9064224492192087, + "grad_norm": 1.2588340044021606, + "learning_rate": 5.757315829558057e-05, + "loss": 0.0392, + "step": 15440 + }, + { + "epoch": 0.9070095103909828, + "grad_norm": 0.4524109661579132, + "learning_rate": 5.752744516274465e-05, + "loss": 0.0399, + "step": 15450 + }, + { + "epoch": 0.9075965715627569, + "grad_norm": 0.8751776814460754, + "learning_rate": 5.748172559107277e-05, + "loss": 0.044, + "step": 15460 + }, + { + "epoch": 0.908183632734531, + "grad_norm": 1.0942742824554443, + "learning_rate": 5.7435999619672595e-05, + "loss": 0.0424, + "step": 15470 + }, + { + "epoch": 0.908770693906305, + "grad_norm": 2.3219096660614014, + "learning_rate": 5.739026728765726e-05, + "loss": 0.0278, + "step": 15480 + }, + { + "epoch": 0.9093577550780791, + "grad_norm": 1.6318626403808594, + "learning_rate": 5.7344528634145354e-05, + "loss": 0.0322, + "step": 15490 + }, + { + "epoch": 0.9099448162498532, + "grad_norm": 3.0432188510894775, + "learning_rate": 5.7298783698260874e-05, + "loss": 0.0259, + "step": 15500 + }, + { + "epoch": 0.9105318774216273, + "grad_norm": 2.1943719387054443, + "learning_rate": 5.725303251913317e-05, + "loss": 0.0435, + "step": 15510 + }, + { + "epoch": 0.9111189385934014, + "grad_norm": 4.016179084777832, + "learning_rate": 5.7207275135896945e-05, + "loss": 0.0267, + "step": 15520 + }, + { + "epoch": 0.9117059997651755, + "grad_norm": 1.7722045183181763, + "learning_rate": 5.7161511587692216e-05, + "loss": 0.0351, + "step": 15530 + }, + { + "epoch": 0.9122930609369496, + "grad_norm": 3.096362590789795, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.0359, 
+ "step": 15540 + }, + { + "epoch": 0.9128801221087237, + "grad_norm": 7.351997375488281, + "learning_rate": 5.7069966152963614e-05, + "loss": 0.0431, + "step": 15550 + }, + { + "epoch": 0.9134671832804978, + "grad_norm": 2.606060743331909, + "learning_rate": 5.702418434474601e-05, + "loss": 0.0344, + "step": 15560 + }, + { + "epoch": 0.9140542444522719, + "grad_norm": 0.09482274204492569, + "learning_rate": 5.6978396528172326e-05, + "loss": 0.0246, + "step": 15570 + }, + { + "epoch": 0.914641305624046, + "grad_norm": 2.3057878017425537, + "learning_rate": 5.693260274240863e-05, + "loss": 0.0504, + "step": 15580 + }, + { + "epoch": 0.9152283667958201, + "grad_norm": 1.0935826301574707, + "learning_rate": 5.688680302662607e-05, + "loss": 0.0288, + "step": 15590 + }, + { + "epoch": 0.9158154279675942, + "grad_norm": 2.7622785568237305, + "learning_rate": 5.6840997420000865e-05, + "loss": 0.0312, + "step": 15600 + }, + { + "epoch": 0.9164024891393683, + "grad_norm": 4.465653896331787, + "learning_rate": 5.679518596171425e-05, + "loss": 0.0343, + "step": 15610 + }, + { + "epoch": 0.9169895503111424, + "grad_norm": 3.447021007537842, + "learning_rate": 5.674936869095252e-05, + "loss": 0.0465, + "step": 15620 + }, + { + "epoch": 0.9175766114829165, + "grad_norm": 3.203032970428467, + "learning_rate": 5.670354564690692e-05, + "loss": 0.0325, + "step": 15630 + }, + { + "epoch": 0.9181636726546906, + "grad_norm": 1.981807827949524, + "learning_rate": 5.665771686877358e-05, + "loss": 0.0538, + "step": 15640 + }, + { + "epoch": 0.9187507338264648, + "grad_norm": 1.0509763956069946, + "learning_rate": 5.661188239575364e-05, + "loss": 0.0291, + "step": 15650 + }, + { + "epoch": 0.9193377949982389, + "grad_norm": 3.080498218536377, + "learning_rate": 5.6566042267052997e-05, + "loss": 0.0178, + "step": 15660 + }, + { + "epoch": 0.919924856170013, + "grad_norm": 1.2986247539520264, + "learning_rate": 5.6520196521882475e-05, + "loss": 0.0472, + "step": 15670 + }, + { + "epoch": 0.9205119173417871, + "grad_norm": 1.8869589567184448, + "learning_rate": 5.647434519945767e-05, + "loss": 0.0501, + "step": 15680 + }, + { + "epoch": 0.9210989785135612, + "grad_norm": 1.710890531539917, + "learning_rate": 5.642848833899891e-05, + "loss": 0.0526, + "step": 15690 + }, + { + "epoch": 0.9216860396853352, + "grad_norm": 1.9326368570327759, + "learning_rate": 5.638262597973134e-05, + "loss": 0.0386, + "step": 15700 + }, + { + "epoch": 0.9222731008571093, + "grad_norm": 3.3880348205566406, + "learning_rate": 5.633675816088475e-05, + "loss": 0.0213, + "step": 15710 + }, + { + "epoch": 0.9228601620288834, + "grad_norm": 0.6809608936309814, + "learning_rate": 5.62908849216936e-05, + "loss": 0.0335, + "step": 15720 + }, + { + "epoch": 0.9234472232006575, + "grad_norm": 1.8033440113067627, + "learning_rate": 5.624500630139702e-05, + "loss": 0.0289, + "step": 15730 + }, + { + "epoch": 0.9240342843724316, + "grad_norm": 3.8078174591064453, + "learning_rate": 5.619912233923872e-05, + "loss": 0.032, + "step": 15740 + }, + { + "epoch": 0.9246213455442057, + "grad_norm": 0.1132720559835434, + "learning_rate": 5.615323307446697e-05, + "loss": 0.0487, + "step": 15750 + }, + { + "epoch": 0.9252084067159798, + "grad_norm": 2.2745766639709473, + "learning_rate": 5.610733854633462e-05, + "loss": 0.0627, + "step": 15760 + }, + { + "epoch": 0.9257954678877539, + "grad_norm": 1.6652629375457764, + "learning_rate": 5.6061438794098974e-05, + "loss": 0.0248, + "step": 15770 + }, + { + "epoch": 0.926382529059528, + "grad_norm": 
1.28446626663208, + "learning_rate": 5.601553385702182e-05, + "loss": 0.0255, + "step": 15780 + }, + { + "epoch": 0.9269695902313021, + "grad_norm": 2.2817840576171875, + "learning_rate": 5.5969623774369396e-05, + "loss": 0.0362, + "step": 15790 + }, + { + "epoch": 0.9275566514030762, + "grad_norm": 1.6001989841461182, + "learning_rate": 5.592370858541232e-05, + "loss": 0.0257, + "step": 15800 + }, + { + "epoch": 0.9281437125748503, + "grad_norm": 1.3455231189727783, + "learning_rate": 5.587778832942556e-05, + "loss": 0.0362, + "step": 15810 + }, + { + "epoch": 0.9287307737466244, + "grad_norm": 1.5758661031723022, + "learning_rate": 5.583186304568849e-05, + "loss": 0.0451, + "step": 15820 + }, + { + "epoch": 0.9293178349183985, + "grad_norm": 4.79602575302124, + "learning_rate": 5.578593277348473e-05, + "loss": 0.0241, + "step": 15830 + }, + { + "epoch": 0.9299048960901726, + "grad_norm": 2.943692207336426, + "learning_rate": 5.573999755210215e-05, + "loss": 0.0197, + "step": 15840 + }, + { + "epoch": 0.9304919572619467, + "grad_norm": 1.6917659044265747, + "learning_rate": 5.56940574208329e-05, + "loss": 0.0289, + "step": 15850 + }, + { + "epoch": 0.9310790184337208, + "grad_norm": 3.5683882236480713, + "learning_rate": 5.564811241897333e-05, + "loss": 0.0483, + "step": 15860 + }, + { + "epoch": 0.9316660796054949, + "grad_norm": 1.0227922201156616, + "learning_rate": 5.5602162585823894e-05, + "loss": 0.0286, + "step": 15870 + }, + { + "epoch": 0.932253140777269, + "grad_norm": 1.409133791923523, + "learning_rate": 5.555620796068925e-05, + "loss": 0.038, + "step": 15880 + }, + { + "epoch": 0.932840201949043, + "grad_norm": 2.680568218231201, + "learning_rate": 5.551024858287812e-05, + "loss": 0.0354, + "step": 15890 + }, + { + "epoch": 0.9334272631208171, + "grad_norm": 2.6215765476226807, + "learning_rate": 5.546428449170329e-05, + "loss": 0.0361, + "step": 15900 + }, + { + "epoch": 0.9340143242925912, + "grad_norm": 1.0609687566757202, + "learning_rate": 5.54183157264816e-05, + "loss": 0.027, + "step": 15910 + }, + { + "epoch": 0.9346013854643654, + "grad_norm": 2.0819435119628906, + "learning_rate": 5.537234232653386e-05, + "loss": 0.0165, + "step": 15920 + }, + { + "epoch": 0.9351884466361395, + "grad_norm": 1.355362057685852, + "learning_rate": 5.532636433118484e-05, + "loss": 0.0381, + "step": 15930 + }, + { + "epoch": 0.9357755078079136, + "grad_norm": 1.5677322149276733, + "learning_rate": 5.52803817797633e-05, + "loss": 0.027, + "step": 15940 + }, + { + "epoch": 0.9363625689796877, + "grad_norm": 2.1124112606048584, + "learning_rate": 5.523439471160181e-05, + "loss": 0.0269, + "step": 15950 + }, + { + "epoch": 0.9369496301514618, + "grad_norm": 1.3802322149276733, + "learning_rate": 5.518840316603689e-05, + "loss": 0.0253, + "step": 15960 + }, + { + "epoch": 0.9375366913232359, + "grad_norm": 1.363916039466858, + "learning_rate": 5.514240718240884e-05, + "loss": 0.0335, + "step": 15970 + }, + { + "epoch": 0.93812375249501, + "grad_norm": 1.7724204063415527, + "learning_rate": 5.509640680006175e-05, + "loss": 0.049, + "step": 15980 + }, + { + "epoch": 0.9387108136667841, + "grad_norm": 1.3622511625289917, + "learning_rate": 5.5050402058343476e-05, + "loss": 0.0259, + "step": 15990 + }, + { + "epoch": 0.9392978748385582, + "grad_norm": 1.2243056297302246, + "learning_rate": 5.500439299660566e-05, + "loss": 0.0325, + "step": 16000 + }, + { + "epoch": 0.9398849360103323, + "grad_norm": 0.9930700063705444, + "learning_rate": 5.495837965420356e-05, + "loss": 0.0272, + "step": 16010 + 
}, + { + "epoch": 0.9404719971821064, + "grad_norm": 0.009694311767816544, + "learning_rate": 5.491236207049613e-05, + "loss": 0.0179, + "step": 16020 + }, + { + "epoch": 0.9410590583538805, + "grad_norm": 3.36891770362854, + "learning_rate": 5.4866340284845955e-05, + "loss": 0.0151, + "step": 16030 + }, + { + "epoch": 0.9416461195256546, + "grad_norm": 2.4878575801849365, + "learning_rate": 5.4820314336619214e-05, + "loss": 0.0406, + "step": 16040 + }, + { + "epoch": 0.9422331806974287, + "grad_norm": 1.6236851215362549, + "learning_rate": 5.477428426518565e-05, + "loss": 0.0439, + "step": 16050 + }, + { + "epoch": 0.9428202418692028, + "grad_norm": 2.8964767456054688, + "learning_rate": 5.472825010991852e-05, + "loss": 0.0363, + "step": 16060 + }, + { + "epoch": 0.9434073030409769, + "grad_norm": 1.8873333930969238, + "learning_rate": 5.468221191019457e-05, + "loss": 0.0245, + "step": 16070 + }, + { + "epoch": 0.943994364212751, + "grad_norm": 3.159353017807007, + "learning_rate": 5.463616970539403e-05, + "loss": 0.0413, + "step": 16080 + }, + { + "epoch": 0.944581425384525, + "grad_norm": 2.711113452911377, + "learning_rate": 5.459012353490054e-05, + "loss": 0.0408, + "step": 16090 + }, + { + "epoch": 0.9451684865562991, + "grad_norm": 2.389240264892578, + "learning_rate": 5.454407343810112e-05, + "loss": 0.0256, + "step": 16100 + }, + { + "epoch": 0.9457555477280732, + "grad_norm": 0.7224491238594055, + "learning_rate": 5.449801945438619e-05, + "loss": 0.026, + "step": 16110 + }, + { + "epoch": 0.9463426088998473, + "grad_norm": 0.4595985412597656, + "learning_rate": 5.445196162314944e-05, + "loss": 0.0868, + "step": 16120 + }, + { + "epoch": 0.9469296700716214, + "grad_norm": 2.019042730331421, + "learning_rate": 5.440589998378788e-05, + "loss": 0.0402, + "step": 16130 + }, + { + "epoch": 0.9475167312433955, + "grad_norm": 0.10347907990217209, + "learning_rate": 5.435983457570179e-05, + "loss": 0.0232, + "step": 16140 + }, + { + "epoch": 0.9481037924151696, + "grad_norm": 1.186431646347046, + "learning_rate": 5.431376543829467e-05, + "loss": 0.0155, + "step": 16150 + }, + { + "epoch": 0.9486908535869437, + "grad_norm": 3.5603644847869873, + "learning_rate": 5.426769261097317e-05, + "loss": 0.0635, + "step": 16160 + }, + { + "epoch": 0.9492779147587178, + "grad_norm": 1.3630090951919556, + "learning_rate": 5.422161613314715e-05, + "loss": 0.0289, + "step": 16170 + }, + { + "epoch": 0.9498649759304919, + "grad_norm": 0.08817755430936813, + "learning_rate": 5.4175536044229555e-05, + "loss": 0.0487, + "step": 16180 + }, + { + "epoch": 0.950452037102266, + "grad_norm": 1.633805274963379, + "learning_rate": 5.412945238363643e-05, + "loss": 0.0301, + "step": 16190 + }, + { + "epoch": 0.9510390982740402, + "grad_norm": 2.0507123470306396, + "learning_rate": 5.408336519078688e-05, + "loss": 0.0401, + "step": 16200 + }, + { + "epoch": 0.9516261594458143, + "grad_norm": 1.2019133567810059, + "learning_rate": 5.403727450510304e-05, + "loss": 0.0485, + "step": 16210 + }, + { + "epoch": 0.9522132206175884, + "grad_norm": 2.6347334384918213, + "learning_rate": 5.399118036601001e-05, + "loss": 0.0501, + "step": 16220 + }, + { + "epoch": 0.9528002817893625, + "grad_norm": 0.49063390493392944, + "learning_rate": 5.3945082812935857e-05, + "loss": 0.0157, + "step": 16230 + }, + { + "epoch": 0.9533873429611366, + "grad_norm": 0.9728673696517944, + "learning_rate": 5.389898188531156e-05, + "loss": 0.0261, + "step": 16240 + }, + { + "epoch": 0.9539744041329107, + "grad_norm": 1.6232997179031372, + 
"learning_rate": 5.385287762257101e-05, + "loss": 0.0252, + "step": 16250 + }, + { + "epoch": 0.9545614653046848, + "grad_norm": 3.001466751098633, + "learning_rate": 5.380677006415093e-05, + "loss": 0.0366, + "step": 16260 + }, + { + "epoch": 0.9551485264764589, + "grad_norm": 4.364900588989258, + "learning_rate": 5.376065924949083e-05, + "loss": 0.0494, + "step": 16270 + }, + { + "epoch": 0.955735587648233, + "grad_norm": 1.7932524681091309, + "learning_rate": 5.3714545218033076e-05, + "loss": 0.0382, + "step": 16280 + }, + { + "epoch": 0.9563226488200071, + "grad_norm": 2.1642203330993652, + "learning_rate": 5.366842800922274e-05, + "loss": 0.0172, + "step": 16290 + }, + { + "epoch": 0.9569097099917812, + "grad_norm": 1.7561495304107666, + "learning_rate": 5.362230766250761e-05, + "loss": 0.0262, + "step": 16300 + }, + { + "epoch": 0.9574967711635552, + "grad_norm": 0.9882838129997253, + "learning_rate": 5.3576184217338185e-05, + "loss": 0.0287, + "step": 16310 + }, + { + "epoch": 0.9580838323353293, + "grad_norm": 1.2872587442398071, + "learning_rate": 5.35300577131676e-05, + "loss": 0.0305, + "step": 16320 + }, + { + "epoch": 0.9586708935071034, + "grad_norm": 3.7865941524505615, + "learning_rate": 5.3483928189451585e-05, + "loss": 0.042, + "step": 16330 + }, + { + "epoch": 0.9592579546788775, + "grad_norm": 0.8969483375549316, + "learning_rate": 5.343779568564848e-05, + "loss": 0.0551, + "step": 16340 + }, + { + "epoch": 0.9598450158506516, + "grad_norm": 3.019232988357544, + "learning_rate": 5.339166024121919e-05, + "loss": 0.015, + "step": 16350 + }, + { + "epoch": 0.9604320770224257, + "grad_norm": 0.903888463973999, + "learning_rate": 5.334552189562707e-05, + "loss": 0.0369, + "step": 16360 + }, + { + "epoch": 0.9610191381941998, + "grad_norm": 2.1453328132629395, + "learning_rate": 5.329938068833803e-05, + "loss": 0.0372, + "step": 16370 + }, + { + "epoch": 0.9616061993659739, + "grad_norm": 1.4702128171920776, + "learning_rate": 5.3253236658820396e-05, + "loss": 0.0214, + "step": 16380 + }, + { + "epoch": 0.962193260537748, + "grad_norm": 0.5381457805633545, + "learning_rate": 5.320708984654489e-05, + "loss": 0.0245, + "step": 16390 + }, + { + "epoch": 0.9627803217095221, + "grad_norm": 1.984392523765564, + "learning_rate": 5.316094029098465e-05, + "loss": 0.0327, + "step": 16400 + }, + { + "epoch": 0.9633673828812962, + "grad_norm": 1.3957103490829468, + "learning_rate": 5.311478803161513e-05, + "loss": 0.0307, + "step": 16410 + }, + { + "epoch": 0.9639544440530703, + "grad_norm": 3.6839656829833984, + "learning_rate": 5.306863310791411e-05, + "loss": 0.0268, + "step": 16420 + }, + { + "epoch": 0.9645415052248444, + "grad_norm": 1.6362731456756592, + "learning_rate": 5.302247555936168e-05, + "loss": 0.0291, + "step": 16430 + }, + { + "epoch": 0.9651285663966185, + "grad_norm": 2.45932936668396, + "learning_rate": 5.2976315425440123e-05, + "loss": 0.0424, + "step": 16440 + }, + { + "epoch": 0.9657156275683926, + "grad_norm": 1.0357054471969604, + "learning_rate": 5.293015274563394e-05, + "loss": 0.0336, + "step": 16450 + }, + { + "epoch": 0.9663026887401667, + "grad_norm": 3.641936779022217, + "learning_rate": 5.288398755942985e-05, + "loss": 0.0303, + "step": 16460 + }, + { + "epoch": 0.9668897499119409, + "grad_norm": 2.652996778488159, + "learning_rate": 5.283781990631668e-05, + "loss": 0.0325, + "step": 16470 + }, + { + "epoch": 0.967476811083715, + "grad_norm": 2.6950008869171143, + "learning_rate": 5.279164982578536e-05, + "loss": 0.0167, + "step": 16480 + }, + { + 
"epoch": 0.9680638722554891, + "grad_norm": 2.224520444869995, + "learning_rate": 5.2745477357328955e-05, + "loss": 0.0267, + "step": 16490 + }, + { + "epoch": 0.9686509334272632, + "grad_norm": 0.9650734066963196, + "learning_rate": 5.26993025404425e-05, + "loss": 0.0111, + "step": 16500 + }, + { + "epoch": 0.9692379945990373, + "grad_norm": 3.18520450592041, + "learning_rate": 5.265312541462308e-05, + "loss": 0.037, + "step": 16510 + }, + { + "epoch": 0.9698250557708114, + "grad_norm": 3.1743884086608887, + "learning_rate": 5.260694601936975e-05, + "loss": 0.0426, + "step": 16520 + }, + { + "epoch": 0.9704121169425854, + "grad_norm": 3.833486557006836, + "learning_rate": 5.2560764394183494e-05, + "loss": 0.0479, + "step": 16530 + }, + { + "epoch": 0.9709991781143595, + "grad_norm": 0.6492809653282166, + "learning_rate": 5.2514580578567216e-05, + "loss": 0.0337, + "step": 16540 + }, + { + "epoch": 0.9715862392861336, + "grad_norm": 2.115954875946045, + "learning_rate": 5.2468394612025686e-05, + "loss": 0.0286, + "step": 16550 + }, + { + "epoch": 0.9721733004579077, + "grad_norm": 0.4123615324497223, + "learning_rate": 5.242220653406553e-05, + "loss": 0.0276, + "step": 16560 + }, + { + "epoch": 0.9727603616296818, + "grad_norm": 1.5601547956466675, + "learning_rate": 5.2376016384195136e-05, + "loss": 0.0153, + "step": 16570 + }, + { + "epoch": 0.9733474228014559, + "grad_norm": 0.18316815793514252, + "learning_rate": 5.232982420192474e-05, + "loss": 0.0352, + "step": 16580 + }, + { + "epoch": 0.97393448397323, + "grad_norm": 1.990715503692627, + "learning_rate": 5.2283630026766225e-05, + "loss": 0.0365, + "step": 16590 + }, + { + "epoch": 0.9745215451450041, + "grad_norm": 1.6039259433746338, + "learning_rate": 5.223743389823327e-05, + "loss": 0.0394, + "step": 16600 + }, + { + "epoch": 0.9751086063167782, + "grad_norm": 0.5257664322853088, + "learning_rate": 5.2191235855841146e-05, + "loss": 0.0366, + "step": 16610 + }, + { + "epoch": 0.9756956674885523, + "grad_norm": 0.4390442371368408, + "learning_rate": 5.21450359391068e-05, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.9762827286603264, + "grad_norm": 1.3897521495819092, + "learning_rate": 5.2098834187548805e-05, + "loss": 0.0362, + "step": 16630 + }, + { + "epoch": 0.9768697898321005, + "grad_norm": 7.759096622467041, + "learning_rate": 5.205263064068725e-05, + "loss": 0.0402, + "step": 16640 + }, + { + "epoch": 0.9774568510038746, + "grad_norm": 2.1783852577209473, + "learning_rate": 5.200642533804379e-05, + "loss": 0.0263, + "step": 16650 + }, + { + "epoch": 0.9780439121756487, + "grad_norm": 2.72682785987854, + "learning_rate": 5.196021831914157e-05, + "loss": 0.029, + "step": 16660 + }, + { + "epoch": 0.9786309733474228, + "grad_norm": 1.675666093826294, + "learning_rate": 5.191400962350523e-05, + "loss": 0.0346, + "step": 16670 + }, + { + "epoch": 0.9792180345191969, + "grad_norm": 0.6754209995269775, + "learning_rate": 5.1867799290660815e-05, + "loss": 0.0467, + "step": 16680 + }, + { + "epoch": 0.979805095690971, + "grad_norm": 0.1948157697916031, + "learning_rate": 5.182158736013577e-05, + "loss": 0.0374, + "step": 16690 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 0.401326984167099, + "learning_rate": 5.177537387145894e-05, + "loss": 0.0579, + "step": 16700 + }, + { + "epoch": 0.9809792180345192, + "grad_norm": 2.034702777862549, + "learning_rate": 5.1729158864160466e-05, + "loss": 0.0561, + "step": 16710 + }, + { + "epoch": 0.9815662792062932, + "grad_norm": 3.014551877975464, + "learning_rate": 
5.16829423777718e-05, + "loss": 0.0195, + "step": 16720 + }, + { + "epoch": 0.9821533403780673, + "grad_norm": 0.5864536166191101, + "learning_rate": 5.163672445182568e-05, + "loss": 0.0139, + "step": 16730 + }, + { + "epoch": 0.9827404015498415, + "grad_norm": 0.5289451479911804, + "learning_rate": 5.1590505125856025e-05, + "loss": 0.0288, + "step": 16740 + }, + { + "epoch": 0.9833274627216156, + "grad_norm": 1.2977737188339233, + "learning_rate": 5.1544284439398006e-05, + "loss": 0.0436, + "step": 16750 + }, + { + "epoch": 0.9839145238933897, + "grad_norm": 2.5961172580718994, + "learning_rate": 5.149806243198794e-05, + "loss": 0.0236, + "step": 16760 + }, + { + "epoch": 0.9845015850651638, + "grad_norm": 1.1894290447235107, + "learning_rate": 5.1451839143163226e-05, + "loss": 0.0288, + "step": 16770 + }, + { + "epoch": 0.9850886462369379, + "grad_norm": 0.8295955657958984, + "learning_rate": 5.140561461246246e-05, + "loss": 0.0374, + "step": 16780 + }, + { + "epoch": 0.985675707408712, + "grad_norm": 3.0590317249298096, + "learning_rate": 5.13593888794252e-05, + "loss": 0.0418, + "step": 16790 + }, + { + "epoch": 0.9862627685804861, + "grad_norm": 1.7654225826263428, + "learning_rate": 5.1313161983592096e-05, + "loss": 0.0246, + "step": 16800 + }, + { + "epoch": 0.9868498297522602, + "grad_norm": 2.3438401222229004, + "learning_rate": 5.126693396450476e-05, + "loss": 0.0398, + "step": 16810 + }, + { + "epoch": 0.9874368909240343, + "grad_norm": 3.1865170001983643, + "learning_rate": 5.1220704861705774e-05, + "loss": 0.0475, + "step": 16820 + }, + { + "epoch": 0.9880239520958084, + "grad_norm": 0.756794273853302, + "learning_rate": 5.117447471473865e-05, + "loss": 0.0131, + "step": 16830 + }, + { + "epoch": 0.9886110132675825, + "grad_norm": 3.404741048812866, + "learning_rate": 5.1128243563147816e-05, + "loss": 0.0352, + "step": 16840 + }, + { + "epoch": 0.9891980744393566, + "grad_norm": 3.5811452865600586, + "learning_rate": 5.108201144647851e-05, + "loss": 0.0219, + "step": 16850 + }, + { + "epoch": 0.9897851356111307, + "grad_norm": 2.3117177486419678, + "learning_rate": 5.1035778404276815e-05, + "loss": 0.0413, + "step": 16860 + }, + { + "epoch": 0.9903721967829048, + "grad_norm": 1.9933563470840454, + "learning_rate": 5.098954447608964e-05, + "loss": 0.0286, + "step": 16870 + }, + { + "epoch": 0.9909592579546789, + "grad_norm": 2.5051896572113037, + "learning_rate": 5.0943309701464584e-05, + "loss": 0.0225, + "step": 16880 + }, + { + "epoch": 0.991546319126453, + "grad_norm": 2.6229381561279297, + "learning_rate": 5.089707411995005e-05, + "loss": 0.0341, + "step": 16890 + }, + { + "epoch": 0.9921333802982271, + "grad_norm": 1.0904333591461182, + "learning_rate": 5.0850837771095074e-05, + "loss": 0.0527, + "step": 16900 + }, + { + "epoch": 0.9927204414700012, + "grad_norm": 1.3770349025726318, + "learning_rate": 5.080460069444936e-05, + "loss": 0.0301, + "step": 16910 + }, + { + "epoch": 0.9933075026417753, + "grad_norm": 1.2007999420166016, + "learning_rate": 5.0758362929563244e-05, + "loss": 0.0345, + "step": 16920 + }, + { + "epoch": 0.9938945638135493, + "grad_norm": 2.2936861515045166, + "learning_rate": 5.071212451598765e-05, + "loss": 0.0531, + "step": 16930 + }, + { + "epoch": 0.9944816249853234, + "grad_norm": 2.625542640686035, + "learning_rate": 5.066588549327403e-05, + "loss": 0.0427, + "step": 16940 + }, + { + "epoch": 0.9950686861570975, + "grad_norm": 3.829827308654785, + "learning_rate": 5.061964590097442e-05, + "loss": 0.043, + "step": 16950 + }, + { + "epoch": 
0.9956557473288716, + "grad_norm": 1.3408961296081543, + "learning_rate": 5.057340577864127e-05, + "loss": 0.0471, + "step": 16960 + }, + { + "epoch": 0.9962428085006457, + "grad_norm": 1.6706929206848145, + "learning_rate": 5.052716516582753e-05, + "loss": 0.0285, + "step": 16970 + }, + { + "epoch": 0.9968298696724198, + "grad_norm": 0.8629145622253418, + "learning_rate": 5.048092410208656e-05, + "loss": 0.0317, + "step": 16980 + }, + { + "epoch": 0.9974169308441939, + "grad_norm": 0.9367380738258362, + "learning_rate": 5.0434682626972105e-05, + "loss": 0.0344, + "step": 16990 + }, + { + "epoch": 0.998003992015968, + "grad_norm": 2.3083112239837646, + "learning_rate": 5.0388440780038235e-05, + "loss": 0.0218, + "step": 17000 + }, + { + "epoch": 0.9985910531877422, + "grad_norm": 1.1551401615142822, + "learning_rate": 5.0342198600839394e-05, + "loss": 0.0372, + "step": 17010 + }, + { + "epoch": 0.9991781143595163, + "grad_norm": 1.9685473442077637, + "learning_rate": 5.029595612893027e-05, + "loss": 0.0516, + "step": 17020 + }, + { + "epoch": 0.9997651755312904, + "grad_norm": 2.787243127822876, + "learning_rate": 5.024971340386577e-05, + "loss": 0.0446, + "step": 17030 + }, + { + "epoch": 1.0003522367030644, + "grad_norm": 1.3453413248062134, + "learning_rate": 5.020347046520112e-05, + "loss": 0.0236, + "step": 17040 + }, + { + "epoch": 1.0009392978748386, + "grad_norm": 0.5187767148017883, + "learning_rate": 5.015722735249163e-05, + "loss": 0.0238, + "step": 17050 + }, + { + "epoch": 1.0015263590466126, + "grad_norm": 0.7118642330169678, + "learning_rate": 5.0110984105292793e-05, + "loss": 0.0219, + "step": 17060 + }, + { + "epoch": 1.0021134202183868, + "grad_norm": 0.29987266659736633, + "learning_rate": 5.0064740763160224e-05, + "loss": 0.0198, + "step": 17070 + }, + { + "epoch": 1.0027004813901608, + "grad_norm": 1.877467155456543, + "learning_rate": 5.001849736564961e-05, + "loss": 0.0416, + "step": 17080 + }, + { + "epoch": 1.003287542561935, + "grad_norm": 0.24835659563541412, + "learning_rate": 4.99722539523167e-05, + "loss": 0.0258, + "step": 17090 + }, + { + "epoch": 1.003874603733709, + "grad_norm": 1.0357491970062256, + "learning_rate": 4.9926010562717255e-05, + "loss": 0.0224, + "step": 17100 + }, + { + "epoch": 1.0044616649054832, + "grad_norm": 2.34885311126709, + "learning_rate": 4.987976723640698e-05, + "loss": 0.0048, + "step": 17110 + }, + { + "epoch": 1.0050487260772571, + "grad_norm": 0.7573207020759583, + "learning_rate": 4.983352401294157e-05, + "loss": 0.0131, + "step": 17120 + }, + { + "epoch": 1.0056357872490314, + "grad_norm": 1.1532341241836548, + "learning_rate": 4.97872809318766e-05, + "loss": 0.013, + "step": 17130 + }, + { + "epoch": 1.0062228484208053, + "grad_norm": 0.11428312957286835, + "learning_rate": 4.974103803276756e-05, + "loss": 0.0192, + "step": 17140 + }, + { + "epoch": 1.0068099095925795, + "grad_norm": 0.08842333406209946, + "learning_rate": 4.9694795355169734e-05, + "loss": 0.0042, + "step": 17150 + }, + { + "epoch": 1.0073969707643537, + "grad_norm": 1.681483507156372, + "learning_rate": 4.964855293863828e-05, + "loss": 0.0105, + "step": 17160 + }, + { + "epoch": 1.0079840319361277, + "grad_norm": 1.8980822563171387, + "learning_rate": 4.960231082272805e-05, + "loss": 0.016, + "step": 17170 + }, + { + "epoch": 1.008571093107902, + "grad_norm": 4.2246246337890625, + "learning_rate": 4.955606904699371e-05, + "loss": 0.0209, + "step": 17180 + }, + { + "epoch": 1.009158154279676, + "grad_norm": 0.0430564284324646, + "learning_rate": 
4.950982765098965e-05, + "loss": 0.013, + "step": 17190 + }, + { + "epoch": 1.0097452154514501, + "grad_norm": 0.5098704099655151, + "learning_rate": 4.946358667426984e-05, + "loss": 0.015, + "step": 17200 + }, + { + "epoch": 1.0103322766232241, + "grad_norm": 1.0906519889831543, + "learning_rate": 4.941734615638797e-05, + "loss": 0.0214, + "step": 17210 + }, + { + "epoch": 1.0109193377949983, + "grad_norm": 3.039210081100464, + "learning_rate": 4.93711061368973e-05, + "loss": 0.0182, + "step": 17220 + }, + { + "epoch": 1.0115063989667723, + "grad_norm": 2.438016176223755, + "learning_rate": 4.9324866655350706e-05, + "loss": 0.0399, + "step": 17230 + }, + { + "epoch": 1.0120934601385465, + "grad_norm": 1.5391802787780762, + "learning_rate": 4.927862775130055e-05, + "loss": 0.0259, + "step": 17240 + }, + { + "epoch": 1.0126805213103205, + "grad_norm": 0.10367631912231445, + "learning_rate": 4.923238946429876e-05, + "loss": 0.0084, + "step": 17250 + }, + { + "epoch": 1.0132675824820947, + "grad_norm": 0.026676097884774208, + "learning_rate": 4.918615183389665e-05, + "loss": 0.0143, + "step": 17260 + }, + { + "epoch": 1.0138546436538687, + "grad_norm": 0.3782263994216919, + "learning_rate": 4.9139914899645096e-05, + "loss": 0.0095, + "step": 17270 + }, + { + "epoch": 1.0144417048256429, + "grad_norm": 1.0386979579925537, + "learning_rate": 4.909367870109424e-05, + "loss": 0.0133, + "step": 17280 + }, + { + "epoch": 1.0150287659974169, + "grad_norm": 1.7258808612823486, + "learning_rate": 4.90474432777937e-05, + "loss": 0.0163, + "step": 17290 + }, + { + "epoch": 1.015615827169191, + "grad_norm": 0.11780045926570892, + "learning_rate": 4.900120866929238e-05, + "loss": 0.014, + "step": 17300 + }, + { + "epoch": 1.016202888340965, + "grad_norm": 0.9828274846076965, + "learning_rate": 4.89549749151385e-05, + "loss": 0.0162, + "step": 17310 + }, + { + "epoch": 1.0167899495127393, + "grad_norm": 0.4108709692955017, + "learning_rate": 4.890874205487957e-05, + "loss": 0.032, + "step": 17320 + }, + { + "epoch": 1.0173770106845132, + "grad_norm": 0.6657062768936157, + "learning_rate": 4.8862510128062284e-05, + "loss": 0.0198, + "step": 17330 + }, + { + "epoch": 1.0179640718562875, + "grad_norm": 0.655598521232605, + "learning_rate": 4.881627917423261e-05, + "loss": 0.0099, + "step": 17340 + }, + { + "epoch": 1.0185511330280614, + "grad_norm": 0.5765482783317566, + "learning_rate": 4.8770049232935575e-05, + "loss": 0.012, + "step": 17350 + }, + { + "epoch": 1.0191381941998356, + "grad_norm": 0.2732393145561218, + "learning_rate": 4.8723820343715484e-05, + "loss": 0.0265, + "step": 17360 + }, + { + "epoch": 1.0197252553716096, + "grad_norm": 1.265217661857605, + "learning_rate": 4.867759254611561e-05, + "loss": 0.0215, + "step": 17370 + }, + { + "epoch": 1.0203123165433838, + "grad_norm": 2.5435831546783447, + "learning_rate": 4.8631365879678384e-05, + "loss": 0.018, + "step": 17380 + }, + { + "epoch": 1.0208993777151578, + "grad_norm": 1.1228276491165161, + "learning_rate": 4.85851403839452e-05, + "loss": 0.0135, + "step": 17390 + }, + { + "epoch": 1.021486438886932, + "grad_norm": 1.6964856386184692, + "learning_rate": 4.85389160984565e-05, + "loss": 0.0135, + "step": 17400 + }, + { + "epoch": 1.022073500058706, + "grad_norm": 2.8319220542907715, + "learning_rate": 4.8492693062751675e-05, + "loss": 0.0078, + "step": 17410 + }, + { + "epoch": 1.0226605612304802, + "grad_norm": 1.0305736064910889, + "learning_rate": 4.844647131636907e-05, + "loss": 0.012, + "step": 17420 + }, + { + "epoch": 
1.0232476224022544, + "grad_norm": 0.9862768650054932, + "learning_rate": 4.840025089884583e-05, + "loss": 0.0285, + "step": 17430 + }, + { + "epoch": 1.0238346835740284, + "grad_norm": 1.3408020734786987, + "learning_rate": 4.8354031849718126e-05, + "loss": 0.0184, + "step": 17440 + }, + { + "epoch": 1.0244217447458026, + "grad_norm": 0.399573415517807, + "learning_rate": 4.8307814208520806e-05, + "loss": 0.0214, + "step": 17450 + }, + { + "epoch": 1.0250088059175766, + "grad_norm": 3.679910182952881, + "learning_rate": 4.82615980147876e-05, + "loss": 0.0093, + "step": 17460 + }, + { + "epoch": 1.0255958670893508, + "grad_norm": 1.3835197687149048, + "learning_rate": 4.821538330805098e-05, + "loss": 0.0151, + "step": 17470 + }, + { + "epoch": 1.0261829282611248, + "grad_norm": 0.055284079164266586, + "learning_rate": 4.816917012784213e-05, + "loss": 0.0202, + "step": 17480 + }, + { + "epoch": 1.026769989432899, + "grad_norm": 0.9927650094032288, + "learning_rate": 4.812295851369096e-05, + "loss": 0.0129, + "step": 17490 + }, + { + "epoch": 1.027357050604673, + "grad_norm": 1.693311333656311, + "learning_rate": 4.807674850512601e-05, + "loss": 0.0172, + "step": 17500 + }, + { + "epoch": 1.0279441117764472, + "grad_norm": 1.8846806287765503, + "learning_rate": 4.803054014167447e-05, + "loss": 0.0226, + "step": 17510 + }, + { + "epoch": 1.0285311729482212, + "grad_norm": 1.4450979232788086, + "learning_rate": 4.7984333462862066e-05, + "loss": 0.0227, + "step": 17520 + }, + { + "epoch": 1.0291182341199954, + "grad_norm": 0.4159143567085266, + "learning_rate": 4.793812850821319e-05, + "loss": 0.0224, + "step": 17530 + }, + { + "epoch": 1.0297052952917694, + "grad_norm": 0.6583952307701111, + "learning_rate": 4.789192531725066e-05, + "loss": 0.0194, + "step": 17540 + }, + { + "epoch": 1.0302923564635436, + "grad_norm": 1.147028923034668, + "learning_rate": 4.784572392949583e-05, + "loss": 0.0385, + "step": 17550 + }, + { + "epoch": 1.0308794176353175, + "grad_norm": 1.676843523979187, + "learning_rate": 4.77995243844685e-05, + "loss": 0.023, + "step": 17560 + }, + { + "epoch": 1.0314664788070917, + "grad_norm": 2.3612804412841797, + "learning_rate": 4.775332672168691e-05, + "loss": 0.016, + "step": 17570 + }, + { + "epoch": 1.0320535399788657, + "grad_norm": 0.7377216815948486, + "learning_rate": 4.770713098066765e-05, + "loss": 0.0137, + "step": 17580 + }, + { + "epoch": 1.03264060115064, + "grad_norm": 0.525375485420227, + "learning_rate": 4.7660937200925726e-05, + "loss": 0.0116, + "step": 17590 + }, + { + "epoch": 1.033227662322414, + "grad_norm": 0.29516294598579407, + "learning_rate": 4.7614745421974447e-05, + "loss": 0.0085, + "step": 17600 + }, + { + "epoch": 1.0338147234941881, + "grad_norm": 1.0876867771148682, + "learning_rate": 4.7568555683325325e-05, + "loss": 0.028, + "step": 17610 + }, + { + "epoch": 1.034401784665962, + "grad_norm": 2.252126932144165, + "learning_rate": 4.752236802448829e-05, + "loss": 0.0176, + "step": 17620 + }, + { + "epoch": 1.0349888458377363, + "grad_norm": 0.04606296122074127, + "learning_rate": 4.747618248497134e-05, + "loss": 0.0158, + "step": 17630 + }, + { + "epoch": 1.0355759070095103, + "grad_norm": 2.971982717514038, + "learning_rate": 4.742999910428075e-05, + "loss": 0.0142, + "step": 17640 + }, + { + "epoch": 1.0361629681812845, + "grad_norm": 0.5577654838562012, + "learning_rate": 4.73838179219209e-05, + "loss": 0.0238, + "step": 17650 + }, + { + "epoch": 1.0367500293530585, + "grad_norm": 0.7843875288963318, + "learning_rate": 
4.7337638977394336e-05, + "loss": 0.0108, + "step": 17660 + }, + { + "epoch": 1.0373370905248327, + "grad_norm": 0.3187924325466156, + "learning_rate": 4.729146231020164e-05, + "loss": 0.0183, + "step": 17670 + }, + { + "epoch": 1.0379241516966067, + "grad_norm": 2.111429214477539, + "learning_rate": 4.724528795984151e-05, + "loss": 0.0226, + "step": 17680 + }, + { + "epoch": 1.0385112128683809, + "grad_norm": 0.011057616211473942, + "learning_rate": 4.719911596581057e-05, + "loss": 0.0071, + "step": 17690 + }, + { + "epoch": 1.039098274040155, + "grad_norm": 1.7219300270080566, + "learning_rate": 4.715294636760352e-05, + "loss": 0.0129, + "step": 17700 + }, + { + "epoch": 1.039685335211929, + "grad_norm": 0.33654457330703735, + "learning_rate": 4.7106779204712946e-05, + "loss": 0.0133, + "step": 17710 + }, + { + "epoch": 1.0402723963837033, + "grad_norm": 0.4673077464103699, + "learning_rate": 4.7060614516629396e-05, + "loss": 0.0127, + "step": 17720 + }, + { + "epoch": 1.0408594575554773, + "grad_norm": 1.9815365076065063, + "learning_rate": 4.701445234284127e-05, + "loss": 0.0263, + "step": 17730 + }, + { + "epoch": 1.0414465187272515, + "grad_norm": 1.209466814994812, + "learning_rate": 4.696829272283483e-05, + "loss": 0.0153, + "step": 17740 + }, + { + "epoch": 1.0420335798990255, + "grad_norm": 1.3966259956359863, + "learning_rate": 4.6922135696094175e-05, + "loss": 0.0092, + "step": 17750 + }, + { + "epoch": 1.0426206410707997, + "grad_norm": 1.4317821264266968, + "learning_rate": 4.687598130210112e-05, + "loss": 0.0253, + "step": 17760 + }, + { + "epoch": 1.0432077022425736, + "grad_norm": 0.2546725273132324, + "learning_rate": 4.682982958033533e-05, + "loss": 0.016, + "step": 17770 + }, + { + "epoch": 1.0437947634143478, + "grad_norm": 2.701737880706787, + "learning_rate": 4.678368057027407e-05, + "loss": 0.0245, + "step": 17780 + }, + { + "epoch": 1.0443818245861218, + "grad_norm": 2.189082145690918, + "learning_rate": 4.6737534311392375e-05, + "loss": 0.0196, + "step": 17790 + }, + { + "epoch": 1.044968885757896, + "grad_norm": 4.402464389801025, + "learning_rate": 4.669139084316286e-05, + "loss": 0.0162, + "step": 17800 + }, + { + "epoch": 1.04555594692967, + "grad_norm": 1.7337652444839478, + "learning_rate": 4.664525020505582e-05, + "loss": 0.0258, + "step": 17810 + }, + { + "epoch": 1.0461430081014442, + "grad_norm": 0.08800406754016876, + "learning_rate": 4.6599112436539075e-05, + "loss": 0.012, + "step": 17820 + }, + { + "epoch": 1.0467300692732182, + "grad_norm": 1.18183434009552, + "learning_rate": 4.6552977577078035e-05, + "loss": 0.0123, + "step": 17830 + }, + { + "epoch": 1.0473171304449924, + "grad_norm": 0.5991613268852234, + "learning_rate": 4.6506845666135546e-05, + "loss": 0.0137, + "step": 17840 + }, + { + "epoch": 1.0479041916167664, + "grad_norm": 0.9787329435348511, + "learning_rate": 4.646071674317204e-05, + "loss": 0.0205, + "step": 17850 + }, + { + "epoch": 1.0484912527885406, + "grad_norm": 4.429852485656738, + "learning_rate": 4.6414590847645305e-05, + "loss": 0.0099, + "step": 17860 + }, + { + "epoch": 1.0490783139603146, + "grad_norm": 0.5103155970573425, + "learning_rate": 4.636846801901056e-05, + "loss": 0.0348, + "step": 17870 + }, + { + "epoch": 1.0496653751320888, + "grad_norm": 2.0082802772521973, + "learning_rate": 4.632234829672045e-05, + "loss": 0.0172, + "step": 17880 + }, + { + "epoch": 1.0502524363038628, + "grad_norm": 0.44900891184806824, + "learning_rate": 4.6276231720224885e-05, + "loss": 0.0139, + "step": 17890 + }, + { + "epoch": 
1.050839497475637, + "grad_norm": 2.634812831878662, + "learning_rate": 4.6230118328971156e-05, + "loss": 0.0254, + "step": 17900 + }, + { + "epoch": 1.051426558647411, + "grad_norm": 1.2553977966308594, + "learning_rate": 4.618400816240376e-05, + "loss": 0.0131, + "step": 17910 + }, + { + "epoch": 1.0520136198191852, + "grad_norm": 0.8278115391731262, + "learning_rate": 4.613790125996451e-05, + "loss": 0.023, + "step": 17920 + }, + { + "epoch": 1.0526006809909592, + "grad_norm": 0.04974241927266121, + "learning_rate": 4.609179766109236e-05, + "loss": 0.0121, + "step": 17930 + }, + { + "epoch": 1.0531877421627334, + "grad_norm": 0.170747309923172, + "learning_rate": 4.604569740522349e-05, + "loss": 0.0142, + "step": 17940 + }, + { + "epoch": 1.0537748033345073, + "grad_norm": 0.12599213421344757, + "learning_rate": 4.599960053179117e-05, + "loss": 0.0204, + "step": 17950 + }, + { + "epoch": 1.0543618645062816, + "grad_norm": 1.180310845375061, + "learning_rate": 4.595350708022583e-05, + "loss": 0.0297, + "step": 17960 + }, + { + "epoch": 1.0549489256780558, + "grad_norm": 0.9493981599807739, + "learning_rate": 4.5907417089954926e-05, + "loss": 0.0064, + "step": 17970 + }, + { + "epoch": 1.0555359868498297, + "grad_norm": 2.6346616744995117, + "learning_rate": 4.5861330600403e-05, + "loss": 0.0104, + "step": 17980 + }, + { + "epoch": 1.056123048021604, + "grad_norm": 1.2605183124542236, + "learning_rate": 4.581524765099154e-05, + "loss": 0.0128, + "step": 17990 + }, + { + "epoch": 1.056710109193378, + "grad_norm": 0.6190035939216614, + "learning_rate": 4.5769168281139066e-05, + "loss": 0.0227, + "step": 18000 + }, + { + "epoch": 1.056710109193378, + "eval_loss": 0.4979991912841797, + "eval_runtime": 269.5923, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 3.505, + "step": 18000 + }, + { + "epoch": 1.0572971703651521, + "grad_norm": 0.5389529466629028, + "learning_rate": 4.572309253026101e-05, + "loss": 0.0098, + "step": 18010 + }, + { + "epoch": 1.0578842315369261, + "grad_norm": 1.576887607574463, + "learning_rate": 4.56770204377697e-05, + "loss": 0.025, + "step": 18020 + }, + { + "epoch": 1.0584712927087003, + "grad_norm": 1.8304966688156128, + "learning_rate": 4.5630952043074356e-05, + "loss": 0.0161, + "step": 18030 + }, + { + "epoch": 1.0590583538804743, + "grad_norm": 0.27343544363975525, + "learning_rate": 4.5584887385581e-05, + "loss": 0.0142, + "step": 18040 + }, + { + "epoch": 1.0596454150522485, + "grad_norm": 2.4599697589874268, + "learning_rate": 4.5538826504692496e-05, + "loss": 0.0203, + "step": 18050 + }, + { + "epoch": 1.0602324762240225, + "grad_norm": 0.04068051651120186, + "learning_rate": 4.549276943980845e-05, + "loss": 0.0294, + "step": 18060 + }, + { + "epoch": 1.0608195373957967, + "grad_norm": 0.5054083466529846, + "learning_rate": 4.544671623032522e-05, + "loss": 0.008, + "step": 18070 + }, + { + "epoch": 1.0614065985675707, + "grad_norm": 0.6975878477096558, + "learning_rate": 4.540066691563587e-05, + "loss": 0.007, + "step": 18080 + }, + { + "epoch": 1.061993659739345, + "grad_norm": 0.965081512928009, + "learning_rate": 4.535462153513012e-05, + "loss": 0.0126, + "step": 18090 + }, + { + "epoch": 1.0625807209111189, + "grad_norm": 0.7705340385437012, + "learning_rate": 4.53085801281943e-05, + "loss": 0.0104, + "step": 18100 + }, + { + "epoch": 1.063167782082893, + "grad_norm": 0.8081434369087219, + "learning_rate": 4.526254273421143e-05, + "loss": 0.0131, + "step": 18110 + }, + { + "epoch": 1.063754843254667, + "grad_norm": 0.7814608812332153, + 
"learning_rate": 4.521650939256097e-05, + "loss": 0.0089, + "step": 18120 + }, + { + "epoch": 1.0643419044264413, + "grad_norm": 0.01808895170688629, + "learning_rate": 4.517048014261902e-05, + "loss": 0.0145, + "step": 18130 + }, + { + "epoch": 1.0649289655982153, + "grad_norm": 1.0433502197265625, + "learning_rate": 4.512445502375813e-05, + "loss": 0.0093, + "step": 18140 + }, + { + "epoch": 1.0655160267699895, + "grad_norm": 0.3834342658519745, + "learning_rate": 4.507843407534732e-05, + "loss": 0.0124, + "step": 18150 + }, + { + "epoch": 1.0661030879417634, + "grad_norm": 0.3785993158817291, + "learning_rate": 4.503241733675207e-05, + "loss": 0.0299, + "step": 18160 + }, + { + "epoch": 1.0666901491135377, + "grad_norm": 0.4766404330730438, + "learning_rate": 4.498640484733421e-05, + "loss": 0.0167, + "step": 18170 + }, + { + "epoch": 1.0672772102853116, + "grad_norm": 1.4658899307250977, + "learning_rate": 4.494039664645201e-05, + "loss": 0.0163, + "step": 18180 + }, + { + "epoch": 1.0678642714570858, + "grad_norm": 1.9271597862243652, + "learning_rate": 4.4894392773459957e-05, + "loss": 0.0112, + "step": 18190 + }, + { + "epoch": 1.0684513326288598, + "grad_norm": 0.38466793298721313, + "learning_rate": 4.4848393267708974e-05, + "loss": 0.0179, + "step": 18200 + }, + { + "epoch": 1.069038393800634, + "grad_norm": 1.8587846755981445, + "learning_rate": 4.480239816854613e-05, + "loss": 0.0275, + "step": 18210 + }, + { + "epoch": 1.069625454972408, + "grad_norm": 0.025200162082910538, + "learning_rate": 4.4756407515314804e-05, + "loss": 0.0221, + "step": 18220 + }, + { + "epoch": 1.0702125161441822, + "grad_norm": 0.8217595815658569, + "learning_rate": 4.471042134735451e-05, + "loss": 0.018, + "step": 18230 + }, + { + "epoch": 1.0707995773159564, + "grad_norm": 0.7045888304710388, + "learning_rate": 4.466443970400099e-05, + "loss": 0.0091, + "step": 18240 + }, + { + "epoch": 1.0713866384877304, + "grad_norm": 2.5180864334106445, + "learning_rate": 4.461846262458606e-05, + "loss": 0.0254, + "step": 18250 + }, + { + "epoch": 1.0719736996595046, + "grad_norm": 0.5709795951843262, + "learning_rate": 4.4572490148437686e-05, + "loss": 0.005, + "step": 18260 + }, + { + "epoch": 1.0725607608312786, + "grad_norm": 3.664907693862915, + "learning_rate": 4.452652231487982e-05, + "loss": 0.0124, + "step": 18270 + }, + { + "epoch": 1.0731478220030528, + "grad_norm": 0.7889499664306641, + "learning_rate": 4.448055916323249e-05, + "loss": 0.0208, + "step": 18280 + }, + { + "epoch": 1.0737348831748268, + "grad_norm": 2.400569438934326, + "learning_rate": 4.443460073281178e-05, + "loss": 0.0147, + "step": 18290 + }, + { + "epoch": 1.074321944346601, + "grad_norm": 0.731299102306366, + "learning_rate": 4.43886470629296e-05, + "loss": 0.0149, + "step": 18300 + }, + { + "epoch": 1.074909005518375, + "grad_norm": 0.11451888084411621, + "learning_rate": 4.4342698192893904e-05, + "loss": 0.0157, + "step": 18310 + }, + { + "epoch": 1.0754960666901492, + "grad_norm": 2.5761590003967285, + "learning_rate": 4.429675416200848e-05, + "loss": 0.0304, + "step": 18320 + }, + { + "epoch": 1.0760831278619232, + "grad_norm": 1.9438437223434448, + "learning_rate": 4.4250815009573e-05, + "loss": 0.0141, + "step": 18330 + }, + { + "epoch": 1.0766701890336974, + "grad_norm": 0.7531579732894897, + "learning_rate": 4.420488077488295e-05, + "loss": 0.0378, + "step": 18340 + }, + { + "epoch": 1.0772572502054714, + "grad_norm": 2.7101943492889404, + "learning_rate": 4.415895149722964e-05, + "loss": 0.0142, + "step": 18350 + }, + { 
+ "epoch": 1.0778443113772456, + "grad_norm": 0.9626311659812927, + "learning_rate": 4.411302721590007e-05, + "loss": 0.0174, + "step": 18360 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.23843607306480408, + "learning_rate": 4.406710797017706e-05, + "loss": 0.0104, + "step": 18370 + }, + { + "epoch": 1.0790184337207938, + "grad_norm": 1.6074140071868896, + "learning_rate": 4.402119379933904e-05, + "loss": 0.0164, + "step": 18380 + }, + { + "epoch": 1.0796054948925677, + "grad_norm": 0.29586324095726013, + "learning_rate": 4.3975284742660153e-05, + "loss": 0.0195, + "step": 18390 + }, + { + "epoch": 1.080192556064342, + "grad_norm": 1.8798375129699707, + "learning_rate": 4.392938083941014e-05, + "loss": 0.0043, + "step": 18400 + }, + { + "epoch": 1.080779617236116, + "grad_norm": 0.8807462453842163, + "learning_rate": 4.388348212885435e-05, + "loss": 0.0142, + "step": 18410 + }, + { + "epoch": 1.0813666784078901, + "grad_norm": 3.3922951221466064, + "learning_rate": 4.383758865025368e-05, + "loss": 0.0229, + "step": 18420 + }, + { + "epoch": 1.0819537395796641, + "grad_norm": 0.4539323151111603, + "learning_rate": 4.379170044286454e-05, + "loss": 0.0177, + "step": 18430 + }, + { + "epoch": 1.0825408007514383, + "grad_norm": 1.919954776763916, + "learning_rate": 4.3745817545938874e-05, + "loss": 0.0231, + "step": 18440 + }, + { + "epoch": 1.0831278619232123, + "grad_norm": 3.1497795581817627, + "learning_rate": 4.369993999872402e-05, + "loss": 0.024, + "step": 18450 + }, + { + "epoch": 1.0837149230949865, + "grad_norm": 0.14825113117694855, + "learning_rate": 4.365406784046282e-05, + "loss": 0.0156, + "step": 18460 + }, + { + "epoch": 1.0843019842667605, + "grad_norm": 1.4273031949996948, + "learning_rate": 4.360820111039341e-05, + "loss": 0.0109, + "step": 18470 + }, + { + "epoch": 1.0848890454385347, + "grad_norm": 0.04318694397807121, + "learning_rate": 4.3562339847749376e-05, + "loss": 0.0104, + "step": 18480 + }, + { + "epoch": 1.085476106610309, + "grad_norm": 0.4969497621059418, + "learning_rate": 4.3516484091759545e-05, + "loss": 0.0149, + "step": 18490 + }, + { + "epoch": 1.086063167782083, + "grad_norm": 2.145045042037964, + "learning_rate": 4.347063388164812e-05, + "loss": 0.0277, + "step": 18500 + }, + { + "epoch": 1.0866502289538569, + "grad_norm": 0.2778224050998688, + "learning_rate": 4.342478925663447e-05, + "loss": 0.0181, + "step": 18510 + }, + { + "epoch": 1.087237290125631, + "grad_norm": 4.340620994567871, + "learning_rate": 4.3378950255933284e-05, + "loss": 0.0125, + "step": 18520 + }, + { + "epoch": 1.0878243512974053, + "grad_norm": 3.725788116455078, + "learning_rate": 4.333311691875433e-05, + "loss": 0.0214, + "step": 18530 + }, + { + "epoch": 1.0884114124691793, + "grad_norm": 1.5069423913955688, + "learning_rate": 4.3287289284302615e-05, + "loss": 0.0317, + "step": 18540 + }, + { + "epoch": 1.0889984736409535, + "grad_norm": 0.7962707281112671, + "learning_rate": 4.324146739177824e-05, + "loss": 0.0153, + "step": 18550 + }, + { + "epoch": 1.0895855348127275, + "grad_norm": 0.17169342935085297, + "learning_rate": 4.319565128037639e-05, + "loss": 0.0155, + "step": 18560 + }, + { + "epoch": 1.0901725959845017, + "grad_norm": 1.0419448614120483, + "learning_rate": 4.314984098928733e-05, + "loss": 0.0094, + "step": 18570 + }, + { + "epoch": 1.0907596571562757, + "grad_norm": 0.0351492241024971, + "learning_rate": 4.3104036557696295e-05, + "loss": 0.0098, + "step": 18580 + }, + { + "epoch": 1.0913467183280499, + "grad_norm": 1.296635389328003, + 
"learning_rate": 4.305823802478357e-05, + "loss": 0.022, + "step": 18590 + }, + { + "epoch": 1.0919337794998238, + "grad_norm": 0.0528903566300869, + "learning_rate": 4.301244542972435e-05, + "loss": 0.0227, + "step": 18600 + }, + { + "epoch": 1.092520840671598, + "grad_norm": 1.5245438814163208, + "learning_rate": 4.2966658811688785e-05, + "loss": 0.0143, + "step": 18610 + }, + { + "epoch": 1.093107901843372, + "grad_norm": 0.5972466468811035, + "learning_rate": 4.292087820984185e-05, + "loss": 0.0081, + "step": 18620 + }, + { + "epoch": 1.0936949630151462, + "grad_norm": 1.2663933038711548, + "learning_rate": 4.287510366334346e-05, + "loss": 0.0132, + "step": 18630 + }, + { + "epoch": 1.0942820241869202, + "grad_norm": 1.925042986869812, + "learning_rate": 4.282933521134827e-05, + "loss": 0.018, + "step": 18640 + }, + { + "epoch": 1.0948690853586944, + "grad_norm": 0.5491015315055847, + "learning_rate": 4.2783572893005794e-05, + "loss": 0.021, + "step": 18650 + }, + { + "epoch": 1.0954561465304684, + "grad_norm": 0.7465754747390747, + "learning_rate": 4.273781674746023e-05, + "loss": 0.013, + "step": 18660 + }, + { + "epoch": 1.0960432077022426, + "grad_norm": 0.1883556991815567, + "learning_rate": 4.269206681385058e-05, + "loss": 0.0157, + "step": 18670 + }, + { + "epoch": 1.0966302688740166, + "grad_norm": 1.0537993907928467, + "learning_rate": 4.264632313131041e-05, + "loss": 0.0119, + "step": 18680 + }, + { + "epoch": 1.0972173300457908, + "grad_norm": 1.8062794208526611, + "learning_rate": 4.260058573896809e-05, + "loss": 0.0274, + "step": 18690 + }, + { + "epoch": 1.0978043912175648, + "grad_norm": 0.9525005221366882, + "learning_rate": 4.255485467594647e-05, + "loss": 0.0079, + "step": 18700 + }, + { + "epoch": 1.098391452389339, + "grad_norm": 4.0878801345825195, + "learning_rate": 4.250912998136307e-05, + "loss": 0.008, + "step": 18710 + }, + { + "epoch": 1.098978513561113, + "grad_norm": 1.1421128511428833, + "learning_rate": 4.246341169432994e-05, + "loss": 0.0169, + "step": 18720 + }, + { + "epoch": 1.0995655747328872, + "grad_norm": 0.7710453867912292, + "learning_rate": 4.241769985395365e-05, + "loss": 0.03, + "step": 18730 + }, + { + "epoch": 1.1001526359046612, + "grad_norm": 1.7933598756790161, + "learning_rate": 4.2371994499335264e-05, + "loss": 0.0208, + "step": 18740 + }, + { + "epoch": 1.1007396970764354, + "grad_norm": 2.1244444847106934, + "learning_rate": 4.232629566957026e-05, + "loss": 0.0295, + "step": 18750 + }, + { + "epoch": 1.1013267582482094, + "grad_norm": 1.4720721244812012, + "learning_rate": 4.2280603403748606e-05, + "loss": 0.0263, + "step": 18760 + }, + { + "epoch": 1.1019138194199836, + "grad_norm": 2.264559507369995, + "learning_rate": 4.223491774095455e-05, + "loss": 0.0185, + "step": 18770 + }, + { + "epoch": 1.1025008805917578, + "grad_norm": 0.14515529572963715, + "learning_rate": 4.2189238720266826e-05, + "loss": 0.0103, + "step": 18780 + }, + { + "epoch": 1.1030879417635318, + "grad_norm": 0.01907486841082573, + "learning_rate": 4.214356638075836e-05, + "loss": 0.006, + "step": 18790 + }, + { + "epoch": 1.103675002935306, + "grad_norm": 0.2551787197589874, + "learning_rate": 4.2097900761496445e-05, + "loss": 0.0114, + "step": 18800 + }, + { + "epoch": 1.10426206410708, + "grad_norm": 0.6972945332527161, + "learning_rate": 4.2052241901542576e-05, + "loss": 0.0263, + "step": 18810 + }, + { + "epoch": 1.1048491252788541, + "grad_norm": 0.8127758502960205, + "learning_rate": 4.2006589839952526e-05, + "loss": 0.0047, + "step": 18820 + }, + { + 
"epoch": 1.1054361864506281, + "grad_norm": 0.3596905767917633, + "learning_rate": 4.1960944615776175e-05, + "loss": 0.0083, + "step": 18830 + }, + { + "epoch": 1.1060232476224023, + "grad_norm": 0.02701782062649727, + "learning_rate": 4.191530626805762e-05, + "loss": 0.0136, + "step": 18840 + }, + { + "epoch": 1.1066103087941763, + "grad_norm": 1.7618811130523682, + "learning_rate": 4.186967483583505e-05, + "loss": 0.0181, + "step": 18850 + }, + { + "epoch": 1.1071973699659505, + "grad_norm": 1.2107560634613037, + "learning_rate": 4.1824050358140724e-05, + "loss": 0.0091, + "step": 18860 + }, + { + "epoch": 1.1077844311377245, + "grad_norm": 0.12104026973247528, + "learning_rate": 4.1778432874001006e-05, + "loss": 0.0155, + "step": 18870 + }, + { + "epoch": 1.1083714923094987, + "grad_norm": 0.543029248714447, + "learning_rate": 4.173282242243618e-05, + "loss": 0.0237, + "step": 18880 + }, + { + "epoch": 1.1089585534812727, + "grad_norm": 0.8624221086502075, + "learning_rate": 4.168721904246063e-05, + "loss": 0.0101, + "step": 18890 + }, + { + "epoch": 1.109545614653047, + "grad_norm": 2.7473998069763184, + "learning_rate": 4.164162277308259e-05, + "loss": 0.0217, + "step": 18900 + }, + { + "epoch": 1.110132675824821, + "grad_norm": 2.060375928878784, + "learning_rate": 4.15960336533043e-05, + "loss": 0.0106, + "step": 18910 + }, + { + "epoch": 1.110719736996595, + "grad_norm": 0.04229264706373215, + "learning_rate": 4.1550451722121806e-05, + "loss": 0.013, + "step": 18920 + }, + { + "epoch": 1.111306798168369, + "grad_norm": 1.526538372039795, + "learning_rate": 4.1504877018525065e-05, + "loss": 0.0388, + "step": 18930 + }, + { + "epoch": 1.1118938593401433, + "grad_norm": 1.4976556301116943, + "learning_rate": 4.14593095814978e-05, + "loss": 0.0176, + "step": 18940 + }, + { + "epoch": 1.1124809205119173, + "grad_norm": 0.9924315214157104, + "learning_rate": 4.141374945001758e-05, + "loss": 0.0298, + "step": 18950 + }, + { + "epoch": 1.1130679816836915, + "grad_norm": 0.9775465130805969, + "learning_rate": 4.136819666305566e-05, + "loss": 0.0219, + "step": 18960 + }, + { + "epoch": 1.1136550428554655, + "grad_norm": 5.021305084228516, + "learning_rate": 4.1322651259577064e-05, + "loss": 0.0167, + "step": 18970 + }, + { + "epoch": 1.1142421040272397, + "grad_norm": 1.727529525756836, + "learning_rate": 4.1277113278540456e-05, + "loss": 0.0064, + "step": 18980 + }, + { + "epoch": 1.1148291651990136, + "grad_norm": 1.548048973083496, + "learning_rate": 4.123158275889819e-05, + "loss": 0.0134, + "step": 18990 + }, + { + "epoch": 1.1154162263707879, + "grad_norm": 3.1047353744506836, + "learning_rate": 4.118605973959623e-05, + "loss": 0.0128, + "step": 19000 + }, + { + "epoch": 1.1160032875425618, + "grad_norm": 0.6265288591384888, + "learning_rate": 4.11405442595741e-05, + "loss": 0.0108, + "step": 19010 + }, + { + "epoch": 1.116590348714336, + "grad_norm": 1.5217883586883545, + "learning_rate": 4.1095036357764915e-05, + "loss": 0.0109, + "step": 19020 + }, + { + "epoch": 1.11717740988611, + "grad_norm": 0.46668195724487305, + "learning_rate": 4.104953607309524e-05, + "loss": 0.0198, + "step": 19030 + }, + { + "epoch": 1.1177644710578842, + "grad_norm": 1.9134923219680786, + "learning_rate": 4.100404344448522e-05, + "loss": 0.0127, + "step": 19040 + }, + { + "epoch": 1.1183515322296582, + "grad_norm": 0.603248119354248, + "learning_rate": 4.095855851084836e-05, + "loss": 0.0154, + "step": 19050 + }, + { + "epoch": 1.1189385934014324, + "grad_norm": 1.7319767475128174, + "learning_rate": 
4.091308131109165e-05, + "loss": 0.0137, + "step": 19060 + }, + { + "epoch": 1.1195256545732066, + "grad_norm": 0.5494726896286011, + "learning_rate": 4.086761188411541e-05, + "loss": 0.0087, + "step": 19070 + }, + { + "epoch": 1.1201127157449806, + "grad_norm": 0.5562161207199097, + "learning_rate": 4.082215026881337e-05, + "loss": 0.0085, + "step": 19080 + }, + { + "epoch": 1.1206997769167548, + "grad_norm": 1.4518531560897827, + "learning_rate": 4.0776696504072506e-05, + "loss": 0.0089, + "step": 19090 + }, + { + "epoch": 1.1212868380885288, + "grad_norm": 0.7333649396896362, + "learning_rate": 4.073125062877317e-05, + "loss": 0.014, + "step": 19100 + }, + { + "epoch": 1.121873899260303, + "grad_norm": 0.05590398982167244, + "learning_rate": 4.068581268178886e-05, + "loss": 0.0162, + "step": 19110 + }, + { + "epoch": 1.122460960432077, + "grad_norm": 0.5153812170028687, + "learning_rate": 4.064038270198638e-05, + "loss": 0.0231, + "step": 19120 + }, + { + "epoch": 1.1230480216038512, + "grad_norm": 0.26842015981674194, + "learning_rate": 4.05949607282257e-05, + "loss": 0.0228, + "step": 19130 + }, + { + "epoch": 1.1236350827756252, + "grad_norm": 0.20137840509414673, + "learning_rate": 4.054954679935988e-05, + "loss": 0.0166, + "step": 19140 + }, + { + "epoch": 1.1242221439473994, + "grad_norm": 0.9210729002952576, + "learning_rate": 4.05041409542352e-05, + "loss": 0.0129, + "step": 19150 + }, + { + "epoch": 1.1248092051191734, + "grad_norm": 0.48231643438339233, + "learning_rate": 4.0458743231690925e-05, + "loss": 0.0107, + "step": 19160 + }, + { + "epoch": 1.1253962662909476, + "grad_norm": 4.762882232666016, + "learning_rate": 4.041335367055945e-05, + "loss": 0.0263, + "step": 19170 + }, + { + "epoch": 1.1259833274627216, + "grad_norm": 3.452972650527954, + "learning_rate": 4.0367972309666145e-05, + "loss": 0.0136, + "step": 19180 + }, + { + "epoch": 1.1265703886344958, + "grad_norm": 0.3142324388027191, + "learning_rate": 4.03225991878294e-05, + "loss": 0.0109, + "step": 19190 + }, + { + "epoch": 1.1271574498062698, + "grad_norm": 0.8441796898841858, + "learning_rate": 4.027723434386049e-05, + "loss": 0.0143, + "step": 19200 + }, + { + "epoch": 1.127744510978044, + "grad_norm": 2.2673232555389404, + "learning_rate": 4.0231877816563695e-05, + "loss": 0.0229, + "step": 19210 + }, + { + "epoch": 1.128331572149818, + "grad_norm": 0.729567289352417, + "learning_rate": 4.0186529644736114e-05, + "loss": 0.015, + "step": 19220 + }, + { + "epoch": 1.1289186333215921, + "grad_norm": 0.5330530405044556, + "learning_rate": 4.014118986716776e-05, + "loss": 0.0157, + "step": 19230 + }, + { + "epoch": 1.1295056944933661, + "grad_norm": 0.33814314007759094, + "learning_rate": 4.0095858522641394e-05, + "loss": 0.0214, + "step": 19240 + }, + { + "epoch": 1.1300927556651403, + "grad_norm": 2.5041515827178955, + "learning_rate": 4.005053564993261e-05, + "loss": 0.0258, + "step": 19250 + }, + { + "epoch": 1.1306798168369143, + "grad_norm": 0.012002293951809406, + "learning_rate": 4.000522128780978e-05, + "loss": 0.0186, + "step": 19260 + }, + { + "epoch": 1.1312668780086885, + "grad_norm": 0.7148932218551636, + "learning_rate": 3.995991547503392e-05, + "loss": 0.0102, + "step": 19270 + }, + { + "epoch": 1.1318539391804625, + "grad_norm": 2.050722122192383, + "learning_rate": 3.991461825035882e-05, + "loss": 0.0107, + "step": 19280 + }, + { + "epoch": 1.1324410003522367, + "grad_norm": 0.2507849633693695, + "learning_rate": 3.986932965253081e-05, + "loss": 0.0095, + "step": 19290 + }, + { + "epoch": 
1.1330280615240107, + "grad_norm": 1.851148009300232, + "learning_rate": 3.9824049720289e-05, + "loss": 0.0164, + "step": 19300 + }, + { + "epoch": 1.133615122695785, + "grad_norm": 1.9028648138046265, + "learning_rate": 3.9778778492364924e-05, + "loss": 0.0159, + "step": 19310 + }, + { + "epoch": 1.134202183867559, + "grad_norm": 1.4246652126312256, + "learning_rate": 3.973351600748278e-05, + "loss": 0.0145, + "step": 19320 + }, + { + "epoch": 1.134789245039333, + "grad_norm": 0.08715999126434326, + "learning_rate": 3.968826230435923e-05, + "loss": 0.0141, + "step": 19330 + }, + { + "epoch": 1.135376306211107, + "grad_norm": 2.2939882278442383, + "learning_rate": 3.964301742170349e-05, + "loss": 0.0158, + "step": 19340 + }, + { + "epoch": 1.1359633673828813, + "grad_norm": 0.3845359683036804, + "learning_rate": 3.9597781398217135e-05, + "loss": 0.0194, + "step": 19350 + }, + { + "epoch": 1.1365504285546555, + "grad_norm": 0.37646591663360596, + "learning_rate": 3.9552554272594256e-05, + "loss": 0.0114, + "step": 19360 + }, + { + "epoch": 1.1371374897264295, + "grad_norm": 1.6800992488861084, + "learning_rate": 3.9507336083521256e-05, + "loss": 0.0188, + "step": 19370 + }, + { + "epoch": 1.1377245508982037, + "grad_norm": 2.078392267227173, + "learning_rate": 3.946212686967696e-05, + "loss": 0.0309, + "step": 19380 + }, + { + "epoch": 1.1383116120699777, + "grad_norm": 4.602158546447754, + "learning_rate": 3.9416926669732454e-05, + "loss": 0.0238, + "step": 19390 + }, + { + "epoch": 1.1388986732417519, + "grad_norm": 2.036029577255249, + "learning_rate": 3.937173552235117e-05, + "loss": 0.0252, + "step": 19400 + }, + { + "epoch": 1.1394857344135259, + "grad_norm": 1.749767541885376, + "learning_rate": 3.932655346618876e-05, + "loss": 0.013, + "step": 19410 + }, + { + "epoch": 1.1400727955853, + "grad_norm": 1.1671046018600464, + "learning_rate": 3.9281380539893114e-05, + "loss": 0.031, + "step": 19420 + }, + { + "epoch": 1.140659856757074, + "grad_norm": 0.6031616926193237, + "learning_rate": 3.923621678210432e-05, + "loss": 0.0217, + "step": 19430 + }, + { + "epoch": 1.1412469179288482, + "grad_norm": 2.466723680496216, + "learning_rate": 3.9191062231454586e-05, + "loss": 0.0246, + "step": 19440 + }, + { + "epoch": 1.1418339791006222, + "grad_norm": 1.7094677686691284, + "learning_rate": 3.914591692656831e-05, + "loss": 0.0166, + "step": 19450 + }, + { + "epoch": 1.1424210402723964, + "grad_norm": 7.691342830657959, + "learning_rate": 3.9100780906061896e-05, + "loss": 0.0207, + "step": 19460 + }, + { + "epoch": 1.1430081014441704, + "grad_norm": 2.8758277893066406, + "learning_rate": 3.905565420854388e-05, + "loss": 0.0282, + "step": 19470 + }, + { + "epoch": 1.1435951626159446, + "grad_norm": 1.252706527709961, + "learning_rate": 3.901053687261479e-05, + "loss": 0.0086, + "step": 19480 + }, + { + "epoch": 1.1441822237877186, + "grad_norm": 0.2504454553127289, + "learning_rate": 3.896542893686716e-05, + "loss": 0.0151, + "step": 19490 + }, + { + "epoch": 1.1447692849594928, + "grad_norm": 1.5650849342346191, + "learning_rate": 3.892033043988547e-05, + "loss": 0.0175, + "step": 19500 + }, + { + "epoch": 1.1453563461312668, + "grad_norm": 0.6675151586532593, + "learning_rate": 3.887524142024614e-05, + "loss": 0.0259, + "step": 19510 + }, + { + "epoch": 1.145943407303041, + "grad_norm": 0.17080461978912354, + "learning_rate": 3.883016191651744e-05, + "loss": 0.0042, + "step": 19520 + }, + { + "epoch": 1.146530468474815, + "grad_norm": 2.4723546504974365, + "learning_rate": 
3.878509196725957e-05, + "loss": 0.0171, + "step": 19530 + }, + { + "epoch": 1.1471175296465892, + "grad_norm": 0.2739218473434448, + "learning_rate": 3.874003161102453e-05, + "loss": 0.0092, + "step": 19540 + }, + { + "epoch": 1.1477045908183632, + "grad_norm": 0.8519219756126404, + "learning_rate": 3.869498088635608e-05, + "loss": 0.0179, + "step": 19550 + }, + { + "epoch": 1.1482916519901374, + "grad_norm": 1.4947580099105835, + "learning_rate": 3.864993983178978e-05, + "loss": 0.0196, + "step": 19560 + }, + { + "epoch": 1.1488787131619116, + "grad_norm": 0.4917875826358795, + "learning_rate": 3.860490848585291e-05, + "loss": 0.0158, + "step": 19570 + }, + { + "epoch": 1.1494657743336856, + "grad_norm": 3.007902145385742, + "learning_rate": 3.8559886887064434e-05, + "loss": 0.0156, + "step": 19580 + }, + { + "epoch": 1.1500528355054596, + "grad_norm": 0.5176740288734436, + "learning_rate": 3.851487507393498e-05, + "loss": 0.017, + "step": 19590 + }, + { + "epoch": 1.1506398966772338, + "grad_norm": 0.7068972587585449, + "learning_rate": 3.846987308496686e-05, + "loss": 0.0112, + "step": 19600 + }, + { + "epoch": 1.151226957849008, + "grad_norm": 0.1920938789844513, + "learning_rate": 3.8424880958653855e-05, + "loss": 0.0118, + "step": 19610 + }, + { + "epoch": 1.151814019020782, + "grad_norm": 1.1475507020950317, + "learning_rate": 3.8379898733481455e-05, + "loss": 0.0177, + "step": 19620 + }, + { + "epoch": 1.1524010801925562, + "grad_norm": 0.03571141138672829, + "learning_rate": 3.8334926447926576e-05, + "loss": 0.0098, + "step": 19630 + }, + { + "epoch": 1.1529881413643301, + "grad_norm": 0.08643414825201035, + "learning_rate": 3.82899641404577e-05, + "loss": 0.004, + "step": 19640 + }, + { + "epoch": 1.1535752025361043, + "grad_norm": 0.038801468908786774, + "learning_rate": 3.8245011849534724e-05, + "loss": 0.0373, + "step": 19650 + }, + { + "epoch": 1.1541622637078783, + "grad_norm": 0.48830196261405945, + "learning_rate": 3.820006961360901e-05, + "loss": 0.0254, + "step": 19660 + }, + { + "epoch": 1.1547493248796525, + "grad_norm": 2.7519326210021973, + "learning_rate": 3.8155137471123294e-05, + "loss": 0.0173, + "step": 19670 + }, + { + "epoch": 1.1553363860514265, + "grad_norm": 0.008496723137795925, + "learning_rate": 3.8110215460511696e-05, + "loss": 0.01, + "step": 19680 + }, + { + "epoch": 1.1559234472232007, + "grad_norm": 2.1455023288726807, + "learning_rate": 3.806530362019969e-05, + "loss": 0.0092, + "step": 19690 + }, + { + "epoch": 1.1565105083949747, + "grad_norm": 1.0149089097976685, + "learning_rate": 3.802040198860397e-05, + "loss": 0.0043, + "step": 19700 + }, + { + "epoch": 1.157097569566749, + "grad_norm": 3.4099552631378174, + "learning_rate": 3.7975510604132626e-05, + "loss": 0.0148, + "step": 19710 + }, + { + "epoch": 1.157684630738523, + "grad_norm": 0.7367317080497742, + "learning_rate": 3.793062950518484e-05, + "loss": 0.0236, + "step": 19720 + }, + { + "epoch": 1.158271691910297, + "grad_norm": 0.44443872570991516, + "learning_rate": 3.788575873015111e-05, + "loss": 0.0103, + "step": 19730 + }, + { + "epoch": 1.158858753082071, + "grad_norm": 2.0920252799987793, + "learning_rate": 3.7840898317413034e-05, + "loss": 0.0193, + "step": 19740 + }, + { + "epoch": 1.1594458142538453, + "grad_norm": 1.7599064111709595, + "learning_rate": 3.7796048305343383e-05, + "loss": 0.035, + "step": 19750 + }, + { + "epoch": 1.1600328754256193, + "grad_norm": 0.644555926322937, + "learning_rate": 3.7751208732306015e-05, + "loss": 0.0139, + "step": 19760 + }, + { + "epoch": 
1.1606199365973935, + "grad_norm": 0.03693939745426178, + "learning_rate": 3.770637963665589e-05, + "loss": 0.0046, + "step": 19770 + }, + { + "epoch": 1.1612069977691675, + "grad_norm": 0.42754417657852173, + "learning_rate": 3.766156105673891e-05, + "loss": 0.0121, + "step": 19780 + }, + { + "epoch": 1.1617940589409417, + "grad_norm": 0.2325069159269333, + "learning_rate": 3.761675303089213e-05, + "loss": 0.0165, + "step": 19790 + }, + { + "epoch": 1.1623811201127157, + "grad_norm": 2.635518789291382, + "learning_rate": 3.757195559744345e-05, + "loss": 0.0196, + "step": 19800 + }, + { + "epoch": 1.1629681812844899, + "grad_norm": 0.5041676759719849, + "learning_rate": 3.7527168794711764e-05, + "loss": 0.0189, + "step": 19810 + }, + { + "epoch": 1.1635552424562638, + "grad_norm": 2.850172281265259, + "learning_rate": 3.748239266100689e-05, + "loss": 0.0192, + "step": 19820 + }, + { + "epoch": 1.164142303628038, + "grad_norm": 0.2840389907360077, + "learning_rate": 3.7437627234629464e-05, + "loss": 0.0224, + "step": 19830 + }, + { + "epoch": 1.164729364799812, + "grad_norm": 0.43667224049568176, + "learning_rate": 3.7392872553871025e-05, + "loss": 0.0141, + "step": 19840 + }, + { + "epoch": 1.1653164259715862, + "grad_norm": 2.1170408725738525, + "learning_rate": 3.7348128657013864e-05, + "loss": 0.0112, + "step": 19850 + }, + { + "epoch": 1.1659034871433605, + "grad_norm": 2.672410249710083, + "learning_rate": 3.730339558233111e-05, + "loss": 0.01, + "step": 19860 + }, + { + "epoch": 1.1664905483151344, + "grad_norm": 2.02648663520813, + "learning_rate": 3.7258673368086545e-05, + "loss": 0.0269, + "step": 19870 + }, + { + "epoch": 1.1670776094869084, + "grad_norm": 1.8439985513687134, + "learning_rate": 3.721396205253478e-05, + "loss": 0.0119, + "step": 19880 + }, + { + "epoch": 1.1676646706586826, + "grad_norm": 0.4652295410633087, + "learning_rate": 3.716926167392098e-05, + "loss": 0.0123, + "step": 19890 + }, + { + "epoch": 1.1682517318304568, + "grad_norm": 2.3301098346710205, + "learning_rate": 3.7124572270481056e-05, + "loss": 0.0155, + "step": 19900 + }, + { + "epoch": 1.1688387930022308, + "grad_norm": 4.59347677230835, + "learning_rate": 3.707989388044146e-05, + "loss": 0.0175, + "step": 19910 + }, + { + "epoch": 1.169425854174005, + "grad_norm": 0.5529350638389587, + "learning_rate": 3.7035226542019275e-05, + "loss": 0.0174, + "step": 19920 + }, + { + "epoch": 1.170012915345779, + "grad_norm": 2.0525143146514893, + "learning_rate": 3.699057029342209e-05, + "loss": 0.0401, + "step": 19930 + }, + { + "epoch": 1.1705999765175532, + "grad_norm": 0.05721009150147438, + "learning_rate": 3.6945925172848054e-05, + "loss": 0.0107, + "step": 19940 + }, + { + "epoch": 1.1711870376893272, + "grad_norm": 1.8482041358947754, + "learning_rate": 3.6901291218485725e-05, + "loss": 0.0263, + "step": 19950 + }, + { + "epoch": 1.1717740988611014, + "grad_norm": 2.0639731884002686, + "learning_rate": 3.685666846851417e-05, + "loss": 0.0158, + "step": 19960 + }, + { + "epoch": 1.1723611600328754, + "grad_norm": 1.4494773149490356, + "learning_rate": 3.6812056961102894e-05, + "loss": 0.0161, + "step": 19970 + }, + { + "epoch": 1.1729482212046496, + "grad_norm": 0.05280623957514763, + "learning_rate": 3.67674567344117e-05, + "loss": 0.0091, + "step": 19980 + }, + { + "epoch": 1.1735352823764236, + "grad_norm": 0.8740899562835693, + "learning_rate": 3.672286782659081e-05, + "loss": 0.0123, + "step": 19990 + }, + { + "epoch": 1.1741223435481978, + "grad_norm": 1.8102203607559204, + "learning_rate": 
3.6678290275780724e-05, + "loss": 0.0191, + "step": 20000 + }, + { + "epoch": 1.1747094047199718, + "grad_norm": 1.0259355306625366, + "learning_rate": 3.6633724120112274e-05, + "loss": 0.0257, + "step": 20010 + }, + { + "epoch": 1.175296465891746, + "grad_norm": 1.0387524366378784, + "learning_rate": 3.658916939770649e-05, + "loss": 0.019, + "step": 20020 + }, + { + "epoch": 1.17588352706352, + "grad_norm": 0.5713001489639282, + "learning_rate": 3.6544626146674685e-05, + "loss": 0.0115, + "step": 20030 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.6265963912010193, + "learning_rate": 3.650009440511828e-05, + "loss": 0.0149, + "step": 20040 + }, + { + "epoch": 1.1770576494070681, + "grad_norm": 0.13561226427555084, + "learning_rate": 3.645557421112893e-05, + "loss": 0.0274, + "step": 20050 + }, + { + "epoch": 1.1776447105788423, + "grad_norm": 2.2962963581085205, + "learning_rate": 3.641106560278834e-05, + "loss": 0.0159, + "step": 20060 + }, + { + "epoch": 1.1782317717506163, + "grad_norm": 2.1667020320892334, + "learning_rate": 3.636656861816838e-05, + "loss": 0.0137, + "step": 20070 + }, + { + "epoch": 1.1788188329223905, + "grad_norm": 2.4629743099212646, + "learning_rate": 3.632208329533092e-05, + "loss": 0.0284, + "step": 20080 + }, + { + "epoch": 1.1794058940941645, + "grad_norm": 1.6178267002105713, + "learning_rate": 3.627760967232788e-05, + "loss": 0.0083, + "step": 20090 + }, + { + "epoch": 1.1799929552659387, + "grad_norm": 1.4193296432495117, + "learning_rate": 3.6233147787201175e-05, + "loss": 0.0068, + "step": 20100 + }, + { + "epoch": 1.180580016437713, + "grad_norm": 0.6253151893615723, + "learning_rate": 3.618869767798263e-05, + "loss": 0.0141, + "step": 20110 + }, + { + "epoch": 1.181167077609487, + "grad_norm": 3.456434726715088, + "learning_rate": 3.6144259382694114e-05, + "loss": 0.0264, + "step": 20120 + }, + { + "epoch": 1.181754138781261, + "grad_norm": 0.8652618527412415, + "learning_rate": 3.6099832939347237e-05, + "loss": 0.0113, + "step": 20130 + }, + { + "epoch": 1.182341199953035, + "grad_norm": 2.624691963195801, + "learning_rate": 3.605541838594359e-05, + "loss": 0.0165, + "step": 20140 + }, + { + "epoch": 1.1829282611248093, + "grad_norm": 0.5651686191558838, + "learning_rate": 3.6011015760474534e-05, + "loss": 0.003, + "step": 20150 + }, + { + "epoch": 1.1835153222965833, + "grad_norm": 0.3492438793182373, + "learning_rate": 3.596662510092126e-05, + "loss": 0.0137, + "step": 20160 + }, + { + "epoch": 1.1841023834683573, + "grad_norm": 1.9067304134368896, + "learning_rate": 3.5922246445254706e-05, + "loss": 0.0134, + "step": 20170 + }, + { + "epoch": 1.1846894446401315, + "grad_norm": 0.31494903564453125, + "learning_rate": 3.587787983143554e-05, + "loss": 0.0054, + "step": 20180 + }, + { + "epoch": 1.1852765058119057, + "grad_norm": 0.13532504439353943, + "learning_rate": 3.583352529741413e-05, + "loss": 0.0109, + "step": 20190 + }, + { + "epoch": 1.1858635669836797, + "grad_norm": 1.6830731630325317, + "learning_rate": 3.578918288113055e-05, + "loss": 0.014, + "step": 20200 + }, + { + "epoch": 1.1864506281554539, + "grad_norm": 0.5527680516242981, + "learning_rate": 3.5744852620514415e-05, + "loss": 0.0125, + "step": 20210 + }, + { + "epoch": 1.1870376893272279, + "grad_norm": 1.511231541633606, + "learning_rate": 3.570053455348502e-05, + "loss": 0.0137, + "step": 20220 + }, + { + "epoch": 1.187624750499002, + "grad_norm": 0.7616772055625916, + "learning_rate": 3.565622871795127e-05, + "loss": 0.0256, + "step": 20230 + }, + { + "epoch": 
1.188211811670776, + "grad_norm": 2.778029203414917, + "learning_rate": 3.561193515181147e-05, + "loss": 0.0195, + "step": 20240 + }, + { + "epoch": 1.1887988728425503, + "grad_norm": 2.1403579711914062, + "learning_rate": 3.5567653892953564e-05, + "loss": 0.0198, + "step": 20250 + }, + { + "epoch": 1.1893859340143242, + "grad_norm": 0.7490493655204773, + "learning_rate": 3.552338497925488e-05, + "loss": 0.0281, + "step": 20260 + }, + { + "epoch": 1.1899729951860984, + "grad_norm": 1.7620073556900024, + "learning_rate": 3.5479128448582246e-05, + "loss": 0.0113, + "step": 20270 + }, + { + "epoch": 1.1905600563578724, + "grad_norm": 2.1982274055480957, + "learning_rate": 3.543488433879184e-05, + "loss": 0.018, + "step": 20280 + }, + { + "epoch": 1.1911471175296466, + "grad_norm": 1.6664766073226929, + "learning_rate": 3.539065268772929e-05, + "loss": 0.0135, + "step": 20290 + }, + { + "epoch": 1.1917341787014206, + "grad_norm": 4.160362243652344, + "learning_rate": 3.5346433533229474e-05, + "loss": 0.0184, + "step": 20300 + }, + { + "epoch": 1.1923212398731948, + "grad_norm": 0.6458218693733215, + "learning_rate": 3.530222691311666e-05, + "loss": 0.0259, + "step": 20310 + }, + { + "epoch": 1.1929083010449688, + "grad_norm": 1.137535810470581, + "learning_rate": 3.525803286520437e-05, + "loss": 0.0142, + "step": 20320 + }, + { + "epoch": 1.193495362216743, + "grad_norm": 0.21634048223495483, + "learning_rate": 3.521385142729535e-05, + "loss": 0.0139, + "step": 20330 + }, + { + "epoch": 1.194082423388517, + "grad_norm": 4.346113204956055, + "learning_rate": 3.516968263718159e-05, + "loss": 0.0234, + "step": 20340 + }, + { + "epoch": 1.1946694845602912, + "grad_norm": 1.4190869331359863, + "learning_rate": 3.512552653264425e-05, + "loss": 0.0157, + "step": 20350 + }, + { + "epoch": 1.1952565457320652, + "grad_norm": 1.1790575981140137, + "learning_rate": 3.5081383151453604e-05, + "loss": 0.0126, + "step": 20360 + }, + { + "epoch": 1.1958436069038394, + "grad_norm": 0.6182461977005005, + "learning_rate": 3.5037252531369104e-05, + "loss": 0.0161, + "step": 20370 + }, + { + "epoch": 1.1964306680756134, + "grad_norm": 1.2333548069000244, + "learning_rate": 3.499313471013928e-05, + "loss": 0.0134, + "step": 20380 + }, + { + "epoch": 1.1970177292473876, + "grad_norm": 0.6007595658302307, + "learning_rate": 3.494902972550165e-05, + "loss": 0.0224, + "step": 20390 + }, + { + "epoch": 1.1976047904191618, + "grad_norm": 1.6859315633773804, + "learning_rate": 3.490493761518281e-05, + "loss": 0.018, + "step": 20400 + }, + { + "epoch": 1.1981918515909358, + "grad_norm": 0.025474315509200096, + "learning_rate": 3.486085841689832e-05, + "loss": 0.0114, + "step": 20410 + }, + { + "epoch": 1.1987789127627098, + "grad_norm": 1.8451062440872192, + "learning_rate": 3.481679216835273e-05, + "loss": 0.0178, + "step": 20420 + }, + { + "epoch": 1.199365973934484, + "grad_norm": 2.9829752445220947, + "learning_rate": 3.477273890723944e-05, + "loss": 0.0374, + "step": 20430 + }, + { + "epoch": 1.1999530351062582, + "grad_norm": 0.15790514647960663, + "learning_rate": 3.4728698671240854e-05, + "loss": 0.0233, + "step": 20440 + }, + { + "epoch": 1.2005400962780322, + "grad_norm": 0.18786346912384033, + "learning_rate": 3.468467149802808e-05, + "loss": 0.0217, + "step": 20450 + }, + { + "epoch": 1.2011271574498064, + "grad_norm": 2.4212985038757324, + "learning_rate": 3.4640657425261224e-05, + "loss": 0.0141, + "step": 20460 + }, + { + "epoch": 1.2017142186215803, + "grad_norm": 1.8529967069625854, + "learning_rate": 
3.459665649058904e-05, + "loss": 0.0146, + "step": 20470 + }, + { + "epoch": 1.2023012797933545, + "grad_norm": 0.6335251927375793, + "learning_rate": 3.455266873164914e-05, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2028883409651285, + "grad_norm": 2.4057374000549316, + "learning_rate": 3.45086941860678e-05, + "loss": 0.0341, + "step": 20490 + }, + { + "epoch": 1.2034754021369027, + "grad_norm": 2.200655937194824, + "learning_rate": 3.446473289146006e-05, + "loss": 0.0188, + "step": 20500 + }, + { + "epoch": 1.2040624633086767, + "grad_norm": 0.0019765030592679977, + "learning_rate": 3.442078488542957e-05, + "loss": 0.0069, + "step": 20510 + }, + { + "epoch": 1.204649524480451, + "grad_norm": 0.9872438311576843, + "learning_rate": 3.437685020556864e-05, + "loss": 0.0168, + "step": 20520 + }, + { + "epoch": 1.205236585652225, + "grad_norm": 1.3100894689559937, + "learning_rate": 3.433292888945818e-05, + "loss": 0.0077, + "step": 20530 + }, + { + "epoch": 1.2058236468239991, + "grad_norm": 1.1229948997497559, + "learning_rate": 3.428902097466764e-05, + "loss": 0.0103, + "step": 20540 + }, + { + "epoch": 1.206410707995773, + "grad_norm": 1.789831280708313, + "learning_rate": 3.424512649875506e-05, + "loss": 0.0126, + "step": 20550 + }, + { + "epoch": 1.2069977691675473, + "grad_norm": 0.010228688828647137, + "learning_rate": 3.420124549926693e-05, + "loss": 0.016, + "step": 20560 + }, + { + "epoch": 1.2075848303393213, + "grad_norm": 1.4194481372833252, + "learning_rate": 3.4157378013738264e-05, + "loss": 0.0193, + "step": 20570 + }, + { + "epoch": 1.2081718915110955, + "grad_norm": 2.608931303024292, + "learning_rate": 3.411352407969245e-05, + "loss": 0.0083, + "step": 20580 + }, + { + "epoch": 1.2087589526828695, + "grad_norm": 2.1073362827301025, + "learning_rate": 3.406968373464137e-05, + "loss": 0.0138, + "step": 20590 + }, + { + "epoch": 1.2093460138546437, + "grad_norm": 1.5467884540557861, + "learning_rate": 3.402585701608519e-05, + "loss": 0.0229, + "step": 20600 + }, + { + "epoch": 1.2099330750264177, + "grad_norm": 0.020903684198856354, + "learning_rate": 3.398204396151251e-05, + "loss": 0.0077, + "step": 20610 + }, + { + "epoch": 1.2105201361981919, + "grad_norm": 0.7192597985267639, + "learning_rate": 3.3938244608400164e-05, + "loss": 0.0156, + "step": 20620 + }, + { + "epoch": 1.2111071973699659, + "grad_norm": 0.054821472615003586, + "learning_rate": 3.389445899421332e-05, + "loss": 0.0069, + "step": 20630 + }, + { + "epoch": 1.21169425854174, + "grad_norm": 2.3048887252807617, + "learning_rate": 3.385068715640536e-05, + "loss": 0.0111, + "step": 20640 + }, + { + "epoch": 1.2122813197135143, + "grad_norm": 0.7605149149894714, + "learning_rate": 3.380692913241791e-05, + "loss": 0.0277, + "step": 20650 + }, + { + "epoch": 1.2128683808852883, + "grad_norm": 1.7066093683242798, + "learning_rate": 3.376318495968076e-05, + "loss": 0.0124, + "step": 20660 + }, + { + "epoch": 1.2134554420570622, + "grad_norm": 1.971728801727295, + "learning_rate": 3.371945467561186e-05, + "loss": 0.0226, + "step": 20670 + }, + { + "epoch": 1.2140425032288364, + "grad_norm": 0.8244488835334778, + "learning_rate": 3.367573831761728e-05, + "loss": 0.0118, + "step": 20680 + }, + { + "epoch": 1.2146295644006107, + "grad_norm": 0.2647351324558258, + "learning_rate": 3.363203592309117e-05, + "loss": 0.0118, + "step": 20690 + }, + { + "epoch": 1.2152166255723846, + "grad_norm": 5.359185218811035, + "learning_rate": 3.358834752941576e-05, + "loss": 0.0209, + "step": 20700 + }, + { + "epoch": 
1.2158036867441586, + "grad_norm": 2.0620193481445312, + "learning_rate": 3.354467317396124e-05, + "loss": 0.0158, + "step": 20710 + }, + { + "epoch": 1.2163907479159328, + "grad_norm": 0.27595534920692444, + "learning_rate": 3.35010128940859e-05, + "loss": 0.018, + "step": 20720 + }, + { + "epoch": 1.216977809087707, + "grad_norm": 0.4861520528793335, + "learning_rate": 3.345736672713588e-05, + "loss": 0.0154, + "step": 20730 + }, + { + "epoch": 1.217564870259481, + "grad_norm": 5.281528949737549, + "learning_rate": 3.341373471044531e-05, + "loss": 0.0187, + "step": 20740 + }, + { + "epoch": 1.2181519314312552, + "grad_norm": 2.6023237705230713, + "learning_rate": 3.33701168813362e-05, + "loss": 0.016, + "step": 20750 + }, + { + "epoch": 1.2187389926030292, + "grad_norm": 1.3135600090026855, + "learning_rate": 3.3326513277118446e-05, + "loss": 0.0141, + "step": 20760 + }, + { + "epoch": 1.2193260537748034, + "grad_norm": 1.202734112739563, + "learning_rate": 3.328292393508972e-05, + "loss": 0.0191, + "step": 20770 + }, + { + "epoch": 1.2199131149465774, + "grad_norm": 1.1474391222000122, + "learning_rate": 3.323934889253556e-05, + "loss": 0.0163, + "step": 20780 + }, + { + "epoch": 1.2205001761183516, + "grad_norm": 3.3902037143707275, + "learning_rate": 3.3195788186729245e-05, + "loss": 0.0087, + "step": 20790 + }, + { + "epoch": 1.2210872372901256, + "grad_norm": 1.333632230758667, + "learning_rate": 3.315224185493176e-05, + "loss": 0.0159, + "step": 20800 + }, + { + "epoch": 1.2216742984618998, + "grad_norm": 0.49751606583595276, + "learning_rate": 3.310870993439187e-05, + "loss": 0.0128, + "step": 20810 + }, + { + "epoch": 1.2222613596336738, + "grad_norm": 0.791757345199585, + "learning_rate": 3.3065192462345915e-05, + "loss": 0.0151, + "step": 20820 + }, + { + "epoch": 1.222848420805448, + "grad_norm": 0.08929609507322311, + "learning_rate": 3.302168947601797e-05, + "loss": 0.0184, + "step": 20830 + }, + { + "epoch": 1.223435481977222, + "grad_norm": 0.20553001761436462, + "learning_rate": 3.297820101261964e-05, + "loss": 0.022, + "step": 20840 + }, + { + "epoch": 1.2240225431489962, + "grad_norm": 0.8915656805038452, + "learning_rate": 3.293472710935017e-05, + "loss": 0.0148, + "step": 20850 + }, + { + "epoch": 1.2246096043207702, + "grad_norm": 0.42689603567123413, + "learning_rate": 3.289126780339631e-05, + "loss": 0.0131, + "step": 20860 + }, + { + "epoch": 1.2251966654925444, + "grad_norm": 0.42955946922302246, + "learning_rate": 3.2847823131932365e-05, + "loss": 0.0217, + "step": 20870 + }, + { + "epoch": 1.2257837266643183, + "grad_norm": 0.8357498049736023, + "learning_rate": 3.280439313212006e-05, + "loss": 0.0161, + "step": 20880 + }, + { + "epoch": 1.2263707878360925, + "grad_norm": 0.27838730812072754, + "learning_rate": 3.276097784110862e-05, + "loss": 0.0203, + "step": 20890 + }, + { + "epoch": 1.2269578490078665, + "grad_norm": 0.32219842076301575, + "learning_rate": 3.271757729603467e-05, + "loss": 0.0126, + "step": 20900 + }, + { + "epoch": 1.2275449101796407, + "grad_norm": 0.9408592581748962, + "learning_rate": 3.267419153402225e-05, + "loss": 0.0143, + "step": 20910 + }, + { + "epoch": 1.2281319713514147, + "grad_norm": 1.6443122625350952, + "learning_rate": 3.2630820592182696e-05, + "loss": 0.014, + "step": 20920 + }, + { + "epoch": 1.228719032523189, + "grad_norm": 1.9469926357269287, + "learning_rate": 3.258746450761471e-05, + "loss": 0.0183, + "step": 20930 + }, + { + "epoch": 1.2293060936949631, + "grad_norm": 2.3205080032348633, + "learning_rate": 
3.25441233174043e-05, + "loss": 0.0334, + "step": 20940 + }, + { + "epoch": 1.2298931548667371, + "grad_norm": 0.05783454701304436, + "learning_rate": 3.250079705862468e-05, + "loss": 0.0178, + "step": 20950 + }, + { + "epoch": 1.230480216038511, + "grad_norm": 1.6985958814620972, + "learning_rate": 3.245748576833636e-05, + "loss": 0.0067, + "step": 20960 + }, + { + "epoch": 1.2310672772102853, + "grad_norm": 0.7721903324127197, + "learning_rate": 3.241418948358696e-05, + "loss": 0.015, + "step": 20970 + }, + { + "epoch": 1.2316543383820595, + "grad_norm": 0.9906013011932373, + "learning_rate": 3.237090824141134e-05, + "loss": 0.0149, + "step": 20980 + }, + { + "epoch": 1.2322413995538335, + "grad_norm": 0.5731455683708191, + "learning_rate": 3.2327642078831466e-05, + "loss": 0.0097, + "step": 20990 + }, + { + "epoch": 1.2328284607256077, + "grad_norm": 0.218977689743042, + "learning_rate": 3.228439103285641e-05, + "loss": 0.0198, + "step": 21000 + }, + { + "epoch": 1.2328284607256077, + "eval_loss": 0.5017571449279785, + "eval_runtime": 269.8345, + "eval_samples_per_second": 3.502, + "eval_steps_per_second": 3.502, + "step": 21000 + }, + { + "epoch": 1.2334155218973817, + "grad_norm": 0.7364138960838318, + "learning_rate": 3.2241155140482294e-05, + "loss": 0.0115, + "step": 21010 + }, + { + "epoch": 1.234002583069156, + "grad_norm": 0.007412275765091181, + "learning_rate": 3.2197934438692314e-05, + "loss": 0.0148, + "step": 21020 + }, + { + "epoch": 1.2345896442409299, + "grad_norm": 0.6939229965209961, + "learning_rate": 3.2154728964456605e-05, + "loss": 0.0159, + "step": 21030 + }, + { + "epoch": 1.235176705412704, + "grad_norm": 0.28469404578208923, + "learning_rate": 3.211153875473239e-05, + "loss": 0.0129, + "step": 21040 + }, + { + "epoch": 1.235763766584478, + "grad_norm": 0.1567525714635849, + "learning_rate": 3.206836384646371e-05, + "loss": 0.0092, + "step": 21050 + }, + { + "epoch": 1.2363508277562523, + "grad_norm": 2.098961353302002, + "learning_rate": 3.202520427658159e-05, + "loss": 0.0155, + "step": 21060 + }, + { + "epoch": 1.2369378889280263, + "grad_norm": 1.6644279956817627, + "learning_rate": 3.1982060082003954e-05, + "loss": 0.0334, + "step": 21070 + }, + { + "epoch": 1.2375249500998005, + "grad_norm": 0.6723320484161377, + "learning_rate": 3.1938931299635484e-05, + "loss": 0.009, + "step": 21080 + }, + { + "epoch": 1.2381120112715744, + "grad_norm": 1.654505968093872, + "learning_rate": 3.189581796636778e-05, + "loss": 0.01, + "step": 21090 + }, + { + "epoch": 1.2386990724433486, + "grad_norm": 0.6577444672584534, + "learning_rate": 3.185272011907915e-05, + "loss": 0.0187, + "step": 21100 + }, + { + "epoch": 1.2392861336151226, + "grad_norm": 1.0867955684661865, + "learning_rate": 3.180963779463472e-05, + "loss": 0.0104, + "step": 21110 + }, + { + "epoch": 1.2398731947868968, + "grad_norm": 0.9444228410720825, + "learning_rate": 3.176657102988628e-05, + "loss": 0.0114, + "step": 21120 + }, + { + "epoch": 1.2404602559586708, + "grad_norm": 3.534825325012207, + "learning_rate": 3.1723519861672354e-05, + "loss": 0.0234, + "step": 21130 + }, + { + "epoch": 1.241047317130445, + "grad_norm": 0.574012041091919, + "learning_rate": 3.168048432681808e-05, + "loss": 0.0085, + "step": 21140 + }, + { + "epoch": 1.241634378302219, + "grad_norm": 0.6430829763412476, + "learning_rate": 3.1637464462135286e-05, + "loss": 0.013, + "step": 21150 + }, + { + "epoch": 1.2422214394739932, + "grad_norm": 0.3673301041126251, + "learning_rate": 3.159446030442232e-05, + "loss": 0.0283, + 
"step": 21160 + }, + { + "epoch": 1.2428085006457672, + "grad_norm": 0.6621904969215393, + "learning_rate": 3.155147189046418e-05, + "loss": 0.0124, + "step": 21170 + }, + { + "epoch": 1.2433955618175414, + "grad_norm": 0.03779918700456619, + "learning_rate": 3.1508499257032306e-05, + "loss": 0.016, + "step": 21180 + }, + { + "epoch": 1.2439826229893154, + "grad_norm": 0.7762832641601562, + "learning_rate": 3.1465542440884736e-05, + "loss": 0.0079, + "step": 21190 + }, + { + "epoch": 1.2445696841610896, + "grad_norm": 0.8843396902084351, + "learning_rate": 3.1422601478765874e-05, + "loss": 0.011, + "step": 21200 + }, + { + "epoch": 1.2451567453328636, + "grad_norm": 3.4436707496643066, + "learning_rate": 3.137967640740665e-05, + "loss": 0.0151, + "step": 21210 + }, + { + "epoch": 1.2457438065046378, + "grad_norm": 0.6071956753730774, + "learning_rate": 3.133676726352438e-05, + "loss": 0.0126, + "step": 21220 + }, + { + "epoch": 1.246330867676412, + "grad_norm": 1.4261529445648193, + "learning_rate": 3.12938740838227e-05, + "loss": 0.0172, + "step": 21230 + }, + { + "epoch": 1.246917928848186, + "grad_norm": 1.1258330345153809, + "learning_rate": 3.125099690499168e-05, + "loss": 0.0117, + "step": 21240 + }, + { + "epoch": 1.24750499001996, + "grad_norm": 0.3895932137966156, + "learning_rate": 3.120813576370763e-05, + "loss": 0.0055, + "step": 21250 + }, + { + "epoch": 1.2480920511917342, + "grad_norm": 0.2762930691242218, + "learning_rate": 3.1165290696633185e-05, + "loss": 0.0098, + "step": 21260 + }, + { + "epoch": 1.2486791123635084, + "grad_norm": 0.023398570716381073, + "learning_rate": 3.11224617404172e-05, + "loss": 0.0059, + "step": 21270 + }, + { + "epoch": 1.2492661735352824, + "grad_norm": 0.11404114961624146, + "learning_rate": 3.1079648931694796e-05, + "loss": 0.047, + "step": 21280 + }, + { + "epoch": 1.2498532347070566, + "grad_norm": 0.21508830785751343, + "learning_rate": 3.1036852307087183e-05, + "loss": 0.0171, + "step": 21290 + }, + { + "epoch": 1.2504402958788305, + "grad_norm": 3.046077251434326, + "learning_rate": 3.099407190320188e-05, + "loss": 0.0375, + "step": 21300 + }, + { + "epoch": 1.2510273570506047, + "grad_norm": 0.38301554322242737, + "learning_rate": 3.095130775663237e-05, + "loss": 0.0105, + "step": 21310 + }, + { + "epoch": 1.2516144182223787, + "grad_norm": 2.3761250972747803, + "learning_rate": 3.090855990395836e-05, + "loss": 0.0122, + "step": 21320 + }, + { + "epoch": 1.252201479394153, + "grad_norm": 0.27872663736343384, + "learning_rate": 3.086582838174551e-05, + "loss": 0.0187, + "step": 21330 + }, + { + "epoch": 1.252788540565927, + "grad_norm": 2.158463478088379, + "learning_rate": 3.082311322654562e-05, + "loss": 0.0453, + "step": 21340 + }, + { + "epoch": 1.2533756017377011, + "grad_norm": 0.7334088683128357, + "learning_rate": 3.0780414474896414e-05, + "loss": 0.0233, + "step": 21350 + }, + { + "epoch": 1.2539626629094751, + "grad_norm": 1.0348256826400757, + "learning_rate": 3.0737732163321596e-05, + "loss": 0.0152, + "step": 21360 + }, + { + "epoch": 1.2545497240812493, + "grad_norm": 0.04808567464351654, + "learning_rate": 3.0695066328330845e-05, + "loss": 0.0176, + "step": 21370 + }, + { + "epoch": 1.2551367852530233, + "grad_norm": 1.2392550706863403, + "learning_rate": 3.0652417006419674e-05, + "loss": 0.014, + "step": 21380 + }, + { + "epoch": 1.2557238464247975, + "grad_norm": 0.5190914273262024, + "learning_rate": 3.0609784234069575e-05, + "loss": 0.0101, + "step": 21390 + }, + { + "epoch": 1.2563109075965715, + "grad_norm": 
0.9737521409988403, + "learning_rate": 3.0567168047747776e-05, + "loss": 0.016, + "step": 21400 + }, + { + "epoch": 1.2568979687683457, + "grad_norm": 0.3394123315811157, + "learning_rate": 3.052456848390739e-05, + "loss": 0.0171, + "step": 21410 + }, + { + "epoch": 1.2574850299401197, + "grad_norm": 0.7681484222412109, + "learning_rate": 3.048198557898727e-05, + "loss": 0.0111, + "step": 21420 + }, + { + "epoch": 1.2580720911118939, + "grad_norm": 3.3313300609588623, + "learning_rate": 3.043941936941207e-05, + "loss": 0.0257, + "step": 21430 + }, + { + "epoch": 1.258659152283668, + "grad_norm": 2.9273416996002197, + "learning_rate": 3.0396869891592093e-05, + "loss": 0.0101, + "step": 21440 + }, + { + "epoch": 1.259246213455442, + "grad_norm": 0.5039469599723816, + "learning_rate": 3.035433718192341e-05, + "loss": 0.004, + "step": 21450 + }, + { + "epoch": 1.259833274627216, + "grad_norm": 0.8747548460960388, + "learning_rate": 3.0311821276787654e-05, + "loss": 0.0208, + "step": 21460 + }, + { + "epoch": 1.2604203357989903, + "grad_norm": 1.177300214767456, + "learning_rate": 3.0269322212552153e-05, + "loss": 0.0215, + "step": 21470 + }, + { + "epoch": 1.2610073969707645, + "grad_norm": 1.727304458618164, + "learning_rate": 3.0226840025569857e-05, + "loss": 0.0152, + "step": 21480 + }, + { + "epoch": 1.2615944581425385, + "grad_norm": 0.13978305459022522, + "learning_rate": 3.0184374752179183e-05, + "loss": 0.0095, + "step": 21490 + }, + { + "epoch": 1.2621815193143124, + "grad_norm": 0.845206081867218, + "learning_rate": 3.014192642870416e-05, + "loss": 0.0064, + "step": 21500 + }, + { + "epoch": 1.2627685804860866, + "grad_norm": 1.3671202659606934, + "learning_rate": 3.0099495091454268e-05, + "loss": 0.0064, + "step": 21510 + }, + { + "epoch": 1.2633556416578609, + "grad_norm": 1.586337685585022, + "learning_rate": 3.00570807767245e-05, + "loss": 0.0128, + "step": 21520 + }, + { + "epoch": 1.2639427028296348, + "grad_norm": 0.500262439250946, + "learning_rate": 3.0014683520795256e-05, + "loss": 0.0179, + "step": 21530 + }, + { + "epoch": 1.2645297640014088, + "grad_norm": 0.5322319269180298, + "learning_rate": 2.9972303359932386e-05, + "loss": 0.0098, + "step": 21540 + }, + { + "epoch": 1.265116825173183, + "grad_norm": 1.2702996730804443, + "learning_rate": 2.992994033038704e-05, + "loss": 0.0176, + "step": 21550 + }, + { + "epoch": 1.2657038863449572, + "grad_norm": 1.5824235677719116, + "learning_rate": 2.9887594468395798e-05, + "loss": 0.0122, + "step": 21560 + }, + { + "epoch": 1.2662909475167312, + "grad_norm": 0.1891796886920929, + "learning_rate": 2.984526581018049e-05, + "loss": 0.0161, + "step": 21570 + }, + { + "epoch": 1.2668780086885054, + "grad_norm": 0.02000442147254944, + "learning_rate": 2.9802954391948294e-05, + "loss": 0.0098, + "step": 21580 + }, + { + "epoch": 1.2674650698602794, + "grad_norm": 2.0282230377197266, + "learning_rate": 2.976066024989158e-05, + "loss": 0.0182, + "step": 21590 + }, + { + "epoch": 1.2680521310320536, + "grad_norm": 0.06673241406679153, + "learning_rate": 2.9718383420187983e-05, + "loss": 0.0082, + "step": 21600 + }, + { + "epoch": 1.2686391922038276, + "grad_norm": 1.3715343475341797, + "learning_rate": 2.96761239390003e-05, + "loss": 0.0194, + "step": 21610 + }, + { + "epoch": 1.2692262533756018, + "grad_norm": 2.4239096641540527, + "learning_rate": 2.963388184247651e-05, + "loss": 0.0145, + "step": 21620 + }, + { + "epoch": 1.2698133145473758, + "grad_norm": 0.1532154381275177, + "learning_rate": 2.959165716674973e-05, + "loss": 0.0105, 
+ "step": 21630 + }, + { + "epoch": 1.27040037571915, + "grad_norm": 0.31090089678764343, + "learning_rate": 2.9549449947938108e-05, + "loss": 0.0083, + "step": 21640 + }, + { + "epoch": 1.270987436890924, + "grad_norm": 0.05951221287250519, + "learning_rate": 2.9507260222144973e-05, + "loss": 0.0084, + "step": 21650 + }, + { + "epoch": 1.2715744980626982, + "grad_norm": 2.5123090744018555, + "learning_rate": 2.9465088025458586e-05, + "loss": 0.0124, + "step": 21660 + }, + { + "epoch": 1.2721615592344722, + "grad_norm": 0.24349847435951233, + "learning_rate": 2.942293339395227e-05, + "loss": 0.0056, + "step": 21670 + }, + { + "epoch": 1.2727486204062464, + "grad_norm": 0.0393960103392601, + "learning_rate": 2.9380796363684303e-05, + "loss": 0.0151, + "step": 21680 + }, + { + "epoch": 1.2733356815780204, + "grad_norm": 0.1087142676115036, + "learning_rate": 2.9338676970697926e-05, + "loss": 0.0083, + "step": 21690 + }, + { + "epoch": 1.2739227427497946, + "grad_norm": 0.1377180814743042, + "learning_rate": 2.9296575251021265e-05, + "loss": 0.0116, + "step": 21700 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 0.7107512950897217, + "learning_rate": 2.925449124066737e-05, + "loss": 0.01, + "step": 21710 + }, + { + "epoch": 1.2750968650933427, + "grad_norm": 1.46084725856781, + "learning_rate": 2.9212424975634078e-05, + "loss": 0.0111, + "step": 21720 + }, + { + "epoch": 1.275683926265117, + "grad_norm": 0.9276548027992249, + "learning_rate": 2.9170376491904127e-05, + "loss": 0.0143, + "step": 21730 + }, + { + "epoch": 1.276270987436891, + "grad_norm": 1.665923833847046, + "learning_rate": 2.912834582544497e-05, + "loss": 0.0204, + "step": 21740 + }, + { + "epoch": 1.276858048608665, + "grad_norm": 0.3215520679950714, + "learning_rate": 2.9086333012208865e-05, + "loss": 0.0217, + "step": 21750 + }, + { + "epoch": 1.2774451097804391, + "grad_norm": 0.5315148234367371, + "learning_rate": 2.9044338088132816e-05, + "loss": 0.0176, + "step": 21760 + }, + { + "epoch": 1.2780321709522133, + "grad_norm": 3.6604113578796387, + "learning_rate": 2.9002361089138453e-05, + "loss": 0.0093, + "step": 21770 + }, + { + "epoch": 1.2786192321239873, + "grad_norm": 0.34918174147605896, + "learning_rate": 2.896040205113214e-05, + "loss": 0.0088, + "step": 21780 + }, + { + "epoch": 1.2792062932957613, + "grad_norm": 1.7474963665008545, + "learning_rate": 2.8918461010004842e-05, + "loss": 0.0219, + "step": 21790 + }, + { + "epoch": 1.2797933544675355, + "grad_norm": 2.510493516921997, + "learning_rate": 2.887653800163218e-05, + "loss": 0.0153, + "step": 21800 + }, + { + "epoch": 1.2803804156393097, + "grad_norm": 4.96450662612915, + "learning_rate": 2.8834633061874256e-05, + "loss": 0.0179, + "step": 21810 + }, + { + "epoch": 1.2809674768110837, + "grad_norm": 0.05940713733434677, + "learning_rate": 2.87927462265758e-05, + "loss": 0.0116, + "step": 21820 + }, + { + "epoch": 1.2815545379828577, + "grad_norm": 0.5804621577262878, + "learning_rate": 2.875087753156603e-05, + "loss": 0.0116, + "step": 21830 + }, + { + "epoch": 1.2821415991546319, + "grad_norm": 0.5918548107147217, + "learning_rate": 2.8709027012658663e-05, + "loss": 0.013, + "step": 21840 + }, + { + "epoch": 1.282728660326406, + "grad_norm": 0.11733749508857727, + "learning_rate": 2.8667194705651807e-05, + "loss": 0.0233, + "step": 21850 + }, + { + "epoch": 1.28331572149818, + "grad_norm": 0.25058963894844055, + "learning_rate": 2.862538064632808e-05, + "loss": 0.0153, + "step": 21860 + }, + { + "epoch": 1.2839027826699543, + "grad_norm": 
1.7111525535583496, + "learning_rate": 2.858358487045441e-05, + "loss": 0.0276, + "step": 21870 + }, + { + "epoch": 1.2844898438417283, + "grad_norm": 1.1210541725158691, + "learning_rate": 2.854180741378214e-05, + "loss": 0.0048, + "step": 21880 + }, + { + "epoch": 1.2850769050135025, + "grad_norm": 2.3163340091705322, + "learning_rate": 2.8500048312046927e-05, + "loss": 0.0256, + "step": 21890 + }, + { + "epoch": 1.2856639661852765, + "grad_norm": 0.0008968147449195385, + "learning_rate": 2.8458307600968725e-05, + "loss": 0.0158, + "step": 21900 + }, + { + "epoch": 1.2862510273570507, + "grad_norm": 1.1192528009414673, + "learning_rate": 2.8416585316251776e-05, + "loss": 0.0064, + "step": 21910 + }, + { + "epoch": 1.2868380885288246, + "grad_norm": 3.4580559730529785, + "learning_rate": 2.8374881493584516e-05, + "loss": 0.0165, + "step": 21920 + }, + { + "epoch": 1.2874251497005988, + "grad_norm": 2.1928298473358154, + "learning_rate": 2.8333196168639632e-05, + "loss": 0.0136, + "step": 21930 + }, + { + "epoch": 1.2880122108723728, + "grad_norm": 0.12867216765880585, + "learning_rate": 2.8291529377073956e-05, + "loss": 0.0099, + "step": 21940 + }, + { + "epoch": 1.288599272044147, + "grad_norm": 0.6393283605575562, + "learning_rate": 2.824988115452849e-05, + "loss": 0.0263, + "step": 21950 + }, + { + "epoch": 1.289186333215921, + "grad_norm": 1.985285997390747, + "learning_rate": 2.8208251536628344e-05, + "loss": 0.0208, + "step": 21960 + }, + { + "epoch": 1.2897733943876952, + "grad_norm": 0.04826156795024872, + "learning_rate": 2.8166640558982743e-05, + "loss": 0.0117, + "step": 21970 + }, + { + "epoch": 1.2903604555594694, + "grad_norm": 0.5573629140853882, + "learning_rate": 2.8125048257184896e-05, + "loss": 0.0066, + "step": 21980 + }, + { + "epoch": 1.2909475167312434, + "grad_norm": 3.0405917167663574, + "learning_rate": 2.8083474666812127e-05, + "loss": 0.0108, + "step": 21990 + }, + { + "epoch": 1.2915345779030174, + "grad_norm": 0.31120508909225464, + "learning_rate": 2.8041919823425633e-05, + "loss": 0.0081, + "step": 22000 + }, + { + "epoch": 1.2921216390747916, + "grad_norm": 0.12164461612701416, + "learning_rate": 2.800038376257075e-05, + "loss": 0.0214, + "step": 22010 + }, + { + "epoch": 1.2927087002465658, + "grad_norm": 0.35867759585380554, + "learning_rate": 2.7958866519776572e-05, + "loss": 0.0056, + "step": 22020 + }, + { + "epoch": 1.2932957614183398, + "grad_norm": 0.20889237523078918, + "learning_rate": 2.791736813055621e-05, + "loss": 0.0195, + "step": 22030 + }, + { + "epoch": 1.2938828225901138, + "grad_norm": 1.40049147605896, + "learning_rate": 2.787588863040661e-05, + "loss": 0.0198, + "step": 22040 + }, + { + "epoch": 1.294469883761888, + "grad_norm": 1.6490122079849243, + "learning_rate": 2.7834428054808543e-05, + "loss": 0.0043, + "step": 22050 + }, + { + "epoch": 1.2950569449336622, + "grad_norm": 4.0196990966796875, + "learning_rate": 2.7792986439226615e-05, + "loss": 0.0158, + "step": 22060 + }, + { + "epoch": 1.2956440061054362, + "grad_norm": 2.3238790035247803, + "learning_rate": 2.7751563819109218e-05, + "loss": 0.0171, + "step": 22070 + }, + { + "epoch": 1.2962310672772102, + "grad_norm": 1.6786881685256958, + "learning_rate": 2.7710160229888504e-05, + "loss": 0.0195, + "step": 22080 + }, + { + "epoch": 1.2968181284489844, + "grad_norm": 0.03470773994922638, + "learning_rate": 2.7668775706980288e-05, + "loss": 0.0244, + "step": 22090 + }, + { + "epoch": 1.2974051896207586, + "grad_norm": 0.21821852028369904, + "learning_rate": 
2.7627410285784163e-05, + "loss": 0.0053, + "step": 22100 + }, + { + "epoch": 1.2979922507925326, + "grad_norm": 1.1397291421890259, + "learning_rate": 2.7586064001683286e-05, + "loss": 0.004, + "step": 22110 + }, + { + "epoch": 1.2985793119643068, + "grad_norm": 1.734389066696167, + "learning_rate": 2.754473689004453e-05, + "loss": 0.0085, + "step": 22120 + }, + { + "epoch": 1.2991663731360807, + "grad_norm": 2.2286031246185303, + "learning_rate": 2.750342898621833e-05, + "loss": 0.012, + "step": 22130 + }, + { + "epoch": 1.299753434307855, + "grad_norm": 0.5030475854873657, + "learning_rate": 2.7462140325538714e-05, + "loss": 0.0134, + "step": 22140 + }, + { + "epoch": 1.300340495479629, + "grad_norm": 0.7477059364318848, + "learning_rate": 2.7420870943323197e-05, + "loss": 0.0067, + "step": 22150 + }, + { + "epoch": 1.3009275566514031, + "grad_norm": 0.13905635476112366, + "learning_rate": 2.7379620874872856e-05, + "loss": 0.0042, + "step": 22160 + }, + { + "epoch": 1.3015146178231771, + "grad_norm": 1.4355264902114868, + "learning_rate": 2.7338390155472215e-05, + "loss": 0.0107, + "step": 22170 + }, + { + "epoch": 1.3021016789949513, + "grad_norm": 1.68543541431427, + "learning_rate": 2.729717882038925e-05, + "loss": 0.0037, + "step": 22180 + }, + { + "epoch": 1.3026887401667253, + "grad_norm": 1.7076072692871094, + "learning_rate": 2.725598690487543e-05, + "loss": 0.0069, + "step": 22190 + }, + { + "epoch": 1.3032758013384995, + "grad_norm": 0.7237100601196289, + "learning_rate": 2.721481444416548e-05, + "loss": 0.0096, + "step": 22200 + }, + { + "epoch": 1.3038628625102735, + "grad_norm": 0.8334245681762695, + "learning_rate": 2.7173661473477608e-05, + "loss": 0.0178, + "step": 22210 + }, + { + "epoch": 1.3044499236820477, + "grad_norm": 2.110283851623535, + "learning_rate": 2.7132528028013248e-05, + "loss": 0.018, + "step": 22220 + }, + { + "epoch": 1.3050369848538217, + "grad_norm": 2.716474771499634, + "learning_rate": 2.7091414142957204e-05, + "loss": 0.0135, + "step": 22230 + }, + { + "epoch": 1.305624046025596, + "grad_norm": 0.5923571586608887, + "learning_rate": 2.7050319853477522e-05, + "loss": 0.016, + "step": 22240 + }, + { + "epoch": 1.3062111071973699, + "grad_norm": 0.10442431271076202, + "learning_rate": 2.7009245194725507e-05, + "loss": 0.0083, + "step": 22250 + }, + { + "epoch": 1.306798168369144, + "grad_norm": 0.6889320015907288, + "learning_rate": 2.6968190201835625e-05, + "loss": 0.0144, + "step": 22260 + }, + { + "epoch": 1.3073852295409183, + "grad_norm": 0.5491931438446045, + "learning_rate": 2.6927154909925577e-05, + "loss": 0.008, + "step": 22270 + }, + { + "epoch": 1.3079722907126923, + "grad_norm": 0.1568625420331955, + "learning_rate": 2.6886139354096164e-05, + "loss": 0.0103, + "step": 22280 + }, + { + "epoch": 1.3085593518844663, + "grad_norm": 0.10188999027013779, + "learning_rate": 2.684514356943132e-05, + "loss": 0.0048, + "step": 22290 + }, + { + "epoch": 1.3091464130562405, + "grad_norm": 2.802391529083252, + "learning_rate": 2.6804167590998096e-05, + "loss": 0.0183, + "step": 22300 + }, + { + "epoch": 1.3097334742280147, + "grad_norm": 1.9018369913101196, + "learning_rate": 2.676321145384657e-05, + "loss": 0.0154, + "step": 22310 + }, + { + "epoch": 1.3103205353997887, + "grad_norm": 3.2427594661712646, + "learning_rate": 2.6722275193009872e-05, + "loss": 0.0334, + "step": 22320 + }, + { + "epoch": 1.3109075965715626, + "grad_norm": 0.24874800443649292, + "learning_rate": 2.668135884350408e-05, + "loss": 0.0327, + "step": 22330 + }, + { + "epoch": 
1.3114946577433368, + "grad_norm": 1.3628442287445068, + "learning_rate": 2.664046244032832e-05, + "loss": 0.0112, + "step": 22340 + }, + { + "epoch": 1.312081718915111, + "grad_norm": 0.3787357807159424, + "learning_rate": 2.659958601846454e-05, + "loss": 0.0068, + "step": 22350 + }, + { + "epoch": 1.312668780086885, + "grad_norm": 0.7853474020957947, + "learning_rate": 2.6558729612877753e-05, + "loss": 0.0106, + "step": 22360 + }, + { + "epoch": 1.313255841258659, + "grad_norm": 0.7099056243896484, + "learning_rate": 2.6517893258515702e-05, + "loss": 0.0129, + "step": 22370 + }, + { + "epoch": 1.3138429024304332, + "grad_norm": 2.357701301574707, + "learning_rate": 2.647707699030909e-05, + "loss": 0.0093, + "step": 22380 + }, + { + "epoch": 1.3144299636022074, + "grad_norm": 2.000049114227295, + "learning_rate": 2.6436280843171346e-05, + "loss": 0.0129, + "step": 22390 + }, + { + "epoch": 1.3150170247739814, + "grad_norm": 0.21669819951057434, + "learning_rate": 2.639550485199874e-05, + "loss": 0.0135, + "step": 22400 + }, + { + "epoch": 1.3156040859457556, + "grad_norm": 1.1428183317184448, + "learning_rate": 2.635474905167032e-05, + "loss": 0.0173, + "step": 22410 + }, + { + "epoch": 1.3161911471175296, + "grad_norm": 0.4165472388267517, + "learning_rate": 2.631401347704783e-05, + "loss": 0.0169, + "step": 22420 + }, + { + "epoch": 1.3167782082893038, + "grad_norm": 3.154531717300415, + "learning_rate": 2.627329816297569e-05, + "loss": 0.0094, + "step": 22430 + }, + { + "epoch": 1.3173652694610778, + "grad_norm": 0.2751377522945404, + "learning_rate": 2.6232603144281066e-05, + "loss": 0.007, + "step": 22440 + }, + { + "epoch": 1.317952330632852, + "grad_norm": 0.9876483082771301, + "learning_rate": 2.6191928455773662e-05, + "loss": 0.0088, + "step": 22450 + }, + { + "epoch": 1.318539391804626, + "grad_norm": 0.5597164034843445, + "learning_rate": 2.615127413224588e-05, + "loss": 0.0081, + "step": 22460 + }, + { + "epoch": 1.3191264529764002, + "grad_norm": 1.2955923080444336, + "learning_rate": 2.611064020847266e-05, + "loss": 0.0173, + "step": 22470 + }, + { + "epoch": 1.3197135141481742, + "grad_norm": 1.323433756828308, + "learning_rate": 2.6070026719211505e-05, + "loss": 0.0195, + "step": 22480 + }, + { + "epoch": 1.3203005753199484, + "grad_norm": 0.15278884768486023, + "learning_rate": 2.6029433699202454e-05, + "loss": 0.027, + "step": 22490 + }, + { + "epoch": 1.3208876364917224, + "grad_norm": 2.438828229904175, + "learning_rate": 2.598886118316798e-05, + "loss": 0.0103, + "step": 22500 + }, + { + "epoch": 1.3214746976634966, + "grad_norm": 2.481713056564331, + "learning_rate": 2.5948309205813094e-05, + "loss": 0.0161, + "step": 22510 + }, + { + "epoch": 1.3220617588352708, + "grad_norm": 0.1825207769870758, + "learning_rate": 2.590777780182515e-05, + "loss": 0.0155, + "step": 22520 + }, + { + "epoch": 1.3226488200070448, + "grad_norm": 0.00188036251347512, + "learning_rate": 2.5867267005873996e-05, + "loss": 0.0092, + "step": 22530 + }, + { + "epoch": 1.3232358811788187, + "grad_norm": 0.37991979718208313, + "learning_rate": 2.582677685261179e-05, + "loss": 0.0087, + "step": 22540 + }, + { + "epoch": 1.323822942350593, + "grad_norm": 1.4987987279891968, + "learning_rate": 2.578630737667308e-05, + "loss": 0.0061, + "step": 22550 + }, + { + "epoch": 1.3244100035223672, + "grad_norm": 0.6876380443572998, + "learning_rate": 2.574585861267466e-05, + "loss": 0.0281, + "step": 22560 + }, + { + "epoch": 1.3249970646941411, + "grad_norm": 0.09047604352235794, + "learning_rate": 
2.570543059521569e-05, + "loss": 0.0185, + "step": 22570 + }, + { + "epoch": 1.3255841258659151, + "grad_norm": 3.7617619037628174, + "learning_rate": 2.566502335887747e-05, + "loss": 0.0184, + "step": 22580 + }, + { + "epoch": 1.3261711870376893, + "grad_norm": 0.9839680194854736, + "learning_rate": 2.5624636938223675e-05, + "loss": 0.014, + "step": 22590 + }, + { + "epoch": 1.3267582482094635, + "grad_norm": 0.8359461426734924, + "learning_rate": 2.5584271367800072e-05, + "loss": 0.011, + "step": 22600 + }, + { + "epoch": 1.3273453093812375, + "grad_norm": 3.1844050884246826, + "learning_rate": 2.5543926682134588e-05, + "loss": 0.0076, + "step": 22610 + }, + { + "epoch": 1.3279323705530115, + "grad_norm": 0.13429057598114014, + "learning_rate": 2.550360291573735e-05, + "loss": 0.0169, + "step": 22620 + }, + { + "epoch": 1.3285194317247857, + "grad_norm": 0.23351898789405823, + "learning_rate": 2.546330010310052e-05, + "loss": 0.0119, + "step": 22630 + }, + { + "epoch": 1.32910649289656, + "grad_norm": 0.8100205659866333, + "learning_rate": 2.5423018278698386e-05, + "loss": 0.0222, + "step": 22640 + }, + { + "epoch": 1.329693554068334, + "grad_norm": 0.8847802877426147, + "learning_rate": 2.5382757476987268e-05, + "loss": 0.0112, + "step": 22650 + }, + { + "epoch": 1.330280615240108, + "grad_norm": 0.1871187388896942, + "learning_rate": 2.5342517732405523e-05, + "loss": 0.0041, + "step": 22660 + }, + { + "epoch": 1.330867676411882, + "grad_norm": 0.09971107542514801, + "learning_rate": 2.530229907937344e-05, + "loss": 0.0098, + "step": 22670 + }, + { + "epoch": 1.3314547375836563, + "grad_norm": 0.7304926514625549, + "learning_rate": 2.5262101552293345e-05, + "loss": 0.0163, + "step": 22680 + }, + { + "epoch": 1.3320417987554303, + "grad_norm": 1.4849220514297485, + "learning_rate": 2.52219251855494e-05, + "loss": 0.0081, + "step": 22690 + }, + { + "epoch": 1.3326288599272045, + "grad_norm": 7.254383087158203, + "learning_rate": 2.5181770013507754e-05, + "loss": 0.0161, + "step": 22700 + }, + { + "epoch": 1.3332159210989785, + "grad_norm": 0.7523486614227295, + "learning_rate": 2.5141636070516382e-05, + "loss": 0.0136, + "step": 22710 + }, + { + "epoch": 1.3338029822707527, + "grad_norm": 2.257286787033081, + "learning_rate": 2.5101523390905112e-05, + "loss": 0.0138, + "step": 22720 + }, + { + "epoch": 1.3343900434425267, + "grad_norm": 0.23006044328212738, + "learning_rate": 2.5061432008985598e-05, + "loss": 0.0158, + "step": 22730 + }, + { + "epoch": 1.3349771046143009, + "grad_norm": 0.6245858669281006, + "learning_rate": 2.5021361959051226e-05, + "loss": 0.0455, + "step": 22740 + }, + { + "epoch": 1.3355641657860748, + "grad_norm": 0.9592316746711731, + "learning_rate": 2.4981313275377177e-05, + "loss": 0.005, + "step": 22750 + }, + { + "epoch": 1.336151226957849, + "grad_norm": 1.2934941053390503, + "learning_rate": 2.4941285992220354e-05, + "loss": 0.0172, + "step": 22760 + }, + { + "epoch": 1.336738288129623, + "grad_norm": 4.774064064025879, + "learning_rate": 2.4901280143819368e-05, + "loss": 0.0157, + "step": 22770 + }, + { + "epoch": 1.3373253493013972, + "grad_norm": 3.1580262184143066, + "learning_rate": 2.4861295764394426e-05, + "loss": 0.0224, + "step": 22780 + }, + { + "epoch": 1.3379124104731712, + "grad_norm": 0.2299729436635971, + "learning_rate": 2.482133288814747e-05, + "loss": 0.0076, + "step": 22790 + }, + { + "epoch": 1.3384994716449454, + "grad_norm": 0.013382161036133766, + "learning_rate": 2.4781391549261955e-05, + "loss": 0.0111, + "step": 22800 + }, + { + 
"epoch": 1.3390865328167196, + "grad_norm": 0.6786443591117859, + "learning_rate": 2.4741471781902975e-05, + "loss": 0.0047, + "step": 22810 + }, + { + "epoch": 1.3396735939884936, + "grad_norm": 0.010400927625596523, + "learning_rate": 2.470157362021715e-05, + "loss": 0.0061, + "step": 22820 + }, + { + "epoch": 1.3402606551602676, + "grad_norm": 1.5406306982040405, + "learning_rate": 2.4661697098332648e-05, + "loss": 0.0178, + "step": 22830 + }, + { + "epoch": 1.3408477163320418, + "grad_norm": 0.1092362105846405, + "learning_rate": 2.462184225035905e-05, + "loss": 0.0153, + "step": 22840 + }, + { + "epoch": 1.341434777503816, + "grad_norm": 2.1447012424468994, + "learning_rate": 2.4582009110387506e-05, + "loss": 0.0172, + "step": 22850 + }, + { + "epoch": 1.34202183867559, + "grad_norm": 2.2344930171966553, + "learning_rate": 2.4542197712490483e-05, + "loss": 0.0075, + "step": 22860 + }, + { + "epoch": 1.342608899847364, + "grad_norm": 0.5468419194221497, + "learning_rate": 2.4502408090721934e-05, + "loss": 0.0165, + "step": 22870 + }, + { + "epoch": 1.3431959610191382, + "grad_norm": 1.6260221004486084, + "learning_rate": 2.446264027911716e-05, + "loss": 0.0333, + "step": 22880 + }, + { + "epoch": 1.3437830221909124, + "grad_norm": 0.43379461765289307, + "learning_rate": 2.4422894311692807e-05, + "loss": 0.0044, + "step": 22890 + }, + { + "epoch": 1.3443700833626864, + "grad_norm": 1.4105299711227417, + "learning_rate": 2.438317022244684e-05, + "loss": 0.0206, + "step": 22900 + }, + { + "epoch": 1.3449571445344604, + "grad_norm": 0.469251811504364, + "learning_rate": 2.4343468045358476e-05, + "loss": 0.0212, + "step": 22910 + }, + { + "epoch": 1.3455442057062346, + "grad_norm": 0.9096851944923401, + "learning_rate": 2.4303787814388247e-05, + "loss": 0.0126, + "step": 22920 + }, + { + "epoch": 1.3461312668780088, + "grad_norm": 1.3153198957443237, + "learning_rate": 2.4264129563477822e-05, + "loss": 0.0221, + "step": 22930 + }, + { + "epoch": 1.3467183280497828, + "grad_norm": 1.9034795761108398, + "learning_rate": 2.4224493326550214e-05, + "loss": 0.0174, + "step": 22940 + }, + { + "epoch": 1.347305389221557, + "grad_norm": 0.34362080693244934, + "learning_rate": 2.418487913750946e-05, + "loss": 0.0102, + "step": 22950 + }, + { + "epoch": 1.347892450393331, + "grad_norm": 0.43925729393959045, + "learning_rate": 2.4145287030240826e-05, + "loss": 0.0096, + "step": 22960 + }, + { + "epoch": 1.3484795115651051, + "grad_norm": 0.2632419764995575, + "learning_rate": 2.410571703861063e-05, + "loss": 0.0029, + "step": 22970 + }, + { + "epoch": 1.3490665727368791, + "grad_norm": 0.3569221496582031, + "learning_rate": 2.4066169196466326e-05, + "loss": 0.0064, + "step": 22980 + }, + { + "epoch": 1.3496536339086533, + "grad_norm": 0.8634092211723328, + "learning_rate": 2.4026643537636395e-05, + "loss": 0.0111, + "step": 22990 + }, + { + "epoch": 1.3502406950804273, + "grad_norm": 2.1359846591949463, + "learning_rate": 2.3987140095930343e-05, + "loss": 0.0106, + "step": 23000 + }, + { + "epoch": 1.3508277562522015, + "grad_norm": 0.8999951481819153, + "learning_rate": 2.3947658905138702e-05, + "loss": 0.0095, + "step": 23010 + }, + { + "epoch": 1.3514148174239755, + "grad_norm": 4.4071526527404785, + "learning_rate": 2.3908199999032904e-05, + "loss": 0.0182, + "step": 23020 + }, + { + "epoch": 1.3520018785957497, + "grad_norm": 2.316470146179199, + "learning_rate": 2.3868763411365396e-05, + "loss": 0.0225, + "step": 23030 + }, + { + "epoch": 1.3525889397675237, + "grad_norm": 0.017609085887670517, + 
"learning_rate": 2.382934917586947e-05, + "loss": 0.0055, + "step": 23040 + }, + { + "epoch": 1.353176000939298, + "grad_norm": 1.3149291276931763, + "learning_rate": 2.378995732625933e-05, + "loss": 0.0051, + "step": 23050 + }, + { + "epoch": 1.353763062111072, + "grad_norm": 0.7345719933509827, + "learning_rate": 2.375058789623004e-05, + "loss": 0.0119, + "step": 23060 + }, + { + "epoch": 1.354350123282846, + "grad_norm": 1.8294070959091187, + "learning_rate": 2.3711240919457493e-05, + "loss": 0.035, + "step": 23070 + }, + { + "epoch": 1.35493718445462, + "grad_norm": 0.23536460101604462, + "learning_rate": 2.367191642959832e-05, + "loss": 0.0137, + "step": 23080 + }, + { + "epoch": 1.3555242456263943, + "grad_norm": 0.8988510966300964, + "learning_rate": 2.3632614460289985e-05, + "loss": 0.0106, + "step": 23090 + }, + { + "epoch": 1.3561113067981685, + "grad_norm": 0.06487403064966202, + "learning_rate": 2.3593335045150626e-05, + "loss": 0.014, + "step": 23100 + }, + { + "epoch": 1.3566983679699425, + "grad_norm": 0.8209073543548584, + "learning_rate": 2.3554078217779145e-05, + "loss": 0.0156, + "step": 23110 + }, + { + "epoch": 1.3572854291417165, + "grad_norm": 0.07240058481693268, + "learning_rate": 2.3514844011755087e-05, + "loss": 0.0033, + "step": 23120 + }, + { + "epoch": 1.3578724903134907, + "grad_norm": 1.495160698890686, + "learning_rate": 2.3475632460638692e-05, + "loss": 0.0036, + "step": 23130 + }, + { + "epoch": 1.3584595514852649, + "grad_norm": 0.2292553335428238, + "learning_rate": 2.3436443597970735e-05, + "loss": 0.0262, + "step": 23140 + }, + { + "epoch": 1.3590466126570389, + "grad_norm": 0.006750187836587429, + "learning_rate": 2.3397277457272665e-05, + "loss": 0.0119, + "step": 23150 + }, + { + "epoch": 1.3596336738288128, + "grad_norm": 0.023953752592206, + "learning_rate": 2.3358134072046466e-05, + "loss": 0.0137, + "step": 23160 + }, + { + "epoch": 1.360220735000587, + "grad_norm": 1.2029908895492554, + "learning_rate": 2.331901347577466e-05, + "loss": 0.0134, + "step": 23170 + }, + { + "epoch": 1.3608077961723613, + "grad_norm": 2.6604292392730713, + "learning_rate": 2.327991570192029e-05, + "loss": 0.0098, + "step": 23180 + }, + { + "epoch": 1.3613948573441352, + "grad_norm": 1.4938769340515137, + "learning_rate": 2.3240840783926827e-05, + "loss": 0.014, + "step": 23190 + }, + { + "epoch": 1.3619819185159094, + "grad_norm": 0.8838220238685608, + "learning_rate": 2.320178875521826e-05, + "loss": 0.0113, + "step": 23200 + }, + { + "epoch": 1.3625689796876834, + "grad_norm": 1.2276970148086548, + "learning_rate": 2.3162759649198928e-05, + "loss": 0.0059, + "step": 23210 + }, + { + "epoch": 1.3631560408594576, + "grad_norm": 3.7173707485198975, + "learning_rate": 2.3123753499253618e-05, + "loss": 0.0206, + "step": 23220 + }, + { + "epoch": 1.3637431020312316, + "grad_norm": 2.413621187210083, + "learning_rate": 2.3084770338747464e-05, + "loss": 0.0074, + "step": 23230 + }, + { + "epoch": 1.3643301632030058, + "grad_norm": 0.09392088651657104, + "learning_rate": 2.3045810201025946e-05, + "loss": 0.0068, + "step": 23240 + }, + { + "epoch": 1.3649172243747798, + "grad_norm": 1.4007784128189087, + "learning_rate": 2.300687311941481e-05, + "loss": 0.009, + "step": 23250 + }, + { + "epoch": 1.365504285546554, + "grad_norm": 0.1655406653881073, + "learning_rate": 2.296795912722014e-05, + "loss": 0.0099, + "step": 23260 + }, + { + "epoch": 1.366091346718328, + "grad_norm": 1.608480453491211, + "learning_rate": 2.29290682577282e-05, + "loss": 0.0215, + "step": 23270 + }, 
+ { + "epoch": 1.3666784078901022, + "grad_norm": 0.3631249666213989, + "learning_rate": 2.2890200544205516e-05, + "loss": 0.0135, + "step": 23280 + }, + { + "epoch": 1.3672654690618762, + "grad_norm": 2.106283664703369, + "learning_rate": 2.285135601989885e-05, + "loss": 0.0062, + "step": 23290 + }, + { + "epoch": 1.3678525302336504, + "grad_norm": 1.1901888847351074, + "learning_rate": 2.2812534718035046e-05, + "loss": 0.0165, + "step": 23300 + }, + { + "epoch": 1.3684395914054244, + "grad_norm": 0.20205309987068176, + "learning_rate": 2.277373667182114e-05, + "loss": 0.0103, + "step": 23310 + }, + { + "epoch": 1.3690266525771986, + "grad_norm": 0.9821584820747375, + "learning_rate": 2.2734961914444225e-05, + "loss": 0.0125, + "step": 23320 + }, + { + "epoch": 1.3696137137489726, + "grad_norm": 0.27546176314353943, + "learning_rate": 2.2696210479071524e-05, + "loss": 0.0118, + "step": 23330 + }, + { + "epoch": 1.3702007749207468, + "grad_norm": 0.06945142149925232, + "learning_rate": 2.2657482398850287e-05, + "loss": 0.0089, + "step": 23340 + }, + { + "epoch": 1.370787836092521, + "grad_norm": 0.027274999767541885, + "learning_rate": 2.261877770690781e-05, + "loss": 0.009, + "step": 23350 + }, + { + "epoch": 1.371374897264295, + "grad_norm": 1.9020315408706665, + "learning_rate": 2.2580096436351333e-05, + "loss": 0.0123, + "step": 23360 + }, + { + "epoch": 1.371961958436069, + "grad_norm": 0.11881349980831146, + "learning_rate": 2.2541438620268124e-05, + "loss": 0.0116, + "step": 23370 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 0.720493733882904, + "learning_rate": 2.2502804291725315e-05, + "loss": 0.0147, + "step": 23380 + }, + { + "epoch": 1.3731360807796174, + "grad_norm": 0.09386742860078812, + "learning_rate": 2.246419348377001e-05, + "loss": 0.0091, + "step": 23390 + }, + { + "epoch": 1.3737231419513913, + "grad_norm": 1.8141838312149048, + "learning_rate": 2.242560622942918e-05, + "loss": 0.0304, + "step": 23400 + }, + { + "epoch": 1.3743102031231653, + "grad_norm": 2.572324275970459, + "learning_rate": 2.2387042561709654e-05, + "loss": 0.0196, + "step": 23410 + }, + { + "epoch": 1.3748972642949395, + "grad_norm": 1.1925442218780518, + "learning_rate": 2.2348502513598035e-05, + "loss": 0.0103, + "step": 23420 + }, + { + "epoch": 1.3754843254667137, + "grad_norm": 2.4145772457122803, + "learning_rate": 2.2309986118060784e-05, + "loss": 0.0187, + "step": 23430 + }, + { + "epoch": 1.3760713866384877, + "grad_norm": 1.7221524715423584, + "learning_rate": 2.227149340804412e-05, + "loss": 0.0118, + "step": 23440 + }, + { + "epoch": 1.3766584478102617, + "grad_norm": 0.2988492548465729, + "learning_rate": 2.2233024416473948e-05, + "loss": 0.0124, + "step": 23450 + }, + { + "epoch": 1.377245508982036, + "grad_norm": 1.0698436498641968, + "learning_rate": 2.2194579176255954e-05, + "loss": 0.0214, + "step": 23460 + }, + { + "epoch": 1.37783257015381, + "grad_norm": 0.5979824066162109, + "learning_rate": 2.215615772027546e-05, + "loss": 0.0147, + "step": 23470 + }, + { + "epoch": 1.378419631325584, + "grad_norm": 0.4322218596935272, + "learning_rate": 2.2117760081397506e-05, + "loss": 0.0318, + "step": 23480 + }, + { + "epoch": 1.3790066924973583, + "grad_norm": 0.012686857022345066, + "learning_rate": 2.2079386292466652e-05, + "loss": 0.0117, + "step": 23490 + }, + { + "epoch": 1.3795937536691323, + "grad_norm": 0.010577654466032982, + "learning_rate": 2.2041036386307173e-05, + "loss": 0.0051, + "step": 23500 + }, + { + "epoch": 1.3801808148409065, + "grad_norm": 
1.2929799556732178, + "learning_rate": 2.2002710395722805e-05, + "loss": 0.0105, + "step": 23510 + }, + { + "epoch": 1.3807678760126805, + "grad_norm": 0.059585027396678925, + "learning_rate": 2.196440835349695e-05, + "loss": 0.0108, + "step": 23520 + }, + { + "epoch": 1.3813549371844547, + "grad_norm": 0.1867661476135254, + "learning_rate": 2.192613029239241e-05, + "loss": 0.0102, + "step": 23530 + }, + { + "epoch": 1.3819419983562287, + "grad_norm": 0.35511553287506104, + "learning_rate": 2.188787624515156e-05, + "loss": 0.0054, + "step": 23540 + }, + { + "epoch": 1.3825290595280029, + "grad_norm": 2.3536529541015625, + "learning_rate": 2.184964624449617e-05, + "loss": 0.0071, + "step": 23550 + }, + { + "epoch": 1.3831161206997769, + "grad_norm": 0.15178915858268738, + "learning_rate": 2.181144032312747e-05, + "loss": 0.0091, + "step": 23560 + }, + { + "epoch": 1.383703181871551, + "grad_norm": 1.1093653440475464, + "learning_rate": 2.1773258513726098e-05, + "loss": 0.006, + "step": 23570 + }, + { + "epoch": 1.384290243043325, + "grad_norm": 0.16067326068878174, + "learning_rate": 2.173510084895206e-05, + "loss": 0.0151, + "step": 23580 + }, + { + "epoch": 1.3848773042150992, + "grad_norm": 1.6919939517974854, + "learning_rate": 2.1696967361444733e-05, + "loss": 0.0138, + "step": 23590 + }, + { + "epoch": 1.3854643653868732, + "grad_norm": 0.9082183837890625, + "learning_rate": 2.165885808382275e-05, + "loss": 0.0139, + "step": 23600 + }, + { + "epoch": 1.3860514265586474, + "grad_norm": 0.01875101402401924, + "learning_rate": 2.16207730486841e-05, + "loss": 0.0036, + "step": 23610 + }, + { + "epoch": 1.3866384877304214, + "grad_norm": 1.4035495519638062, + "learning_rate": 2.1582712288605994e-05, + "loss": 0.0097, + "step": 23620 + }, + { + "epoch": 1.3872255489021956, + "grad_norm": 0.23025992512702942, + "learning_rate": 2.1544675836144907e-05, + "loss": 0.0157, + "step": 23630 + }, + { + "epoch": 1.3878126100739698, + "grad_norm": 0.8536239862442017, + "learning_rate": 2.1506663723836502e-05, + "loss": 0.008, + "step": 23640 + }, + { + "epoch": 1.3883996712457438, + "grad_norm": 0.34466981887817383, + "learning_rate": 2.146867598419565e-05, + "loss": 0.0099, + "step": 23650 + }, + { + "epoch": 1.3889867324175178, + "grad_norm": 0.10123822093009949, + "learning_rate": 2.1430712649716328e-05, + "loss": 0.0083, + "step": 23660 + }, + { + "epoch": 1.389573793589292, + "grad_norm": 0.6164001822471619, + "learning_rate": 2.1392773752871685e-05, + "loss": 0.0082, + "step": 23670 + }, + { + "epoch": 1.3901608547610662, + "grad_norm": 0.12118466943502426, + "learning_rate": 2.13548593261139e-05, + "loss": 0.0064, + "step": 23680 + }, + { + "epoch": 1.3907479159328402, + "grad_norm": 1.2965704202651978, + "learning_rate": 2.1316969401874316e-05, + "loss": 0.0186, + "step": 23690 + }, + { + "epoch": 1.3913349771046142, + "grad_norm": 0.15385189652442932, + "learning_rate": 2.1279104012563266e-05, + "loss": 0.017, + "step": 23700 + }, + { + "epoch": 1.3919220382763884, + "grad_norm": 2.4474310874938965, + "learning_rate": 2.1241263190570065e-05, + "loss": 0.0103, + "step": 23710 + }, + { + "epoch": 1.3925090994481626, + "grad_norm": 2.3678784370422363, + "learning_rate": 2.120344696826308e-05, + "loss": 0.0143, + "step": 23720 + }, + { + "epoch": 1.3930961606199366, + "grad_norm": 2.978165626525879, + "learning_rate": 2.1165655377989557e-05, + "loss": 0.0144, + "step": 23730 + }, + { + "epoch": 1.3936832217917106, + "grad_norm": 1.1261640787124634, + "learning_rate": 2.112788845207574e-05, + 
"loss": 0.0099, + "step": 23740 + }, + { + "epoch": 1.3942702829634848, + "grad_norm": 2.0954904556274414, + "learning_rate": 2.1090146222826758e-05, + "loss": 0.0294, + "step": 23750 + }, + { + "epoch": 1.394857344135259, + "grad_norm": 2.7357470989227295, + "learning_rate": 2.1052428722526614e-05, + "loss": 0.0124, + "step": 23760 + }, + { + "epoch": 1.395444405307033, + "grad_norm": 0.5364775657653809, + "learning_rate": 2.1014735983438126e-05, + "loss": 0.0232, + "step": 23770 + }, + { + "epoch": 1.3960314664788072, + "grad_norm": 1.5720865726470947, + "learning_rate": 2.0977068037802994e-05, + "loss": 0.0194, + "step": 23780 + }, + { + "epoch": 1.3966185276505811, + "grad_norm": 0.08704604208469391, + "learning_rate": 2.093942491784164e-05, + "loss": 0.0071, + "step": 23790 + }, + { + "epoch": 1.3972055888223553, + "grad_norm": 0.5620222687721252, + "learning_rate": 2.090180665575329e-05, + "loss": 0.0057, + "step": 23800 + }, + { + "epoch": 1.3977926499941293, + "grad_norm": 0.7073603868484497, + "learning_rate": 2.0864213283715927e-05, + "loss": 0.0063, + "step": 23810 + }, + { + "epoch": 1.3983797111659035, + "grad_norm": 0.9657589793205261, + "learning_rate": 2.0826644833886215e-05, + "loss": 0.0118, + "step": 23820 + }, + { + "epoch": 1.3989667723376775, + "grad_norm": 2.1152420043945312, + "learning_rate": 2.0789101338399485e-05, + "loss": 0.0181, + "step": 23830 + }, + { + "epoch": 1.3995538335094517, + "grad_norm": 0.03757241368293762, + "learning_rate": 2.075158282936975e-05, + "loss": 0.0104, + "step": 23840 + }, + { + "epoch": 1.4001408946812257, + "grad_norm": 0.20126187801361084, + "learning_rate": 2.0714089338889658e-05, + "loss": 0.0084, + "step": 23850 + }, + { + "epoch": 1.400727955853, + "grad_norm": 2.3680176734924316, + "learning_rate": 2.067662089903039e-05, + "loss": 0.0177, + "step": 23860 + }, + { + "epoch": 1.401315017024774, + "grad_norm": 0.948137640953064, + "learning_rate": 2.063917754184182e-05, + "loss": 0.0171, + "step": 23870 + }, + { + "epoch": 1.401902078196548, + "grad_norm": 3.399534225463867, + "learning_rate": 2.0601759299352246e-05, + "loss": 0.0115, + "step": 23880 + }, + { + "epoch": 1.4024891393683223, + "grad_norm": 0.6425442695617676, + "learning_rate": 2.056436620356857e-05, + "loss": 0.0189, + "step": 23890 + }, + { + "epoch": 1.4030762005400963, + "grad_norm": 1.1237084865570068, + "learning_rate": 2.05269982864761e-05, + "loss": 0.014, + "step": 23900 + }, + { + "epoch": 1.4036632617118703, + "grad_norm": 0.026308121159672737, + "learning_rate": 2.048965558003869e-05, + "loss": 0.0045, + "step": 23910 + }, + { + "epoch": 1.4042503228836445, + "grad_norm": 1.1034443378448486, + "learning_rate": 2.0452338116198576e-05, + "loss": 0.0079, + "step": 23920 + }, + { + "epoch": 1.4048373840554187, + "grad_norm": 0.3975376486778259, + "learning_rate": 2.041504592687645e-05, + "loss": 0.0129, + "step": 23930 + }, + { + "epoch": 1.4054244452271927, + "grad_norm": 0.2686179578304291, + "learning_rate": 2.037777904397132e-05, + "loss": 0.0125, + "step": 23940 + }, + { + "epoch": 1.4060115063989667, + "grad_norm": 1.6290830373764038, + "learning_rate": 2.03405374993606e-05, + "loss": 0.0107, + "step": 23950 + }, + { + "epoch": 1.4065985675707409, + "grad_norm": 1.4251973628997803, + "learning_rate": 2.0303321324899992e-05, + "loss": 0.0101, + "step": 23960 + }, + { + "epoch": 1.407185628742515, + "grad_norm": 1.4456449747085571, + "learning_rate": 2.026613055242353e-05, + "loss": 0.0024, + "step": 23970 + }, + { + "epoch": 1.407772689914289, + 
"grad_norm": 0.5611891746520996, + "learning_rate": 2.0228965213743506e-05, + "loss": 0.0071, + "step": 23980 + }, + { + "epoch": 1.408359751086063, + "grad_norm": 1.505138635635376, + "learning_rate": 2.019182534065045e-05, + "loss": 0.0197, + "step": 23990 + }, + { + "epoch": 1.4089468122578372, + "grad_norm": 1.6551071405410767, + "learning_rate": 2.0154710964913143e-05, + "loss": 0.0102, + "step": 24000 + }, + { + "epoch": 1.4089468122578372, + "eval_loss": 0.5137258768081665, + "eval_runtime": 269.7363, + "eval_samples_per_second": 3.503, + "eval_steps_per_second": 3.503, + "step": 24000 + }, + { + "epoch": 1.4095338734296115, + "grad_norm": 2.8127646446228027, + "learning_rate": 2.0117622118278484e-05, + "loss": 0.0196, + "step": 24010 + }, + { + "epoch": 1.4101209346013854, + "grad_norm": 1.3914660215377808, + "learning_rate": 2.0080558832471625e-05, + "loss": 0.0083, + "step": 24020 + }, + { + "epoch": 1.4107079957731596, + "grad_norm": 0.17611494660377502, + "learning_rate": 2.0043521139195763e-05, + "loss": 0.0113, + "step": 24030 + }, + { + "epoch": 1.4112950569449336, + "grad_norm": 3.4943318367004395, + "learning_rate": 2.000650907013228e-05, + "loss": 0.02, + "step": 24040 + }, + { + "epoch": 1.4118821181167078, + "grad_norm": 0.7097033858299255, + "learning_rate": 1.9969522656940593e-05, + "loss": 0.0058, + "step": 24050 + }, + { + "epoch": 1.4124691792884818, + "grad_norm": 1.8921847343444824, + "learning_rate": 1.9932561931258213e-05, + "loss": 0.0122, + "step": 24060 + }, + { + "epoch": 1.413056240460256, + "grad_norm": 2.1589457988739014, + "learning_rate": 1.9895626924700618e-05, + "loss": 0.0113, + "step": 24070 + }, + { + "epoch": 1.41364330163203, + "grad_norm": 2.850785255432129, + "learning_rate": 1.985871766886136e-05, + "loss": 0.0088, + "step": 24080 + }, + { + "epoch": 1.4142303628038042, + "grad_norm": 0.8964874148368835, + "learning_rate": 1.982183419531188e-05, + "loss": 0.0119, + "step": 24090 + }, + { + "epoch": 1.4148174239755782, + "grad_norm": 0.07366857677698135, + "learning_rate": 1.978497653560167e-05, + "loss": 0.0287, + "step": 24100 + }, + { + "epoch": 1.4154044851473524, + "grad_norm": 0.5417582988739014, + "learning_rate": 1.9748144721258033e-05, + "loss": 0.0166, + "step": 24110 + }, + { + "epoch": 1.4159915463191264, + "grad_norm": 2.048412799835205, + "learning_rate": 1.9711338783786237e-05, + "loss": 0.0111, + "step": 24120 + }, + { + "epoch": 1.4165786074909006, + "grad_norm": 0.4081864058971405, + "learning_rate": 1.9674558754669413e-05, + "loss": 0.0213, + "step": 24130 + }, + { + "epoch": 1.4171656686626746, + "grad_norm": 0.44963279366493225, + "learning_rate": 1.963780466536847e-05, + "loss": 0.0044, + "step": 24140 + }, + { + "epoch": 1.4177527298344488, + "grad_norm": 0.10618758946657181, + "learning_rate": 1.960107654732219e-05, + "loss": 0.01, + "step": 24150 + }, + { + "epoch": 1.4183397910062228, + "grad_norm": 0.6178563237190247, + "learning_rate": 1.956437443194712e-05, + "loss": 0.0128, + "step": 24160 + }, + { + "epoch": 1.418926852177997, + "grad_norm": 0.036462098360061646, + "learning_rate": 1.952769835063758e-05, + "loss": 0.0069, + "step": 24170 + }, + { + "epoch": 1.4195139133497712, + "grad_norm": 2.371694326400757, + "learning_rate": 1.9491048334765566e-05, + "loss": 0.0108, + "step": 24180 + }, + { + "epoch": 1.4201009745215452, + "grad_norm": 0.01989881508052349, + "learning_rate": 1.9454424415680857e-05, + "loss": 0.0157, + "step": 24190 + }, + { + "epoch": 1.4206880356933191, + "grad_norm": 3.062727212905884, + 
"learning_rate": 1.9417826624710834e-05, + "loss": 0.0164, + "step": 24200 + }, + { + "epoch": 1.4212750968650933, + "grad_norm": 0.6262339949607849, + "learning_rate": 1.938125499316058e-05, + "loss": 0.0061, + "step": 24210 + }, + { + "epoch": 1.4218621580368676, + "grad_norm": 0.24331939220428467, + "learning_rate": 1.9344709552312783e-05, + "loss": 0.0113, + "step": 24220 + }, + { + "epoch": 1.4224492192086415, + "grad_norm": 0.49620071053504944, + "learning_rate": 1.930819033342775e-05, + "loss": 0.0331, + "step": 24230 + }, + { + "epoch": 1.4230362803804155, + "grad_norm": 0.1770806908607483, + "learning_rate": 1.9271697367743304e-05, + "loss": 0.0181, + "step": 24240 + }, + { + "epoch": 1.4236233415521897, + "grad_norm": 1.1065704822540283, + "learning_rate": 1.9235230686474864e-05, + "loss": 0.027, + "step": 24250 + }, + { + "epoch": 1.424210402723964, + "grad_norm": 0.7962722182273865, + "learning_rate": 1.9198790320815347e-05, + "loss": 0.0088, + "step": 24260 + }, + { + "epoch": 1.424797463895738, + "grad_norm": 1.1967579126358032, + "learning_rate": 1.916237630193516e-05, + "loss": 0.0072, + "step": 24270 + }, + { + "epoch": 1.425384525067512, + "grad_norm": 1.5202269554138184, + "learning_rate": 1.912598866098219e-05, + "loss": 0.013, + "step": 24280 + }, + { + "epoch": 1.425971586239286, + "grad_norm": 0.05356645584106445, + "learning_rate": 1.908962742908172e-05, + "loss": 0.0116, + "step": 24290 + }, + { + "epoch": 1.4265586474110603, + "grad_norm": 0.032990165054798126, + "learning_rate": 1.905329263733649e-05, + "loss": 0.0238, + "step": 24300 + }, + { + "epoch": 1.4271457085828343, + "grad_norm": 0.04489932209253311, + "learning_rate": 1.901698431682658e-05, + "loss": 0.009, + "step": 24310 + }, + { + "epoch": 1.4277327697546085, + "grad_norm": 1.7742148637771606, + "learning_rate": 1.8980702498609453e-05, + "loss": 0.0228, + "step": 24320 + }, + { + "epoch": 1.4283198309263825, + "grad_norm": 0.05356493964791298, + "learning_rate": 1.8944447213719914e-05, + "loss": 0.0095, + "step": 24330 + }, + { + "epoch": 1.4289068920981567, + "grad_norm": 2.683162212371826, + "learning_rate": 1.890821849317006e-05, + "loss": 0.0114, + "step": 24340 + }, + { + "epoch": 1.4294939532699307, + "grad_norm": 1.9921892881393433, + "learning_rate": 1.8872016367949237e-05, + "loss": 0.007, + "step": 24350 + }, + { + "epoch": 1.4300810144417049, + "grad_norm": 1.2890311479568481, + "learning_rate": 1.883584086902409e-05, + "loss": 0.0161, + "step": 24360 + }, + { + "epoch": 1.4306680756134789, + "grad_norm": 0.2484143227338791, + "learning_rate": 1.8799692027338446e-05, + "loss": 0.0164, + "step": 24370 + }, + { + "epoch": 1.431255136785253, + "grad_norm": 0.40440618991851807, + "learning_rate": 1.8763569873813354e-05, + "loss": 0.014, + "step": 24380 + }, + { + "epoch": 1.431842197957027, + "grad_norm": 0.03348202630877495, + "learning_rate": 1.8727474439347027e-05, + "loss": 0.0099, + "step": 24390 + }, + { + "epoch": 1.4324292591288013, + "grad_norm": 1.390295386314392, + "learning_rate": 1.8691405754814833e-05, + "loss": 0.0052, + "step": 24400 + }, + { + "epoch": 1.4330163203005752, + "grad_norm": 0.5037054419517517, + "learning_rate": 1.865536385106927e-05, + "loss": 0.0062, + "step": 24410 + }, + { + "epoch": 1.4336033814723494, + "grad_norm": 0.09097137302160263, + "learning_rate": 1.861934875893987e-05, + "loss": 0.0213, + "step": 24420 + }, + { + "epoch": 1.4341904426441237, + "grad_norm": 0.8542649149894714, + "learning_rate": 1.85833605092333e-05, + "loss": 0.0187, + "step": 24430 
+ }, + { + "epoch": 1.4347775038158976, + "grad_norm": 0.8738613128662109, + "learning_rate": 1.8547399132733195e-05, + "loss": 0.0084, + "step": 24440 + }, + { + "epoch": 1.4353645649876716, + "grad_norm": 0.20260785520076752, + "learning_rate": 1.8511464660200307e-05, + "loss": 0.0037, + "step": 24450 + }, + { + "epoch": 1.4359516261594458, + "grad_norm": 0.8802362680435181, + "learning_rate": 1.847555712237226e-05, + "loss": 0.0075, + "step": 24460 + }, + { + "epoch": 1.43653868733122, + "grad_norm": 2.442185401916504, + "learning_rate": 1.8439676549963737e-05, + "loss": 0.0173, + "step": 24470 + }, + { + "epoch": 1.437125748502994, + "grad_norm": 0.18825408816337585, + "learning_rate": 1.840382297366626e-05, + "loss": 0.013, + "step": 24480 + }, + { + "epoch": 1.437712809674768, + "grad_norm": 0.2534600794315338, + "learning_rate": 1.8367996424148326e-05, + "loss": 0.0078, + "step": 24490 + }, + { + "epoch": 1.4382998708465422, + "grad_norm": 0.049735572189092636, + "learning_rate": 1.8332196932055305e-05, + "loss": 0.0186, + "step": 24500 + }, + { + "epoch": 1.4388869320183164, + "grad_norm": 1.9720195531845093, + "learning_rate": 1.8296424528009425e-05, + "loss": 0.0241, + "step": 24510 + }, + { + "epoch": 1.4394739931900904, + "grad_norm": 0.06996825337409973, + "learning_rate": 1.8260679242609703e-05, + "loss": 0.0098, + "step": 24520 + }, + { + "epoch": 1.4400610543618644, + "grad_norm": 2.268746852874756, + "learning_rate": 1.8224961106432003e-05, + "loss": 0.0163, + "step": 24530 + }, + { + "epoch": 1.4406481155336386, + "grad_norm": 0.08396535366773605, + "learning_rate": 1.818927015002897e-05, + "loss": 0.0168, + "step": 24540 + }, + { + "epoch": 1.4412351767054128, + "grad_norm": 1.3653311729431152, + "learning_rate": 1.815360640392994e-05, + "loss": 0.0047, + "step": 24550 + }, + { + "epoch": 1.4418222378771868, + "grad_norm": 0.1528441607952118, + "learning_rate": 1.8117969898641042e-05, + "loss": 0.0039, + "step": 24560 + }, + { + "epoch": 1.442409299048961, + "grad_norm": 0.49751341342926025, + "learning_rate": 1.8082360664645065e-05, + "loss": 0.0115, + "step": 24570 + }, + { + "epoch": 1.442996360220735, + "grad_norm": 1.538364291191101, + "learning_rate": 1.8046778732401513e-05, + "loss": 0.0111, + "step": 24580 + }, + { + "epoch": 1.4435834213925092, + "grad_norm": 1.63816237449646, + "learning_rate": 1.8011224132346466e-05, + "loss": 0.0119, + "step": 24590 + }, + { + "epoch": 1.4441704825642832, + "grad_norm": 0.10889382660388947, + "learning_rate": 1.7975696894892698e-05, + "loss": 0.0159, + "step": 24600 + }, + { + "epoch": 1.4447575437360574, + "grad_norm": 0.11068199574947357, + "learning_rate": 1.7940197050429492e-05, + "loss": 0.0055, + "step": 24610 + }, + { + "epoch": 1.4453446049078313, + "grad_norm": 3.096181869506836, + "learning_rate": 1.7904724629322817e-05, + "loss": 0.0193, + "step": 24620 + }, + { + "epoch": 1.4459316660796055, + "grad_norm": 0.008653839118778706, + "learning_rate": 1.7869279661915077e-05, + "loss": 0.0062, + "step": 24630 + }, + { + "epoch": 1.4465187272513795, + "grad_norm": 0.18575824797153473, + "learning_rate": 1.7833862178525267e-05, + "loss": 0.0068, + "step": 24640 + }, + { + "epoch": 1.4471057884231537, + "grad_norm": 1.3114022016525269, + "learning_rate": 1.77984722094488e-05, + "loss": 0.0125, + "step": 24650 + }, + { + "epoch": 1.4476928495949277, + "grad_norm": 0.4251237213611603, + "learning_rate": 1.776310978495762e-05, + "loss": 0.0183, + "step": 24660 + }, + { + "epoch": 1.448279910766702, + "grad_norm": 
1.0775110721588135, + "learning_rate": 1.7727774935300078e-05, + "loss": 0.0099, + "step": 24670 + }, + { + "epoch": 1.448866971938476, + "grad_norm": 0.15549612045288086, + "learning_rate": 1.769246769070095e-05, + "loss": 0.0084, + "step": 24680 + }, + { + "epoch": 1.4494540331102501, + "grad_norm": 0.00905792135745287, + "learning_rate": 1.7657188081361402e-05, + "loss": 0.0075, + "step": 24690 + }, + { + "epoch": 1.450041094282024, + "grad_norm": 0.10242454707622528, + "learning_rate": 1.762193613745893e-05, + "loss": 0.0193, + "step": 24700 + }, + { + "epoch": 1.4506281554537983, + "grad_norm": 0.28730252385139465, + "learning_rate": 1.7586711889147407e-05, + "loss": 0.0103, + "step": 24710 + }, + { + "epoch": 1.4512152166255725, + "grad_norm": 0.29514777660369873, + "learning_rate": 1.7551515366556975e-05, + "loss": 0.0065, + "step": 24720 + }, + { + "epoch": 1.4518022777973465, + "grad_norm": 1.5189207792282104, + "learning_rate": 1.7516346599794092e-05, + "loss": 0.0145, + "step": 24730 + }, + { + "epoch": 1.4523893389691205, + "grad_norm": 0.1768386960029602, + "learning_rate": 1.748120561894147e-05, + "loss": 0.0096, + "step": 24740 + }, + { + "epoch": 1.4529764001408947, + "grad_norm": 2.094917058944702, + "learning_rate": 1.7446092454058066e-05, + "loss": 0.0223, + "step": 24750 + }, + { + "epoch": 1.453563461312669, + "grad_norm": 1.3558257818222046, + "learning_rate": 1.7411007135178987e-05, + "loss": 0.0319, + "step": 24760 + }, + { + "epoch": 1.4541505224844429, + "grad_norm": 0.13693387806415558, + "learning_rate": 1.7375949692315584e-05, + "loss": 0.0084, + "step": 24770 + }, + { + "epoch": 1.4547375836562169, + "grad_norm": 2.2927207946777344, + "learning_rate": 1.7340920155455327e-05, + "loss": 0.0136, + "step": 24780 + }, + { + "epoch": 1.455324644827991, + "grad_norm": 0.8705666065216064, + "learning_rate": 1.7305918554561824e-05, + "loss": 0.0227, + "step": 24790 + }, + { + "epoch": 1.4559117059997653, + "grad_norm": 1.1699771881103516, + "learning_rate": 1.72709449195748e-05, + "loss": 0.0098, + "step": 24800 + }, + { + "epoch": 1.4564987671715393, + "grad_norm": 0.04188217222690582, + "learning_rate": 1.7235999280410047e-05, + "loss": 0.0126, + "step": 24810 + }, + { + "epoch": 1.4570858283433132, + "grad_norm": 1.9177348613739014, + "learning_rate": 1.720108166695943e-05, + "loss": 0.0115, + "step": 24820 + }, + { + "epoch": 1.4576728895150874, + "grad_norm": 0.24380716681480408, + "learning_rate": 1.716619210909079e-05, + "loss": 0.0075, + "step": 24830 + }, + { + "epoch": 1.4582599506868617, + "grad_norm": 1.21640145778656, + "learning_rate": 1.7131330636648014e-05, + "loss": 0.0105, + "step": 24840 + }, + { + "epoch": 1.4588470118586356, + "grad_norm": 1.1568200588226318, + "learning_rate": 1.709649727945096e-05, + "loss": 0.011, + "step": 24850 + }, + { + "epoch": 1.4594340730304098, + "grad_norm": 2.6545119285583496, + "learning_rate": 1.7061692067295447e-05, + "loss": 0.0255, + "step": 24860 + }, + { + "epoch": 1.4600211342021838, + "grad_norm": 0.8861936330795288, + "learning_rate": 1.7026915029953168e-05, + "loss": 0.0162, + "step": 24870 + }, + { + "epoch": 1.460608195373958, + "grad_norm": 0.3077358901500702, + "learning_rate": 1.6992166197171787e-05, + "loss": 0.0199, + "step": 24880 + }, + { + "epoch": 1.461195256545732, + "grad_norm": 1.2515180110931396, + "learning_rate": 1.695744559867477e-05, + "loss": 0.0101, + "step": 24890 + }, + { + "epoch": 1.4617823177175062, + "grad_norm": 2.070460796356201, + "learning_rate": 1.692275326416149e-05, + 
"loss": 0.0188, + "step": 24900 + }, + { + "epoch": 1.4623693788892802, + "grad_norm": 1.6355761289596558, + "learning_rate": 1.6888089223307113e-05, + "loss": 0.0179, + "step": 24910 + }, + { + "epoch": 1.4629564400610544, + "grad_norm": 1.6046183109283447, + "learning_rate": 1.685345350576264e-05, + "loss": 0.0143, + "step": 24920 + }, + { + "epoch": 1.4635435012328284, + "grad_norm": 0.4053262770175934, + "learning_rate": 1.681884614115477e-05, + "loss": 0.0118, + "step": 24930 + }, + { + "epoch": 1.4641305624046026, + "grad_norm": 1.1307713985443115, + "learning_rate": 1.6784267159086026e-05, + "loss": 0.0169, + "step": 24940 + }, + { + "epoch": 1.4647176235763766, + "grad_norm": 0.4161185324192047, + "learning_rate": 1.6749716589134627e-05, + "loss": 0.0086, + "step": 24950 + }, + { + "epoch": 1.4653046847481508, + "grad_norm": 0.278931587934494, + "learning_rate": 1.6715194460854468e-05, + "loss": 0.0094, + "step": 24960 + }, + { + "epoch": 1.465891745919925, + "grad_norm": 3.9699208736419678, + "learning_rate": 1.6680700803775135e-05, + "loss": 0.0181, + "step": 24970 + }, + { + "epoch": 1.466478807091699, + "grad_norm": 0.42187848687171936, + "learning_rate": 1.6646235647401863e-05, + "loss": 0.0083, + "step": 24980 + }, + { + "epoch": 1.467065868263473, + "grad_norm": 2.352708101272583, + "learning_rate": 1.6611799021215525e-05, + "loss": 0.011, + "step": 24990 + }, + { + "epoch": 1.4676529294352472, + "grad_norm": 2.2313830852508545, + "learning_rate": 1.6577390954672523e-05, + "loss": 0.0093, + "step": 25000 + }, + { + "epoch": 1.4682399906070214, + "grad_norm": 0.010144324973225594, + "learning_rate": 1.6543011477204912e-05, + "loss": 0.007, + "step": 25010 + }, + { + "epoch": 1.4688270517787954, + "grad_norm": 0.9201246500015259, + "learning_rate": 1.650866061822021e-05, + "loss": 0.0086, + "step": 25020 + }, + { + "epoch": 1.4694141129505693, + "grad_norm": 0.18726426362991333, + "learning_rate": 1.6474338407101564e-05, + "loss": 0.0079, + "step": 25030 + }, + { + "epoch": 1.4700011741223435, + "grad_norm": 0.7087125778198242, + "learning_rate": 1.6440044873207494e-05, + "loss": 0.0071, + "step": 25040 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.518667459487915, + "learning_rate": 1.6405780045872092e-05, + "loss": 0.0035, + "step": 25050 + }, + { + "epoch": 1.4711752964658917, + "grad_norm": 3.9029223918914795, + "learning_rate": 1.637154395440482e-05, + "loss": 0.0125, + "step": 25060 + }, + { + "epoch": 1.4717623576376657, + "grad_norm": 0.14753930270671844, + "learning_rate": 1.63373366280906e-05, + "loss": 0.0036, + "step": 25070 + }, + { + "epoch": 1.47234941880944, + "grad_norm": 0.307205468416214, + "learning_rate": 1.6303158096189734e-05, + "loss": 0.0051, + "step": 25080 + }, + { + "epoch": 1.4729364799812141, + "grad_norm": 0.09061327576637268, + "learning_rate": 1.6269008387937917e-05, + "loss": 0.014, + "step": 25090 + }, + { + "epoch": 1.4735235411529881, + "grad_norm": 0.4797089099884033, + "learning_rate": 1.623488753254618e-05, + "loss": 0.0105, + "step": 25100 + }, + { + "epoch": 1.4741106023247623, + "grad_norm": 1.880612850189209, + "learning_rate": 1.620079555920082e-05, + "loss": 0.0132, + "step": 25110 + }, + { + "epoch": 1.4746976634965363, + "grad_norm": 0.2213641256093979, + "learning_rate": 1.6166732497063524e-05, + "loss": 0.0042, + "step": 25120 + }, + { + "epoch": 1.4752847246683105, + "grad_norm": 0.034901391714811325, + "learning_rate": 1.6132698375271164e-05, + "loss": 0.0098, + "step": 25130 + }, + { + "epoch": 1.4758717858400845, 
+ "grad_norm": 0.5932609438896179, + "learning_rate": 1.60986932229359e-05, + "loss": 0.0191, + "step": 25140 + }, + { + "epoch": 1.4764588470118587, + "grad_norm": 2.2570817470550537, + "learning_rate": 1.6064717069145114e-05, + "loss": 0.0068, + "step": 25150 + }, + { + "epoch": 1.4770459081836327, + "grad_norm": 0.00515064038336277, + "learning_rate": 1.6030769942961378e-05, + "loss": 0.0106, + "step": 25160 + }, + { + "epoch": 1.4776329693554069, + "grad_norm": 0.6780362129211426, + "learning_rate": 1.5996851873422403e-05, + "loss": 0.0225, + "step": 25170 + }, + { + "epoch": 1.4782200305271809, + "grad_norm": 0.8630103468894958, + "learning_rate": 1.5962962889541105e-05, + "loss": 0.0052, + "step": 25180 + }, + { + "epoch": 1.478807091698955, + "grad_norm": 1.4598804712295532, + "learning_rate": 1.592910302030544e-05, + "loss": 0.0139, + "step": 25190 + }, + { + "epoch": 1.479394152870729, + "grad_norm": 0.2872070074081421, + "learning_rate": 1.589527229467857e-05, + "loss": 0.0128, + "step": 25200 + }, + { + "epoch": 1.4799812140425033, + "grad_norm": 0.012290052138268948, + "learning_rate": 1.5861470741598618e-05, + "loss": 0.0195, + "step": 25210 + }, + { + "epoch": 1.4805682752142773, + "grad_norm": 2.608578681945801, + "learning_rate": 1.582769838997882e-05, + "loss": 0.005, + "step": 25220 + }, + { + "epoch": 1.4811553363860515, + "grad_norm": 0.5421618223190308, + "learning_rate": 1.579395526870742e-05, + "loss": 0.0126, + "step": 25230 + }, + { + "epoch": 1.4817423975578254, + "grad_norm": 0.3466702401638031, + "learning_rate": 1.576024140664764e-05, + "loss": 0.0157, + "step": 25240 + }, + { + "epoch": 1.4823294587295996, + "grad_norm": 0.07040505111217499, + "learning_rate": 1.5726556832637686e-05, + "loss": 0.0057, + "step": 25250 + }, + { + "epoch": 1.4829165199013739, + "grad_norm": 0.9358566999435425, + "learning_rate": 1.5692901575490725e-05, + "loss": 0.0082, + "step": 25260 + }, + { + "epoch": 1.4835035810731478, + "grad_norm": 1.6973823308944702, + "learning_rate": 1.5659275663994842e-05, + "loss": 0.0191, + "step": 25270 + }, + { + "epoch": 1.4840906422449218, + "grad_norm": 0.9288491010665894, + "learning_rate": 1.562567912691299e-05, + "loss": 0.0254, + "step": 25280 + }, + { + "epoch": 1.484677703416696, + "grad_norm": 0.14372193813323975, + "learning_rate": 1.5592111992983042e-05, + "loss": 0.0084, + "step": 25290 + }, + { + "epoch": 1.4852647645884702, + "grad_norm": 0.3273877799510956, + "learning_rate": 1.5558574290917676e-05, + "loss": 0.0085, + "step": 25300 + }, + { + "epoch": 1.4858518257602442, + "grad_norm": 0.9547355771064758, + "learning_rate": 1.5525066049404425e-05, + "loss": 0.0205, + "step": 25310 + }, + { + "epoch": 1.4864388869320182, + "grad_norm": 0.22144687175750732, + "learning_rate": 1.5491587297105616e-05, + "loss": 0.0133, + "step": 25320 + }, + { + "epoch": 1.4870259481037924, + "grad_norm": 0.22204560041427612, + "learning_rate": 1.5458138062658362e-05, + "loss": 0.0088, + "step": 25330 + }, + { + "epoch": 1.4876130092755666, + "grad_norm": 2.982510805130005, + "learning_rate": 1.5424718374674478e-05, + "loss": 0.009, + "step": 25340 + }, + { + "epoch": 1.4882000704473406, + "grad_norm": 3.0312764644622803, + "learning_rate": 1.539132826174058e-05, + "loss": 0.0106, + "step": 25350 + }, + { + "epoch": 1.4887871316191146, + "grad_norm": 2.7994961738586426, + "learning_rate": 1.5357967752417908e-05, + "loss": 0.0096, + "step": 25360 + }, + { + "epoch": 1.4893741927908888, + "grad_norm": 1.3636599779129028, + "learning_rate": 
1.5324636875242425e-05, + "loss": 0.006, + "step": 25370 + }, + { + "epoch": 1.489961253962663, + "grad_norm": 1.8163387775421143, + "learning_rate": 1.5291335658724787e-05, + "loss": 0.0137, + "step": 25380 + }, + { + "epoch": 1.490548315134437, + "grad_norm": 0.03518426790833473, + "learning_rate": 1.5258064131350175e-05, + "loss": 0.0134, + "step": 25390 + }, + { + "epoch": 1.4911353763062112, + "grad_norm": 2.0565054416656494, + "learning_rate": 1.522482232157848e-05, + "loss": 0.0084, + "step": 25400 + }, + { + "epoch": 1.4917224374779852, + "grad_norm": 0.9762770533561707, + "learning_rate": 1.519161025784408e-05, + "loss": 0.0046, + "step": 25410 + }, + { + "epoch": 1.4923094986497594, + "grad_norm": 0.5391456484794617, + "learning_rate": 1.5158427968555977e-05, + "loss": 0.0114, + "step": 25420 + }, + { + "epoch": 1.4928965598215334, + "grad_norm": 1.3391125202178955, + "learning_rate": 1.5125275482097678e-05, + "loss": 0.01, + "step": 25430 + }, + { + "epoch": 1.4934836209933076, + "grad_norm": 1.2342358827590942, + "learning_rate": 1.5092152826827216e-05, + "loss": 0.0178, + "step": 25440 + }, + { + "epoch": 1.4940706821650815, + "grad_norm": 0.8180807828903198, + "learning_rate": 1.5059060031077066e-05, + "loss": 0.004, + "step": 25450 + }, + { + "epoch": 1.4946577433368557, + "grad_norm": 0.21036776900291443, + "learning_rate": 1.5025997123154211e-05, + "loss": 0.0095, + "step": 25460 + }, + { + "epoch": 1.4952448045086297, + "grad_norm": 0.1561666578054428, + "learning_rate": 1.4992964131340014e-05, + "loss": 0.0156, + "step": 25470 + }, + { + "epoch": 1.495831865680404, + "grad_norm": 0.1823035627603531, + "learning_rate": 1.49599610838903e-05, + "loss": 0.0056, + "step": 25480 + }, + { + "epoch": 1.496418926852178, + "grad_norm": 2.063511610031128, + "learning_rate": 1.4926988009035258e-05, + "loss": 0.0224, + "step": 25490 + }, + { + "epoch": 1.4970059880239521, + "grad_norm": 0.5182443261146545, + "learning_rate": 1.4894044934979435e-05, + "loss": 0.009, + "step": 25500 + }, + { + "epoch": 1.4975930491957261, + "grad_norm": 0.39108607172966003, + "learning_rate": 1.4861131889901741e-05, + "loss": 0.0146, + "step": 25510 + }, + { + "epoch": 1.4981801103675003, + "grad_norm": 1.6948814392089844, + "learning_rate": 1.4828248901955349e-05, + "loss": 0.0073, + "step": 25520 + }, + { + "epoch": 1.4987671715392743, + "grad_norm": 1.4525011777877808, + "learning_rate": 1.4795395999267785e-05, + "loss": 0.0336, + "step": 25530 + }, + { + "epoch": 1.4993542327110485, + "grad_norm": 0.678019642829895, + "learning_rate": 1.4762573209940761e-05, + "loss": 0.0019, + "step": 25540 + }, + { + "epoch": 1.4999412938828227, + "grad_norm": 2.3262994289398193, + "learning_rate": 1.4729780562050333e-05, + "loss": 0.0154, + "step": 25550 + }, + { + "epoch": 1.5005283550545967, + "grad_norm": 1.7300747632980347, + "learning_rate": 1.469701808364668e-05, + "loss": 0.0068, + "step": 25560 + }, + { + "epoch": 1.5011154162263707, + "grad_norm": 0.633484959602356, + "learning_rate": 1.466428580275424e-05, + "loss": 0.0082, + "step": 25570 + }, + { + "epoch": 1.5017024773981449, + "grad_norm": 1.1872471570968628, + "learning_rate": 1.4631583747371568e-05, + "loss": 0.0154, + "step": 25580 + }, + { + "epoch": 1.502289538569919, + "grad_norm": 0.06257125735282898, + "learning_rate": 1.459891194547141e-05, + "loss": 0.003, + "step": 25590 + }, + { + "epoch": 1.502876599741693, + "grad_norm": 0.31022894382476807, + "learning_rate": 1.4566270425000605e-05, + "loss": 0.0063, + "step": 25600 + }, + { + 
"epoch": 1.503463660913467, + "grad_norm": 0.8578490614891052, + "learning_rate": 1.4533659213880124e-05, + "loss": 0.0115, + "step": 25610 + }, + { + "epoch": 1.5040507220852413, + "grad_norm": 0.19193144142627716, + "learning_rate": 1.4501078340004953e-05, + "loss": 0.0183, + "step": 25620 + }, + { + "epoch": 1.5046377832570155, + "grad_norm": 0.2767280042171478, + "learning_rate": 1.4468527831244188e-05, + "loss": 0.0096, + "step": 25630 + }, + { + "epoch": 1.5052248444287895, + "grad_norm": 1.4123883247375488, + "learning_rate": 1.4436007715440908e-05, + "loss": 0.0125, + "step": 25640 + }, + { + "epoch": 1.5058119056005634, + "grad_norm": 1.5527786016464233, + "learning_rate": 1.4403518020412221e-05, + "loss": 0.0068, + "step": 25650 + }, + { + "epoch": 1.5063989667723376, + "grad_norm": 1.0498160123825073, + "learning_rate": 1.4371058773949204e-05, + "loss": 0.0115, + "step": 25660 + }, + { + "epoch": 1.5069860279441119, + "grad_norm": 0.02221166342496872, + "learning_rate": 1.4338630003816889e-05, + "loss": 0.0126, + "step": 25670 + }, + { + "epoch": 1.5075730891158858, + "grad_norm": 1.3004329204559326, + "learning_rate": 1.430623173775426e-05, + "loss": 0.0104, + "step": 25680 + }, + { + "epoch": 1.5081601502876598, + "grad_norm": 1.0908926725387573, + "learning_rate": 1.4273864003474157e-05, + "loss": 0.0156, + "step": 25690 + }, + { + "epoch": 1.508747211459434, + "grad_norm": 2.631584882736206, + "learning_rate": 1.4241526828663366e-05, + "loss": 0.0122, + "step": 25700 + }, + { + "epoch": 1.5093342726312082, + "grad_norm": 0.7961341738700867, + "learning_rate": 1.4209220240982468e-05, + "loss": 0.009, + "step": 25710 + }, + { + "epoch": 1.5099213338029824, + "grad_norm": 1.766007423400879, + "learning_rate": 1.4176944268065928e-05, + "loss": 0.0054, + "step": 25720 + }, + { + "epoch": 1.5105083949747564, + "grad_norm": 1.0545703172683716, + "learning_rate": 1.4144698937522022e-05, + "loss": 0.0061, + "step": 25730 + }, + { + "epoch": 1.5110954561465304, + "grad_norm": 3.5069901943206787, + "learning_rate": 1.4112484276932808e-05, + "loss": 0.0075, + "step": 25740 + }, + { + "epoch": 1.5116825173183046, + "grad_norm": 0.42936596274375916, + "learning_rate": 1.4080300313854072e-05, + "loss": 0.0111, + "step": 25750 + }, + { + "epoch": 1.5122695784900788, + "grad_norm": 0.36388495564460754, + "learning_rate": 1.404814707581542e-05, + "loss": 0.0114, + "step": 25760 + }, + { + "epoch": 1.5128566396618528, + "grad_norm": 2.5523219108581543, + "learning_rate": 1.401602459032007e-05, + "loss": 0.0246, + "step": 25770 + }, + { + "epoch": 1.5134437008336268, + "grad_norm": 0.4812352657318115, + "learning_rate": 1.3983932884845046e-05, + "loss": 0.0157, + "step": 25780 + }, + { + "epoch": 1.514030762005401, + "grad_norm": 0.3735608458518982, + "learning_rate": 1.3951871986840997e-05, + "loss": 0.0203, + "step": 25790 + }, + { + "epoch": 1.5146178231771752, + "grad_norm": 1.1593714952468872, + "learning_rate": 1.3919841923732186e-05, + "loss": 0.0171, + "step": 25800 + }, + { + "epoch": 1.5152048843489492, + "grad_norm": 0.2563406825065613, + "learning_rate": 1.3887842722916555e-05, + "loss": 0.0152, + "step": 25810 + }, + { + "epoch": 1.5157919455207232, + "grad_norm": 1.1414568424224854, + "learning_rate": 1.3855874411765602e-05, + "loss": 0.0171, + "step": 25820 + }, + { + "epoch": 1.5163790066924974, + "grad_norm": 0.131794273853302, + "learning_rate": 1.3823937017624427e-05, + "loss": 0.0171, + "step": 25830 + }, + { + "epoch": 1.5169660678642716, + "grad_norm": 0.80946284532547, + 
"learning_rate": 1.3792030567811687e-05, + "loss": 0.0156, + "step": 25840 + }, + { + "epoch": 1.5175531290360456, + "grad_norm": 0.5427300930023193, + "learning_rate": 1.3760155089619575e-05, + "loss": 0.0089, + "step": 25850 + }, + { + "epoch": 1.5181401902078195, + "grad_norm": 1.4580811262130737, + "learning_rate": 1.3728310610313755e-05, + "loss": 0.0057, + "step": 25860 + }, + { + "epoch": 1.5187272513795937, + "grad_norm": 1.8565434217453003, + "learning_rate": 1.369649715713342e-05, + "loss": 0.0146, + "step": 25870 + }, + { + "epoch": 1.519314312551368, + "grad_norm": 5.821869850158691, + "learning_rate": 1.366471475729118e-05, + "loss": 0.0155, + "step": 25880 + }, + { + "epoch": 1.519901373723142, + "grad_norm": 0.17705050110816956, + "learning_rate": 1.3632963437973122e-05, + "loss": 0.0072, + "step": 25890 + }, + { + "epoch": 1.520488434894916, + "grad_norm": 1.7221649885177612, + "learning_rate": 1.3601243226338734e-05, + "loss": 0.0125, + "step": 25900 + }, + { + "epoch": 1.5210754960666901, + "grad_norm": 1.9367401599884033, + "learning_rate": 1.3569554149520886e-05, + "loss": 0.0069, + "step": 25910 + }, + { + "epoch": 1.5216625572384643, + "grad_norm": 0.06342757493257523, + "learning_rate": 1.3537896234625835e-05, + "loss": 0.0084, + "step": 25920 + }, + { + "epoch": 1.5222496184102383, + "grad_norm": 0.30179139971733093, + "learning_rate": 1.350626950873315e-05, + "loss": 0.0094, + "step": 25930 + }, + { + "epoch": 1.5228366795820123, + "grad_norm": 2.0123047828674316, + "learning_rate": 1.3474673998895764e-05, + "loss": 0.0165, + "step": 25940 + }, + { + "epoch": 1.5234237407537865, + "grad_norm": 0.5998191833496094, + "learning_rate": 1.3443109732139841e-05, + "loss": 0.0136, + "step": 25950 + }, + { + "epoch": 1.5240108019255607, + "grad_norm": 0.24649828672409058, + "learning_rate": 1.3411576735464925e-05, + "loss": 0.0187, + "step": 25960 + }, + { + "epoch": 1.5245978630973347, + "grad_norm": 0.7969292998313904, + "learning_rate": 1.3380075035843714e-05, + "loss": 0.0121, + "step": 25970 + }, + { + "epoch": 1.5251849242691087, + "grad_norm": 1.5419803857803345, + "learning_rate": 1.3348604660222198e-05, + "loss": 0.0135, + "step": 25980 + }, + { + "epoch": 1.5257719854408829, + "grad_norm": 0.00748492730781436, + "learning_rate": 1.3317165635519518e-05, + "loss": 0.0103, + "step": 25990 + }, + { + "epoch": 1.526359046612657, + "grad_norm": 0.43452805280685425, + "learning_rate": 1.3285757988628045e-05, + "loss": 0.0115, + "step": 26000 + }, + { + "epoch": 1.5269461077844313, + "grad_norm": 0.8184288740158081, + "learning_rate": 1.3254381746413291e-05, + "loss": 0.0067, + "step": 26010 + }, + { + "epoch": 1.5275331689562053, + "grad_norm": 0.6614131331443787, + "learning_rate": 1.3223036935713923e-05, + "loss": 0.0078, + "step": 26020 + }, + { + "epoch": 1.5281202301279793, + "grad_norm": 0.08678902685642242, + "learning_rate": 1.3191723583341681e-05, + "loss": 0.0125, + "step": 26030 + }, + { + "epoch": 1.5287072912997535, + "grad_norm": 0.2708864212036133, + "learning_rate": 1.3160441716081446e-05, + "loss": 0.007, + "step": 26040 + }, + { + "epoch": 1.5292943524715277, + "grad_norm": 0.5324260592460632, + "learning_rate": 1.3129191360691112e-05, + "loss": 0.0231, + "step": 26050 + }, + { + "epoch": 1.5298814136433017, + "grad_norm": 1.7831233739852905, + "learning_rate": 1.309797254390167e-05, + "loss": 0.0121, + "step": 26060 + }, + { + "epoch": 1.5304684748150756, + "grad_norm": 0.4662072956562042, + "learning_rate": 1.306678529241711e-05, + "loss": 0.006, + 
"step": 26070 + }, + { + "epoch": 1.5310555359868498, + "grad_norm": 1.2265037298202515, + "learning_rate": 1.3035629632914426e-05, + "loss": 0.0073, + "step": 26080 + }, + { + "epoch": 1.531642597158624, + "grad_norm": 1.3609946966171265, + "learning_rate": 1.3004505592043598e-05, + "loss": 0.024, + "step": 26090 + }, + { + "epoch": 1.532229658330398, + "grad_norm": 0.26812970638275146, + "learning_rate": 1.2973413196427519e-05, + "loss": 0.0099, + "step": 26100 + }, + { + "epoch": 1.532816719502172, + "grad_norm": 0.2851371467113495, + "learning_rate": 1.2942352472662078e-05, + "loss": 0.0066, + "step": 26110 + }, + { + "epoch": 1.5334037806739462, + "grad_norm": 0.5341777801513672, + "learning_rate": 1.2911323447315993e-05, + "loss": 0.0081, + "step": 26120 + }, + { + "epoch": 1.5339908418457204, + "grad_norm": 0.4923510253429413, + "learning_rate": 1.288032614693097e-05, + "loss": 0.0149, + "step": 26130 + }, + { + "epoch": 1.5345779030174944, + "grad_norm": 1.253936767578125, + "learning_rate": 1.2849360598021471e-05, + "loss": 0.0217, + "step": 26140 + }, + { + "epoch": 1.5351649641892684, + "grad_norm": 0.6040740609169006, + "learning_rate": 1.2818426827074886e-05, + "loss": 0.0151, + "step": 26150 + }, + { + "epoch": 1.5357520253610426, + "grad_norm": 1.0527094602584839, + "learning_rate": 1.2787524860551352e-05, + "loss": 0.0109, + "step": 26160 + }, + { + "epoch": 1.5363390865328168, + "grad_norm": 1.06759512424469, + "learning_rate": 1.2756654724883849e-05, + "loss": 0.0146, + "step": 26170 + }, + { + "epoch": 1.5369261477045908, + "grad_norm": 0.2116648107767105, + "learning_rate": 1.2725816446478112e-05, + "loss": 0.0165, + "step": 26180 + }, + { + "epoch": 1.5375132088763648, + "grad_norm": 1.721382975578308, + "learning_rate": 1.2695010051712625e-05, + "loss": 0.0075, + "step": 26190 + }, + { + "epoch": 1.538100270048139, + "grad_norm": 0.4924405515193939, + "learning_rate": 1.2664235566938632e-05, + "loss": 0.0054, + "step": 26200 + }, + { + "epoch": 1.5386873312199132, + "grad_norm": 0.15361934900283813, + "learning_rate": 1.2633493018480009e-05, + "loss": 0.0089, + "step": 26210 + }, + { + "epoch": 1.5392743923916872, + "grad_norm": 0.08735856413841248, + "learning_rate": 1.2602782432633387e-05, + "loss": 0.0065, + "step": 26220 + }, + { + "epoch": 1.5398614535634612, + "grad_norm": 1.3857438564300537, + "learning_rate": 1.2572103835668004e-05, + "loss": 0.0225, + "step": 26230 + }, + { + "epoch": 1.5404485147352354, + "grad_norm": 0.17827914655208588, + "learning_rate": 1.2541457253825773e-05, + "loss": 0.01, + "step": 26240 + }, + { + "epoch": 1.5410355759070096, + "grad_norm": 0.9525066614151001, + "learning_rate": 1.2510842713321208e-05, + "loss": 0.0094, + "step": 26250 + }, + { + "epoch": 1.5416226370787838, + "grad_norm": 0.39047691226005554, + "learning_rate": 1.248026024034143e-05, + "loss": 0.0144, + "step": 26260 + }, + { + "epoch": 1.5422096982505578, + "grad_norm": 0.30978187918663025, + "learning_rate": 1.2449709861046077e-05, + "loss": 0.0208, + "step": 26270 + }, + { + "epoch": 1.5427967594223317, + "grad_norm": 0.14176547527313232, + "learning_rate": 1.2419191601567409e-05, + "loss": 0.0139, + "step": 26280 + }, + { + "epoch": 1.543383820594106, + "grad_norm": 0.9704849123954773, + "learning_rate": 1.238870548801015e-05, + "loss": 0.0057, + "step": 26290 + }, + { + "epoch": 1.5439708817658802, + "grad_norm": 0.018119262531399727, + "learning_rate": 1.235825154645156e-05, + "loss": 0.0054, + "step": 26300 + }, + { + "epoch": 1.5445579429376541, + 
"grad_norm": 0.06399316340684891, + "learning_rate": 1.232782980294137e-05, + "loss": 0.0195, + "step": 26310 + }, + { + "epoch": 1.5451450041094281, + "grad_norm": 1.6416302919387817, + "learning_rate": 1.2297440283501793e-05, + "loss": 0.0067, + "step": 26320 + }, + { + "epoch": 1.5457320652812023, + "grad_norm": 0.5732679963111877, + "learning_rate": 1.2267083014127424e-05, + "loss": 0.0145, + "step": 26330 + }, + { + "epoch": 1.5463191264529765, + "grad_norm": 0.017688609659671783, + "learning_rate": 1.2236758020785316e-05, + "loss": 0.0137, + "step": 26340 + }, + { + "epoch": 1.5469061876247505, + "grad_norm": 0.17886310815811157, + "learning_rate": 1.2206465329414901e-05, + "loss": 0.0088, + "step": 26350 + }, + { + "epoch": 1.5474932487965245, + "grad_norm": 0.06556381285190582, + "learning_rate": 1.217620496592799e-05, + "loss": 0.0111, + "step": 26360 + }, + { + "epoch": 1.5480803099682987, + "grad_norm": 0.24476167559623718, + "learning_rate": 1.2145976956208738e-05, + "loss": 0.0127, + "step": 26370 + }, + { + "epoch": 1.548667371140073, + "grad_norm": 0.030447032302618027, + "learning_rate": 1.211578132611359e-05, + "loss": 0.0161, + "step": 26380 + }, + { + "epoch": 1.549254432311847, + "grad_norm": 0.9274646043777466, + "learning_rate": 1.2085618101471363e-05, + "loss": 0.0102, + "step": 26390 + }, + { + "epoch": 1.5498414934836209, + "grad_norm": 1.1633837223052979, + "learning_rate": 1.205548730808308e-05, + "loss": 0.0079, + "step": 26400 + }, + { + "epoch": 1.550428554655395, + "grad_norm": 2.194584846496582, + "learning_rate": 1.2025388971722068e-05, + "loss": 0.0159, + "step": 26410 + }, + { + "epoch": 1.5510156158271693, + "grad_norm": 3.140812397003174, + "learning_rate": 1.1995323118133894e-05, + "loss": 0.0096, + "step": 26420 + }, + { + "epoch": 1.5516026769989433, + "grad_norm": 2.263437509536743, + "learning_rate": 1.196528977303633e-05, + "loss": 0.0164, + "step": 26430 + }, + { + "epoch": 1.5521897381707173, + "grad_norm": 0.17677821218967438, + "learning_rate": 1.1935288962119317e-05, + "loss": 0.0059, + "step": 26440 + }, + { + "epoch": 1.5527767993424915, + "grad_norm": 1.0963406562805176, + "learning_rate": 1.190532071104502e-05, + "loss": 0.0071, + "step": 26450 + }, + { + "epoch": 1.5533638605142657, + "grad_norm": 0.3778134882450104, + "learning_rate": 1.1875385045447679e-05, + "loss": 0.0072, + "step": 26460 + }, + { + "epoch": 1.5539509216860397, + "grad_norm": 0.20560497045516968, + "learning_rate": 1.1845481990933716e-05, + "loss": 0.0074, + "step": 26470 + }, + { + "epoch": 1.5545379828578136, + "grad_norm": 0.2553133964538574, + "learning_rate": 1.1815611573081681e-05, + "loss": 0.0086, + "step": 26480 + }, + { + "epoch": 1.5551250440295878, + "grad_norm": 0.2380114644765854, + "learning_rate": 1.1785773817442137e-05, + "loss": 0.0108, + "step": 26490 + }, + { + "epoch": 1.555712105201362, + "grad_norm": 0.011128624901175499, + "learning_rate": 1.1755968749537754e-05, + "loss": 0.0047, + "step": 26500 + }, + { + "epoch": 1.556299166373136, + "grad_norm": 1.5508345365524292, + "learning_rate": 1.172619639486322e-05, + "loss": 0.0313, + "step": 26510 + }, + { + "epoch": 1.55688622754491, + "grad_norm": 0.7508226633071899, + "learning_rate": 1.1696456778885262e-05, + "loss": 0.0278, + "step": 26520 + }, + { + "epoch": 1.5574732887166842, + "grad_norm": 0.5488082766532898, + "learning_rate": 1.166674992704258e-05, + "loss": 0.0208, + "step": 26530 + }, + { + "epoch": 1.5580603498884584, + "grad_norm": 1.9381704330444336, + "learning_rate": 
1.163707586474589e-05, + "loss": 0.0197, + "step": 26540 + }, + { + "epoch": 1.5586474110602326, + "grad_norm": 1.983081340789795, + "learning_rate": 1.1607434617377788e-05, + "loss": 0.0111, + "step": 26550 + }, + { + "epoch": 1.5592344722320066, + "grad_norm": 0.04777294397354126, + "learning_rate": 1.157782621029288e-05, + "loss": 0.0108, + "step": 26560 + }, + { + "epoch": 1.5598215334037806, + "grad_norm": 0.15073652565479279, + "learning_rate": 1.1548250668817612e-05, + "loss": 0.0103, + "step": 26570 + }, + { + "epoch": 1.5604085945755548, + "grad_norm": 2.098924398422241, + "learning_rate": 1.1518708018250369e-05, + "loss": 0.0057, + "step": 26580 + }, + { + "epoch": 1.560995655747329, + "grad_norm": 0.07834240049123764, + "learning_rate": 1.148919828386138e-05, + "loss": 0.0047, + "step": 26590 + }, + { + "epoch": 1.561582716919103, + "grad_norm": 2.849283456802368, + "learning_rate": 1.1459721490892732e-05, + "loss": 0.011, + "step": 26600 + }, + { + "epoch": 1.562169778090877, + "grad_norm": 1.0505820512771606, + "learning_rate": 1.1430277664558298e-05, + "loss": 0.0088, + "step": 26610 + }, + { + "epoch": 1.5627568392626512, + "grad_norm": 5.632779598236084, + "learning_rate": 1.1400866830043789e-05, + "loss": 0.0185, + "step": 26620 + }, + { + "epoch": 1.5633439004344254, + "grad_norm": 0.5144931674003601, + "learning_rate": 1.1371489012506698e-05, + "loss": 0.0077, + "step": 26630 + }, + { + "epoch": 1.5639309616061994, + "grad_norm": 1.3698240518569946, + "learning_rate": 1.1342144237076236e-05, + "loss": 0.0123, + "step": 26640 + }, + { + "epoch": 1.5645180227779734, + "grad_norm": 0.12052831053733826, + "learning_rate": 1.131283252885338e-05, + "loss": 0.006, + "step": 26650 + }, + { + "epoch": 1.5651050839497476, + "grad_norm": 0.18448656797409058, + "learning_rate": 1.1283553912910833e-05, + "loss": 0.0124, + "step": 26660 + }, + { + "epoch": 1.5656921451215218, + "grad_norm": 1.7134002447128296, + "learning_rate": 1.1254308414292975e-05, + "loss": 0.0027, + "step": 26670 + }, + { + "epoch": 1.5662792062932958, + "grad_norm": 1.3325996398925781, + "learning_rate": 1.1225096058015844e-05, + "loss": 0.0113, + "step": 26680 + }, + { + "epoch": 1.5668662674650697, + "grad_norm": 2.014275074005127, + "learning_rate": 1.1195916869067159e-05, + "loss": 0.0119, + "step": 26690 + }, + { + "epoch": 1.567453328636844, + "grad_norm": 0.4575527310371399, + "learning_rate": 1.1166770872406223e-05, + "loss": 0.0121, + "step": 26700 + }, + { + "epoch": 1.5680403898086182, + "grad_norm": 2.249737501144409, + "learning_rate": 1.1137658092964026e-05, + "loss": 0.0115, + "step": 26710 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 0.14120075106620789, + "learning_rate": 1.1108578555643056e-05, + "loss": 0.0219, + "step": 26720 + }, + { + "epoch": 1.5692145121521661, + "grad_norm": 1.2550865411758423, + "learning_rate": 1.1079532285317435e-05, + "loss": 0.0047, + "step": 26730 + }, + { + "epoch": 1.5698015733239403, + "grad_norm": 2.1121370792388916, + "learning_rate": 1.1050519306832768e-05, + "loss": 0.0133, + "step": 26740 + }, + { + "epoch": 1.5703886344957145, + "grad_norm": 1.764227032661438, + "learning_rate": 1.1021539645006229e-05, + "loss": 0.0103, + "step": 26750 + }, + { + "epoch": 1.5709756956674885, + "grad_norm": 0.009054204449057579, + "learning_rate": 1.0992593324626488e-05, + "loss": 0.0108, + "step": 26760 + }, + { + "epoch": 1.5715627568392625, + "grad_norm": 1.0275087356567383, + "learning_rate": 1.0963680370453678e-05, + "loss": 0.0069, + "step": 26770 + }, + { 
+ "epoch": 1.5721498180110367, + "grad_norm": 1.6089822053909302, + "learning_rate": 1.0934800807219415e-05, + "loss": 0.0104, + "step": 26780 + }, + { + "epoch": 1.572736879182811, + "grad_norm": 2.2903213500976562, + "learning_rate": 1.090595465962671e-05, + "loss": 0.0386, + "step": 26790 + }, + { + "epoch": 1.573323940354585, + "grad_norm": 1.5878002643585205, + "learning_rate": 1.0877141952350046e-05, + "loss": 0.0048, + "step": 26800 + }, + { + "epoch": 1.573911001526359, + "grad_norm": 1.2515116930007935, + "learning_rate": 1.0848362710035253e-05, + "loss": 0.0077, + "step": 26810 + }, + { + "epoch": 1.574498062698133, + "grad_norm": 0.23582957684993744, + "learning_rate": 1.0819616957299567e-05, + "loss": 0.0118, + "step": 26820 + }, + { + "epoch": 1.5750851238699073, + "grad_norm": 1.5625358819961548, + "learning_rate": 1.0790904718731565e-05, + "loss": 0.0165, + "step": 26830 + }, + { + "epoch": 1.5756721850416815, + "grad_norm": 2.9153378009796143, + "learning_rate": 1.0762226018891175e-05, + "loss": 0.0073, + "step": 26840 + }, + { + "epoch": 1.5762592462134555, + "grad_norm": 0.6501205563545227, + "learning_rate": 1.0733580882309591e-05, + "loss": 0.0176, + "step": 26850 + }, + { + "epoch": 1.5768463073852295, + "grad_norm": 1.5556343793869019, + "learning_rate": 1.0704969333489362e-05, + "loss": 0.0057, + "step": 26860 + }, + { + "epoch": 1.5774333685570037, + "grad_norm": 0.45732811093330383, + "learning_rate": 1.0676391396904229e-05, + "loss": 0.005, + "step": 26870 + }, + { + "epoch": 1.5780204297287779, + "grad_norm": 0.17563748359680176, + "learning_rate": 1.0647847096999276e-05, + "loss": 0.0117, + "step": 26880 + }, + { + "epoch": 1.5786074909005519, + "grad_norm": 1.6784812211990356, + "learning_rate": 1.0619336458190726e-05, + "loss": 0.0228, + "step": 26890 + }, + { + "epoch": 1.5791945520723258, + "grad_norm": 1.1329185962677002, + "learning_rate": 1.0590859504866058e-05, + "loss": 0.0052, + "step": 26900 + }, + { + "epoch": 1.5797816132441, + "grad_norm": 1.3960808515548706, + "learning_rate": 1.0562416261383945e-05, + "loss": 0.017, + "step": 26910 + }, + { + "epoch": 1.5803686744158743, + "grad_norm": 2.32549786567688, + "learning_rate": 1.0534006752074171e-05, + "loss": 0.0096, + "step": 26920 + }, + { + "epoch": 1.5809557355876482, + "grad_norm": 2.072798490524292, + "learning_rate": 1.050563100123772e-05, + "loss": 0.0111, + "step": 26930 + }, + { + "epoch": 1.5815427967594222, + "grad_norm": 0.689721941947937, + "learning_rate": 1.0477289033146675e-05, + "loss": 0.007, + "step": 26940 + }, + { + "epoch": 1.5821298579311964, + "grad_norm": 0.16267193853855133, + "learning_rate": 1.0448980872044239e-05, + "loss": 0.0038, + "step": 26950 + }, + { + "epoch": 1.5827169191029706, + "grad_norm": 0.16729170083999634, + "learning_rate": 1.0420706542144664e-05, + "loss": 0.0032, + "step": 26960 + }, + { + "epoch": 1.5833039802747446, + "grad_norm": 1.0334956645965576, + "learning_rate": 1.03924660676333e-05, + "loss": 0.0188, + "step": 26970 + }, + { + "epoch": 1.5838910414465186, + "grad_norm": 1.1257108449935913, + "learning_rate": 1.0364259472666504e-05, + "loss": 0.0032, + "step": 26980 + }, + { + "epoch": 1.5844781026182928, + "grad_norm": 1.1537364721298218, + "learning_rate": 1.0336086781371679e-05, + "loss": 0.0074, + "step": 26990 + }, + { + "epoch": 1.585065163790067, + "grad_norm": 0.2987788915634155, + "learning_rate": 1.030794801784722e-05, + "loss": 0.009, + "step": 27000 + }, + { + "epoch": 1.585065163790067, + "eval_loss": 0.5134320259094238, + 
"eval_runtime": 269.7262, + "eval_samples_per_second": 3.504, + "eval_steps_per_second": 3.504, + "step": 27000 + }, + { + "epoch": 1.585652224961841, + "grad_norm": 2.068195104598999, + "learning_rate": 1.0279843206162509e-05, + "loss": 0.0073, + "step": 27010 + }, + { + "epoch": 1.586239286133615, + "grad_norm": 0.1688322126865387, + "learning_rate": 1.0251772370357854e-05, + "loss": 0.0084, + "step": 27020 + }, + { + "epoch": 1.5868263473053892, + "grad_norm": 0.8966720700263977, + "learning_rate": 1.022373553444454e-05, + "loss": 0.0047, + "step": 27030 + }, + { + "epoch": 1.5874134084771634, + "grad_norm": 0.21050378680229187, + "learning_rate": 1.019573272240476e-05, + "loss": 0.0067, + "step": 27040 + }, + { + "epoch": 1.5880004696489374, + "grad_norm": 0.38126567006111145, + "learning_rate": 1.0167763958191556e-05, + "loss": 0.0047, + "step": 27050 + }, + { + "epoch": 1.5885875308207114, + "grad_norm": 0.613869309425354, + "learning_rate": 1.013982926572895e-05, + "loss": 0.0052, + "step": 27060 + }, + { + "epoch": 1.5891745919924856, + "grad_norm": 0.20027974247932434, + "learning_rate": 1.0111928668911702e-05, + "loss": 0.0135, + "step": 27070 + }, + { + "epoch": 1.5897616531642598, + "grad_norm": 1.9748679399490356, + "learning_rate": 1.0084062191605498e-05, + "loss": 0.0102, + "step": 27080 + }, + { + "epoch": 1.590348714336034, + "grad_norm": 0.007248531095683575, + "learning_rate": 1.0056229857646771e-05, + "loss": 0.0036, + "step": 27090 + }, + { + "epoch": 1.590935775507808, + "grad_norm": 0.06447096914052963, + "learning_rate": 1.0028431690842793e-05, + "loss": 0.0135, + "step": 27100 + }, + { + "epoch": 1.591522836679582, + "grad_norm": 0.022004619240760803, + "learning_rate": 1.00006677149716e-05, + "loss": 0.005, + "step": 27110 + }, + { + "epoch": 1.5921098978513561, + "grad_norm": 1.1582072973251343, + "learning_rate": 9.972937953781986e-06, + "loss": 0.0196, + "step": 27120 + }, + { + "epoch": 1.5926969590231304, + "grad_norm": 0.04341103509068489, + "learning_rate": 9.945242430993446e-06, + "loss": 0.0064, + "step": 27130 + }, + { + "epoch": 1.5932840201949043, + "grad_norm": 1.97472083568573, + "learning_rate": 9.917581170296241e-06, + "loss": 0.0096, + "step": 27140 + }, + { + "epoch": 1.5938710813666783, + "grad_norm": 0.1625327169895172, + "learning_rate": 9.889954195351276e-06, + "loss": 0.0107, + "step": 27150 + }, + { + "epoch": 1.5944581425384525, + "grad_norm": 0.14234226942062378, + "learning_rate": 9.862361529790149e-06, + "loss": 0.0049, + "step": 27160 + }, + { + "epoch": 1.5950452037102267, + "grad_norm": 0.9036107063293457, + "learning_rate": 9.83480319721512e-06, + "loss": 0.0255, + "step": 27170 + }, + { + "epoch": 1.5956322648820007, + "grad_norm": 0.05637304484844208, + "learning_rate": 9.807279221199067e-06, + "loss": 0.0081, + "step": 27180 + }, + { + "epoch": 1.5962193260537747, + "grad_norm": 0.07056168466806412, + "learning_rate": 9.7797896252855e-06, + "loss": 0.0013, + "step": 27190 + }, + { + "epoch": 1.596806387225549, + "grad_norm": 0.43077000975608826, + "learning_rate": 9.752334432988485e-06, + "loss": 0.018, + "step": 27200 + }, + { + "epoch": 1.5973934483973231, + "grad_norm": 2.8605754375457764, + "learning_rate": 9.724913667792696e-06, + "loss": 0.0121, + "step": 27210 + }, + { + "epoch": 1.597980509569097, + "grad_norm": 0.9835911393165588, + "learning_rate": 9.69752735315333e-06, + "loss": 0.0054, + "step": 27220 + }, + { + "epoch": 1.598567570740871, + "grad_norm": 0.20682783424854279, + "learning_rate": 9.670175512496127e-06, + 
"loss": 0.0092, + "step": 27230 + }, + { + "epoch": 1.5991546319126453, + "grad_norm": 3.0163230895996094, + "learning_rate": 9.642858169217356e-06, + "loss": 0.0055, + "step": 27240 + }, + { + "epoch": 1.5997416930844195, + "grad_norm": 0.4688768982887268, + "learning_rate": 9.615575346683758e-06, + "loss": 0.0058, + "step": 27250 + }, + { + "epoch": 1.6003287542561935, + "grad_norm": 1.2958694696426392, + "learning_rate": 9.588327068232539e-06, + "loss": 0.0105, + "step": 27260 + }, + { + "epoch": 1.6009158154279675, + "grad_norm": 0.6211140751838684, + "learning_rate": 9.561113357171386e-06, + "loss": 0.004, + "step": 27270 + }, + { + "epoch": 1.6015028765997417, + "grad_norm": 0.8817264437675476, + "learning_rate": 9.533934236778364e-06, + "loss": 0.0069, + "step": 27280 + }, + { + "epoch": 1.6020899377715159, + "grad_norm": 2.1518731117248535, + "learning_rate": 9.506789730302034e-06, + "loss": 0.0185, + "step": 27290 + }, + { + "epoch": 1.6026769989432899, + "grad_norm": 0.5215532779693604, + "learning_rate": 9.47967986096126e-06, + "loss": 0.0059, + "step": 27300 + }, + { + "epoch": 1.6032640601150638, + "grad_norm": 1.0806931257247925, + "learning_rate": 9.45260465194533e-06, + "loss": 0.0105, + "step": 27310 + }, + { + "epoch": 1.603851121286838, + "grad_norm": 0.0196642205119133, + "learning_rate": 9.425564126413889e-06, + "loss": 0.01, + "step": 27320 + }, + { + "epoch": 1.6044381824586122, + "grad_norm": 0.8702532052993774, + "learning_rate": 9.398558307496868e-06, + "loss": 0.0118, + "step": 27330 + }, + { + "epoch": 1.6050252436303862, + "grad_norm": 0.4726807177066803, + "learning_rate": 9.37158721829456e-06, + "loss": 0.0089, + "step": 27340 + }, + { + "epoch": 1.6056123048021604, + "grad_norm": 1.2431391477584839, + "learning_rate": 9.344650881877515e-06, + "loss": 0.0097, + "step": 27350 + }, + { + "epoch": 1.6061993659739344, + "grad_norm": 0.019758054986596107, + "learning_rate": 9.317749321286601e-06, + "loss": 0.0048, + "step": 27360 + }, + { + "epoch": 1.6067864271457086, + "grad_norm": 0.9880382418632507, + "learning_rate": 9.290882559532877e-06, + "loss": 0.0176, + "step": 27370 + }, + { + "epoch": 1.6073734883174828, + "grad_norm": 1.799951434135437, + "learning_rate": 9.264050619597697e-06, + "loss": 0.0076, + "step": 27380 + }, + { + "epoch": 1.6079605494892568, + "grad_norm": 1.8501851558685303, + "learning_rate": 9.23725352443257e-06, + "loss": 0.0118, + "step": 27390 + }, + { + "epoch": 1.6085476106610308, + "grad_norm": 1.811416745185852, + "learning_rate": 9.210491296959256e-06, + "loss": 0.0201, + "step": 27400 + }, + { + "epoch": 1.609134671832805, + "grad_norm": 0.3321267366409302, + "learning_rate": 9.183763960069652e-06, + "loss": 0.0119, + "step": 27410 + }, + { + "epoch": 1.6097217330045792, + "grad_norm": 1.2952860593795776, + "learning_rate": 9.157071536625838e-06, + "loss": 0.0061, + "step": 27420 + }, + { + "epoch": 1.6103087941763532, + "grad_norm": 1.584791660308838, + "learning_rate": 9.130414049459995e-06, + "loss": 0.0125, + "step": 27430 + }, + { + "epoch": 1.6108958553481272, + "grad_norm": 0.06565705686807632, + "learning_rate": 9.103791521374444e-06, + "loss": 0.0113, + "step": 27440 + }, + { + "epoch": 1.6114829165199014, + "grad_norm": 0.5617465972900391, + "learning_rate": 9.077203975141607e-06, + "loss": 0.0099, + "step": 27450 + }, + { + "epoch": 1.6120699776916756, + "grad_norm": 1.095957636833191, + "learning_rate": 9.050651433503965e-06, + "loss": 0.0071, + "step": 27460 + }, + { + "epoch": 1.6126570388634496, + "grad_norm": 
0.17165639996528625, + "learning_rate": 9.024133919174082e-06, + "loss": 0.0049, + "step": 27470 + }, + { + "epoch": 1.6132441000352236, + "grad_norm": 1.358237385749817, + "learning_rate": 8.997651454834527e-06, + "loss": 0.0142, + "step": 27480 + }, + { + "epoch": 1.6138311612069978, + "grad_norm": 2.021362543106079, + "learning_rate": 8.971204063137916e-06, + "loss": 0.0141, + "step": 27490 + }, + { + "epoch": 1.614418222378772, + "grad_norm": 0.008475619368255138, + "learning_rate": 8.944791766706844e-06, + "loss": 0.0047, + "step": 27500 + }, + { + "epoch": 1.615005283550546, + "grad_norm": 0.6366258859634399, + "learning_rate": 8.918414588133894e-06, + "loss": 0.0141, + "step": 27510 + }, + { + "epoch": 1.61559234472232, + "grad_norm": 0.19768893718719482, + "learning_rate": 8.892072549981622e-06, + "loss": 0.0116, + "step": 27520 + }, + { + "epoch": 1.6161794058940941, + "grad_norm": 0.5427848100662231, + "learning_rate": 8.865765674782528e-06, + "loss": 0.0142, + "step": 27530 + }, + { + "epoch": 1.6167664670658684, + "grad_norm": 1.6507068872451782, + "learning_rate": 8.839493985038988e-06, + "loss": 0.0151, + "step": 27540 + }, + { + "epoch": 1.6173535282376423, + "grad_norm": 0.026372963562607765, + "learning_rate": 8.81325750322335e-06, + "loss": 0.0111, + "step": 27550 + }, + { + "epoch": 1.6179405894094163, + "grad_norm": 0.4352494776248932, + "learning_rate": 8.78705625177777e-06, + "loss": 0.0059, + "step": 27560 + }, + { + "epoch": 1.6185276505811905, + "grad_norm": 0.27398261427879333, + "learning_rate": 8.76089025311434e-06, + "loss": 0.0036, + "step": 27570 + }, + { + "epoch": 1.6191147117529647, + "grad_norm": 1.2692209482192993, + "learning_rate": 8.734759529614956e-06, + "loss": 0.0082, + "step": 27580 + }, + { + "epoch": 1.6197017729247387, + "grad_norm": 0.17485210299491882, + "learning_rate": 8.708664103631354e-06, + "loss": 0.0089, + "step": 27590 + }, + { + "epoch": 1.6202888340965127, + "grad_norm": 2.3155698776245117, + "learning_rate": 8.682603997485078e-06, + "loss": 0.0124, + "step": 27600 + }, + { + "epoch": 1.620875895268287, + "grad_norm": 1.551294207572937, + "learning_rate": 8.656579233467443e-06, + "loss": 0.0139, + "step": 27610 + }, + { + "epoch": 1.621462956440061, + "grad_norm": 0.10387564450502396, + "learning_rate": 8.63058983383957e-06, + "loss": 0.0125, + "step": 27620 + }, + { + "epoch": 1.6220500176118353, + "grad_norm": 0.4853443205356598, + "learning_rate": 8.604635820832258e-06, + "loss": 0.0141, + "step": 27630 + }, + { + "epoch": 1.6226370787836093, + "grad_norm": 0.006195012014359236, + "learning_rate": 8.578717216646143e-06, + "loss": 0.0068, + "step": 27640 + }, + { + "epoch": 1.6232241399553833, + "grad_norm": 0.3198970556259155, + "learning_rate": 8.55283404345148e-06, + "loss": 0.0132, + "step": 27650 + }, + { + "epoch": 1.6238112011271575, + "grad_norm": 2.3255319595336914, + "learning_rate": 8.526986323388263e-06, + "loss": 0.0254, + "step": 27660 + }, + { + "epoch": 1.6243982622989317, + "grad_norm": 0.06995700299739838, + "learning_rate": 8.501174078566143e-06, + "loss": 0.022, + "step": 27670 + }, + { + "epoch": 1.6249853234707057, + "grad_norm": 0.23026344180107117, + "learning_rate": 8.475397331064427e-06, + "loss": 0.0163, + "step": 27680 + }, + { + "epoch": 1.6255723846424797, + "grad_norm": 0.19775334000587463, + "learning_rate": 8.449656102932075e-06, + "loss": 0.0026, + "step": 27690 + }, + { + "epoch": 1.6261594458142539, + "grad_norm": 0.04183727502822876, + "learning_rate": 8.42395041618766e-06, + "loss": 0.0027, + 
"step": 27700 + }, + { + "epoch": 1.626746506986028, + "grad_norm": 0.1087692603468895, + "learning_rate": 8.398280292819321e-06, + "loss": 0.0065, + "step": 27710 + }, + { + "epoch": 1.627333568157802, + "grad_norm": 1.2450783252716064, + "learning_rate": 8.37264575478482e-06, + "loss": 0.025, + "step": 27720 + }, + { + "epoch": 1.627920629329576, + "grad_norm": 1.7207362651824951, + "learning_rate": 8.347046824011467e-06, + "loss": 0.0188, + "step": 27730 + }, + { + "epoch": 1.6285076905013502, + "grad_norm": 0.5218789577484131, + "learning_rate": 8.321483522396084e-06, + "loss": 0.0072, + "step": 27740 + }, + { + "epoch": 1.6290947516731245, + "grad_norm": 0.4121147692203522, + "learning_rate": 8.295955871805061e-06, + "loss": 0.0144, + "step": 27750 + }, + { + "epoch": 1.6296818128448984, + "grad_norm": 0.06316855549812317, + "learning_rate": 8.27046389407427e-06, + "loss": 0.0072, + "step": 27760 + }, + { + "epoch": 1.6302688740166724, + "grad_norm": 0.19479545950889587, + "learning_rate": 8.245007611009087e-06, + "loss": 0.0108, + "step": 27770 + }, + { + "epoch": 1.6308559351884466, + "grad_norm": 0.3882885277271271, + "learning_rate": 8.219587044384307e-06, + "loss": 0.0165, + "step": 27780 + }, + { + "epoch": 1.6314429963602208, + "grad_norm": 0.05304262042045593, + "learning_rate": 8.194202215944247e-06, + "loss": 0.0105, + "step": 27790 + }, + { + "epoch": 1.6320300575319948, + "grad_norm": 2.962719440460205, + "learning_rate": 8.168853147402566e-06, + "loss": 0.0072, + "step": 27800 + }, + { + "epoch": 1.6326171187037688, + "grad_norm": 2.3828048706054688, + "learning_rate": 8.14353986044244e-06, + "loss": 0.0046, + "step": 27810 + }, + { + "epoch": 1.633204179875543, + "grad_norm": 0.10702111572027206, + "learning_rate": 8.11826237671634e-06, + "loss": 0.0064, + "step": 27820 + }, + { + "epoch": 1.6337912410473172, + "grad_norm": 0.43127015233039856, + "learning_rate": 8.093020717846177e-06, + "loss": 0.0058, + "step": 27830 + }, + { + "epoch": 1.6343783022190912, + "grad_norm": 0.07288127392530441, + "learning_rate": 8.067814905423176e-06, + "loss": 0.0185, + "step": 27840 + }, + { + "epoch": 1.6349653633908652, + "grad_norm": 0.13415968418121338, + "learning_rate": 8.042644961007927e-06, + "loss": 0.0048, + "step": 27850 + }, + { + "epoch": 1.6355524245626394, + "grad_norm": 0.9306196570396423, + "learning_rate": 8.017510906130332e-06, + "loss": 0.018, + "step": 27860 + }, + { + "epoch": 1.6361394857344136, + "grad_norm": 0.050222061574459076, + "learning_rate": 7.992412762289592e-06, + "loss": 0.0035, + "step": 27870 + }, + { + "epoch": 1.6367265469061876, + "grad_norm": 0.007918142713606358, + "learning_rate": 7.967350550954201e-06, + "loss": 0.0087, + "step": 27880 + }, + { + "epoch": 1.6373136080779616, + "grad_norm": 0.5580129623413086, + "learning_rate": 7.942324293561876e-06, + "loss": 0.0045, + "step": 27890 + }, + { + "epoch": 1.6379006692497358, + "grad_norm": 0.10453560203313828, + "learning_rate": 7.917334011519646e-06, + "loss": 0.0049, + "step": 27900 + }, + { + "epoch": 1.63848773042151, + "grad_norm": 1.7366032600402832, + "learning_rate": 7.892379726203702e-06, + "loss": 0.0136, + "step": 27910 + }, + { + "epoch": 1.6390747915932842, + "grad_norm": 0.05727103352546692, + "learning_rate": 7.86746145895948e-06, + "loss": 0.0127, + "step": 27920 + }, + { + "epoch": 1.6396618527650582, + "grad_norm": 1.9890248775482178, + "learning_rate": 7.84257923110161e-06, + "loss": 0.019, + "step": 27930 + }, + { + "epoch": 1.6402489139368321, + "grad_norm": 
0.25739482045173645, + "learning_rate": 7.81773306391389e-06, + "loss": 0.0089, + "step": 27940 + }, + { + "epoch": 1.6408359751086063, + "grad_norm": 0.23821336030960083, + "learning_rate": 7.792922978649248e-06, + "loss": 0.0057, + "step": 27950 + }, + { + "epoch": 1.6414230362803806, + "grad_norm": 0.026927923783659935, + "learning_rate": 7.768148996529789e-06, + "loss": 0.0091, + "step": 27960 + }, + { + "epoch": 1.6420100974521545, + "grad_norm": 0.07286237180233002, + "learning_rate": 7.743411138746686e-06, + "loss": 0.0041, + "step": 27970 + }, + { + "epoch": 1.6425971586239285, + "grad_norm": 0.7503321766853333, + "learning_rate": 7.718709426460258e-06, + "loss": 0.0142, + "step": 27980 + }, + { + "epoch": 1.6431842197957027, + "grad_norm": 0.8196347951889038, + "learning_rate": 7.694043880799889e-06, + "loss": 0.0103, + "step": 27990 + }, + { + "epoch": 1.643771280967477, + "grad_norm": 0.20796020328998566, + "learning_rate": 7.669414522864028e-06, + "loss": 0.0209, + "step": 28000 + }, + { + "epoch": 1.644358342139251, + "grad_norm": 0.14734028279781342, + "learning_rate": 7.644821373720168e-06, + "loss": 0.0077, + "step": 28010 + }, + { + "epoch": 1.644945403311025, + "grad_norm": 0.03402048721909523, + "learning_rate": 7.620264454404819e-06, + "loss": 0.0189, + "step": 28020 + }, + { + "epoch": 1.645532464482799, + "grad_norm": 0.07426488399505615, + "learning_rate": 7.595743785923515e-06, + "loss": 0.0134, + "step": 28030 + }, + { + "epoch": 1.6461195256545733, + "grad_norm": 0.16101300716400146, + "learning_rate": 7.571259389250779e-06, + "loss": 0.009, + "step": 28040 + }, + { + "epoch": 1.6467065868263473, + "grad_norm": 1.456742525100708, + "learning_rate": 7.546811285330119e-06, + "loss": 0.0126, + "step": 28050 + }, + { + "epoch": 1.6472936479981213, + "grad_norm": 0.04736657440662384, + "learning_rate": 7.522399495073962e-06, + "loss": 0.0101, + "step": 28060 + }, + { + "epoch": 1.6478807091698955, + "grad_norm": 0.043693553656339645, + "learning_rate": 7.4980240393637216e-06, + "loss": 0.0077, + "step": 28070 + }, + { + "epoch": 1.6484677703416697, + "grad_norm": 1.484106421470642, + "learning_rate": 7.473684939049685e-06, + "loss": 0.0156, + "step": 28080 + }, + { + "epoch": 1.6490548315134437, + "grad_norm": 0.08472202718257904, + "learning_rate": 7.449382214951073e-06, + "loss": 0.0081, + "step": 28090 + }, + { + "epoch": 1.6496418926852177, + "grad_norm": 0.06400839239358902, + "learning_rate": 7.425115887855983e-06, + "loss": 0.0129, + "step": 28100 + }, + { + "epoch": 1.6502289538569919, + "grad_norm": 0.02137874811887741, + "learning_rate": 7.400885978521393e-06, + "loss": 0.0042, + "step": 28110 + }, + { + "epoch": 1.650816015028766, + "grad_norm": 0.3920609951019287, + "learning_rate": 7.376692507673083e-06, + "loss": 0.0171, + "step": 28120 + }, + { + "epoch": 1.65140307620054, + "grad_norm": 2.4398319721221924, + "learning_rate": 7.3525354960057195e-06, + "loss": 0.0076, + "step": 28130 + }, + { + "epoch": 1.651990137372314, + "grad_norm": 0.4373569190502167, + "learning_rate": 7.328414964182756e-06, + "loss": 0.0069, + "step": 28140 + }, + { + "epoch": 1.6525771985440882, + "grad_norm": 2.4919962882995605, + "learning_rate": 7.304330932836434e-06, + "loss": 0.0142, + "step": 28150 + }, + { + "epoch": 1.6531642597158624, + "grad_norm": 0.48688334226608276, + "learning_rate": 7.2802834225677905e-06, + "loss": 0.011, + "step": 28160 + }, + { + "epoch": 1.6537513208876367, + "grad_norm": 1.0665478706359863, + "learning_rate": 7.256272453946616e-06, + "loss": 
0.0178, + "step": 28170 + }, + { + "epoch": 1.6543383820594106, + "grad_norm": 0.024595128372311592, + "learning_rate": 7.23229804751146e-06, + "loss": 0.0144, + "step": 28180 + }, + { + "epoch": 1.6549254432311846, + "grad_norm": 0.3491171598434448, + "learning_rate": 7.208360223769555e-06, + "loss": 0.0084, + "step": 28190 + }, + { + "epoch": 1.6555125044029588, + "grad_norm": 1.6058762073516846, + "learning_rate": 7.184459003196892e-06, + "loss": 0.0079, + "step": 28200 + }, + { + "epoch": 1.656099565574733, + "grad_norm": 0.01545692328363657, + "learning_rate": 7.1605944062380916e-06, + "loss": 0.0065, + "step": 28210 + }, + { + "epoch": 1.656686626746507, + "grad_norm": 2.165834903717041, + "learning_rate": 7.136766453306537e-06, + "loss": 0.0122, + "step": 28220 + }, + { + "epoch": 1.657273687918281, + "grad_norm": 0.23048318922519684, + "learning_rate": 7.112975164784175e-06, + "loss": 0.0078, + "step": 28230 + }, + { + "epoch": 1.6578607490900552, + "grad_norm": 1.7704715728759766, + "learning_rate": 7.089220561021648e-06, + "loss": 0.011, + "step": 28240 + }, + { + "epoch": 1.6584478102618294, + "grad_norm": 0.11009885370731354, + "learning_rate": 7.065502662338186e-06, + "loss": 0.0023, + "step": 28250 + }, + { + "epoch": 1.6590348714336034, + "grad_norm": 0.6416253447532654, + "learning_rate": 7.041821489021639e-06, + "loss": 0.0097, + "step": 28260 + }, + { + "epoch": 1.6596219326053774, + "grad_norm": 1.033013939857483, + "learning_rate": 7.018177061328451e-06, + "loss": 0.0175, + "step": 28270 + }, + { + "epoch": 1.6602089937771516, + "grad_norm": 4.628103733062744, + "learning_rate": 6.994569399483614e-06, + "loss": 0.0085, + "step": 28280 + }, + { + "epoch": 1.6607960549489258, + "grad_norm": 0.5595627427101135, + "learning_rate": 6.9709985236807e-06, + "loss": 0.006, + "step": 28290 + }, + { + "epoch": 1.6613831161206998, + "grad_norm": 1.0199556350708008, + "learning_rate": 6.947464454081765e-06, + "loss": 0.0033, + "step": 28300 + }, + { + "epoch": 1.6619701772924738, + "grad_norm": 0.22090382874011993, + "learning_rate": 6.923967210817439e-06, + "loss": 0.008, + "step": 28310 + }, + { + "epoch": 1.662557238464248, + "grad_norm": 0.017477478832006454, + "learning_rate": 6.900506813986806e-06, + "loss": 0.0088, + "step": 28320 + }, + { + "epoch": 1.6631442996360222, + "grad_norm": 1.4774725437164307, + "learning_rate": 6.8770832836574596e-06, + "loss": 0.0091, + "step": 28330 + }, + { + "epoch": 1.6637313608077962, + "grad_norm": 0.5444540977478027, + "learning_rate": 6.853696639865448e-06, + "loss": 0.0054, + "step": 28340 + }, + { + "epoch": 1.6643184219795701, + "grad_norm": 1.9333986043930054, + "learning_rate": 6.830346902615281e-06, + "loss": 0.0223, + "step": 28350 + }, + { + "epoch": 1.6649054831513443, + "grad_norm": 1.0381861925125122, + "learning_rate": 6.807034091879866e-06, + "loss": 0.0101, + "step": 28360 + }, + { + "epoch": 1.6654925443231186, + "grad_norm": 0.144987091422081, + "learning_rate": 6.783758227600567e-06, + "loss": 0.007, + "step": 28370 + }, + { + "epoch": 1.6660796054948925, + "grad_norm": 0.34596773982048035, + "learning_rate": 6.760519329687099e-06, + "loss": 0.0064, + "step": 28380 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.10488889366388321, + "learning_rate": 6.737317418017608e-06, + "loss": 0.0112, + "step": 28390 + }, + { + "epoch": 1.6672537278384407, + "grad_norm": 1.5192146301269531, + "learning_rate": 6.7141525124385595e-06, + "loss": 0.0049, + "step": 28400 + }, + { + "epoch": 1.667840789010215, + "grad_norm": 
0.5066091418266296, + "learning_rate": 6.6910246327647864e-06, + "loss": 0.0137, + "step": 28410 + }, + { + "epoch": 1.668427850181989, + "grad_norm": 0.49690625071525574, + "learning_rate": 6.667933798779447e-06, + "loss": 0.0034, + "step": 28420 + }, + { + "epoch": 1.669014911353763, + "grad_norm": 0.579086184501648, + "learning_rate": 6.644880030234002e-06, + "loss": 0.0054, + "step": 28430 + }, + { + "epoch": 1.669601972525537, + "grad_norm": 0.1151830330491066, + "learning_rate": 6.621863346848217e-06, + "loss": 0.0055, + "step": 28440 + }, + { + "epoch": 1.6701890336973113, + "grad_norm": 0.4732290804386139, + "learning_rate": 6.598883768310133e-06, + "loss": 0.0145, + "step": 28450 + }, + { + "epoch": 1.6707760948690855, + "grad_norm": 0.1567830741405487, + "learning_rate": 6.575941314276063e-06, + "loss": 0.0038, + "step": 28460 + }, + { + "epoch": 1.6713631560408595, + "grad_norm": 0.04848311096429825, + "learning_rate": 6.553036004370533e-06, + "loss": 0.0074, + "step": 28470 + }, + { + "epoch": 1.6719502172126335, + "grad_norm": 0.13380120694637299, + "learning_rate": 6.530167858186342e-06, + "loss": 0.0083, + "step": 28480 + }, + { + "epoch": 1.6725372783844077, + "grad_norm": 0.18455231189727783, + "learning_rate": 6.507336895284449e-06, + "loss": 0.0047, + "step": 28490 + }, + { + "epoch": 1.673124339556182, + "grad_norm": 0.643710732460022, + "learning_rate": 6.484543135194043e-06, + "loss": 0.0094, + "step": 28500 + }, + { + "epoch": 1.6737114007279559, + "grad_norm": 0.23137906193733215, + "learning_rate": 6.461786597412489e-06, + "loss": 0.0031, + "step": 28510 + }, + { + "epoch": 1.6742984618997299, + "grad_norm": 2.435506582260132, + "learning_rate": 6.439067301405305e-06, + "loss": 0.0238, + "step": 28520 + }, + { + "epoch": 1.674885523071504, + "grad_norm": 0.7083292007446289, + "learning_rate": 6.416385266606134e-06, + "loss": 0.0108, + "step": 28530 + }, + { + "epoch": 1.6754725842432783, + "grad_norm": 0.7685606479644775, + "learning_rate": 6.393740512416785e-06, + "loss": 0.0056, + "step": 28540 + }, + { + "epoch": 1.6760596454150523, + "grad_norm": 0.1088830903172493, + "learning_rate": 6.37113305820714e-06, + "loss": 0.0144, + "step": 28550 + }, + { + "epoch": 1.6766467065868262, + "grad_norm": 0.5206865072250366, + "learning_rate": 6.348562923315194e-06, + "loss": 0.0057, + "step": 28560 + }, + { + "epoch": 1.6772337677586004, + "grad_norm": 1.1275566816329956, + "learning_rate": 6.326030127047045e-06, + "loss": 0.0117, + "step": 28570 + }, + { + "epoch": 1.6778208289303747, + "grad_norm": 1.9255837202072144, + "learning_rate": 6.303534688676799e-06, + "loss": 0.0099, + "step": 28580 + }, + { + "epoch": 1.6784078901021486, + "grad_norm": 1.0019365549087524, + "learning_rate": 6.281076627446652e-06, + "loss": 0.0356, + "step": 28590 + }, + { + "epoch": 1.6789949512739226, + "grad_norm": 0.17412810027599335, + "learning_rate": 6.25865596256679e-06, + "loss": 0.0031, + "step": 28600 + }, + { + "epoch": 1.6795820124456968, + "grad_norm": 2.0009384155273438, + "learning_rate": 6.236272713215441e-06, + "loss": 0.0114, + "step": 28610 + }, + { + "epoch": 1.680169073617471, + "grad_norm": 3.1755568981170654, + "learning_rate": 6.213926898538825e-06, + "loss": 0.0117, + "step": 28620 + }, + { + "epoch": 1.680756134789245, + "grad_norm": 2.153243064880371, + "learning_rate": 6.1916185376511286e-06, + "loss": 0.0109, + "step": 28630 + }, + { + "epoch": 1.681343195961019, + "grad_norm": 0.05015252158045769, + "learning_rate": 6.1693476496344996e-06, + "loss": 0.0088, + 
"step": 28640 + }, + { + "epoch": 1.6819302571327932, + "grad_norm": 0.577706515789032, + "learning_rate": 6.14711425353906e-06, + "loss": 0.0025, + "step": 28650 + }, + { + "epoch": 1.6825173183045674, + "grad_norm": 0.40677276253700256, + "learning_rate": 6.124918368382815e-06, + "loss": 0.0114, + "step": 28660 + }, + { + "epoch": 1.6831043794763414, + "grad_norm": 1.0793112516403198, + "learning_rate": 6.1027600131517205e-06, + "loss": 0.0093, + "step": 28670 + }, + { + "epoch": 1.6836914406481154, + "grad_norm": 0.9636606574058533, + "learning_rate": 6.080639206799626e-06, + "loss": 0.0291, + "step": 28680 + }, + { + "epoch": 1.6842785018198896, + "grad_norm": 0.07298687845468521, + "learning_rate": 6.058555968248247e-06, + "loss": 0.0133, + "step": 28690 + }, + { + "epoch": 1.6848655629916638, + "grad_norm": 0.35231101512908936, + "learning_rate": 6.036510316387195e-06, + "loss": 0.0166, + "step": 28700 + }, + { + "epoch": 1.685452624163438, + "grad_norm": 0.8759594559669495, + "learning_rate": 6.014502270073874e-06, + "loss": 0.0076, + "step": 28710 + }, + { + "epoch": 1.686039685335212, + "grad_norm": 0.03797895833849907, + "learning_rate": 5.9925318481335925e-06, + "loss": 0.0108, + "step": 28720 + }, + { + "epoch": 1.686626746506986, + "grad_norm": 1.683166265487671, + "learning_rate": 5.970599069359395e-06, + "loss": 0.0182, + "step": 28730 + }, + { + "epoch": 1.6872138076787602, + "grad_norm": 0.07160099595785141, + "learning_rate": 5.948703952512214e-06, + "loss": 0.0087, + "step": 28740 + }, + { + "epoch": 1.6878008688505344, + "grad_norm": 0.00314601743593812, + "learning_rate": 5.9268465163207e-06, + "loss": 0.0104, + "step": 28750 + }, + { + "epoch": 1.6883879300223084, + "grad_norm": 0.02015577256679535, + "learning_rate": 5.9050267794813045e-06, + "loss": 0.0073, + "step": 28760 + }, + { + "epoch": 1.6889749911940823, + "grad_norm": 0.03103172965347767, + "learning_rate": 5.883244760658213e-06, + "loss": 0.0085, + "step": 28770 + }, + { + "epoch": 1.6895620523658565, + "grad_norm": 1.1322954893112183, + "learning_rate": 5.861500478483362e-06, + "loss": 0.0179, + "step": 28780 + }, + { + "epoch": 1.6901491135376308, + "grad_norm": 0.17238181829452515, + "learning_rate": 5.83979395155641e-06, + "loss": 0.0035, + "step": 28790 + }, + { + "epoch": 1.6907361747094047, + "grad_norm": 0.24105772376060486, + "learning_rate": 5.818125198444713e-06, + "loss": 0.0104, + "step": 28800 + }, + { + "epoch": 1.6913232358811787, + "grad_norm": 0.44330739974975586, + "learning_rate": 5.796494237683309e-06, + "loss": 0.0097, + "step": 28810 + }, + { + "epoch": 1.691910297052953, + "grad_norm": 0.19631382822990417, + "learning_rate": 5.774901087774937e-06, + "loss": 0.0133, + "step": 28820 + }, + { + "epoch": 1.6924973582247271, + "grad_norm": 0.09199246764183044, + "learning_rate": 5.753345767189949e-06, + "loss": 0.0025, + "step": 28830 + }, + { + "epoch": 1.6930844193965011, + "grad_norm": 0.9267358183860779, + "learning_rate": 5.73182829436637e-06, + "loss": 0.0114, + "step": 28840 + }, + { + "epoch": 1.693671480568275, + "grad_norm": 3.330157995223999, + "learning_rate": 5.710348687709855e-06, + "loss": 0.0036, + "step": 28850 + }, + { + "epoch": 1.6942585417400493, + "grad_norm": 1.3673113584518433, + "learning_rate": 5.688906965593649e-06, + "loss": 0.0138, + "step": 28860 + }, + { + "epoch": 1.6948456029118235, + "grad_norm": 0.03584679961204529, + "learning_rate": 5.667503146358616e-06, + "loss": 0.0112, + "step": 28870 + }, + { + "epoch": 1.6954326640835975, + "grad_norm": 
0.026026401668787003, + "learning_rate": 5.64613724831316e-06, + "loss": 0.0166, + "step": 28880 + }, + { + "epoch": 1.6960197252553715, + "grad_norm": 1.8361384868621826, + "learning_rate": 5.624809289733296e-06, + "loss": 0.0077, + "step": 28890 + }, + { + "epoch": 1.6966067864271457, + "grad_norm": 0.43284550309181213, + "learning_rate": 5.603519288862536e-06, + "loss": 0.0083, + "step": 28900 + }, + { + "epoch": 1.69719384759892, + "grad_norm": 0.04571918770670891, + "learning_rate": 5.582267263911961e-06, + "loss": 0.0066, + "step": 28910 + }, + { + "epoch": 1.6977809087706939, + "grad_norm": 0.05616581812500954, + "learning_rate": 5.561053233060154e-06, + "loss": 0.0105, + "step": 28920 + }, + { + "epoch": 1.6983679699424679, + "grad_norm": 0.4561821520328522, + "learning_rate": 5.539877214453215e-06, + "loss": 0.0103, + "step": 28930 + }, + { + "epoch": 1.698955031114242, + "grad_norm": 0.29431581497192383, + "learning_rate": 5.518739226204689e-06, + "loss": 0.0044, + "step": 28940 + }, + { + "epoch": 1.6995420922860163, + "grad_norm": 1.0841506719589233, + "learning_rate": 5.497639286395645e-06, + "loss": 0.0124, + "step": 28950 + }, + { + "epoch": 1.7001291534577903, + "grad_norm": 1.3967875242233276, + "learning_rate": 5.476577413074535e-06, + "loss": 0.0079, + "step": 28960 + }, + { + "epoch": 1.7007162146295642, + "grad_norm": 0.29343652725219727, + "learning_rate": 5.455553624257331e-06, + "loss": 0.022, + "step": 28970 + }, + { + "epoch": 1.7013032758013384, + "grad_norm": 0.02657872438430786, + "learning_rate": 5.434567937927387e-06, + "loss": 0.0043, + "step": 28980 + }, + { + "epoch": 1.7018903369731126, + "grad_norm": 1.2220739126205444, + "learning_rate": 5.413620372035449e-06, + "loss": 0.0114, + "step": 28990 + }, + { + "epoch": 1.7024773981448869, + "grad_norm": 0.11572136729955673, + "learning_rate": 5.39271094449969e-06, + "loss": 0.0058, + "step": 29000 + }, + { + "epoch": 1.7030644593166608, + "grad_norm": 0.5487588047981262, + "learning_rate": 5.371839673205625e-06, + "loss": 0.0049, + "step": 29010 + }, + { + "epoch": 1.7036515204884348, + "grad_norm": 0.038064517080783844, + "learning_rate": 5.351006576006162e-06, + "loss": 0.0153, + "step": 29020 + }, + { + "epoch": 1.704238581660209, + "grad_norm": 0.040291767567396164, + "learning_rate": 5.330211670721535e-06, + "loss": 0.0068, + "step": 29030 + }, + { + "epoch": 1.7048256428319832, + "grad_norm": 0.7021271586418152, + "learning_rate": 5.309454975139338e-06, + "loss": 0.0038, + "step": 29040 + }, + { + "epoch": 1.7054127040037572, + "grad_norm": 2.8790230751037598, + "learning_rate": 5.288736507014435e-06, + "loss": 0.0345, + "step": 29050 + }, + { + "epoch": 1.7059997651755312, + "grad_norm": 1.4290226697921753, + "learning_rate": 5.26805628406904e-06, + "loss": 0.0055, + "step": 29060 + }, + { + "epoch": 1.7065868263473054, + "grad_norm": 0.519660472869873, + "learning_rate": 5.247414323992605e-06, + "loss": 0.0048, + "step": 29070 + }, + { + "epoch": 1.7071738875190796, + "grad_norm": 1.1711570024490356, + "learning_rate": 5.2268106444418875e-06, + "loss": 0.011, + "step": 29080 + }, + { + "epoch": 1.7077609486908536, + "grad_norm": 0.897144079208374, + "learning_rate": 5.206245263040893e-06, + "loss": 0.0265, + "step": 29090 + }, + { + "epoch": 1.7083480098626276, + "grad_norm": 1.1236882209777832, + "learning_rate": 5.1857181973808735e-06, + "loss": 0.0096, + "step": 29100 + }, + { + "epoch": 1.7089350710344018, + "grad_norm": 0.45875832438468933, + "learning_rate": 5.165229465020277e-06, + "loss": 
0.008, + "step": 29110 + }, + { + "epoch": 1.709522132206176, + "grad_norm": 0.37724724411964417, + "learning_rate": 5.144779083484791e-06, + "loss": 0.0036, + "step": 29120 + }, + { + "epoch": 1.71010919337795, + "grad_norm": 1.1803650856018066, + "learning_rate": 5.1243670702673e-06, + "loss": 0.0059, + "step": 29130 + }, + { + "epoch": 1.710696254549724, + "grad_norm": 1.0297425985336304, + "learning_rate": 5.103993442827831e-06, + "loss": 0.0088, + "step": 29140 + }, + { + "epoch": 1.7112833157214982, + "grad_norm": 0.011964410543441772, + "learning_rate": 5.0836582185936456e-06, + "loss": 0.0027, + "step": 29150 + }, + { + "epoch": 1.7118703768932724, + "grad_norm": 1.340688943862915, + "learning_rate": 5.063361414959083e-06, + "loss": 0.0061, + "step": 29160 + }, + { + "epoch": 1.7124574380650464, + "grad_norm": 1.056504726409912, + "learning_rate": 5.043103049285663e-06, + "loss": 0.006, + "step": 29170 + }, + { + "epoch": 1.7130444992368203, + "grad_norm": 0.5750206708908081, + "learning_rate": 5.022883138902007e-06, + "loss": 0.0033, + "step": 29180 + }, + { + "epoch": 1.7136315604085945, + "grad_norm": 0.10166315734386444, + "learning_rate": 5.002701701103846e-06, + "loss": 0.006, + "step": 29190 + }, + { + "epoch": 1.7142186215803688, + "grad_norm": 2.292682647705078, + "learning_rate": 4.982558753154009e-06, + "loss": 0.0072, + "step": 29200 + }, + { + "epoch": 1.7148056827521427, + "grad_norm": 0.36749956011772156, + "learning_rate": 4.962454312282411e-06, + "loss": 0.006, + "step": 29210 + }, + { + "epoch": 1.7153927439239167, + "grad_norm": 0.2949482500553131, + "learning_rate": 4.942388395685993e-06, + "loss": 0.0075, + "step": 29220 + }, + { + "epoch": 1.715979805095691, + "grad_norm": 0.047629375010728836, + "learning_rate": 4.922361020528782e-06, + "loss": 0.0055, + "step": 29230 + }, + { + "epoch": 1.7165668662674651, + "grad_norm": 0.8646748661994934, + "learning_rate": 4.9023722039418015e-06, + "loss": 0.0045, + "step": 29240 + }, + { + "epoch": 1.7171539274392391, + "grad_norm": 0.06569618731737137, + "learning_rate": 4.882421963023126e-06, + "loss": 0.0045, + "step": 29250 + }, + { + "epoch": 1.7177409886110133, + "grad_norm": 0.7197391390800476, + "learning_rate": 4.86251031483782e-06, + "loss": 0.0031, + "step": 29260 + }, + { + "epoch": 1.7183280497827873, + "grad_norm": 1.2318994998931885, + "learning_rate": 4.842637276417927e-06, + "loss": 0.0089, + "step": 29270 + }, + { + "epoch": 1.7189151109545615, + "grad_norm": 0.0834750384092331, + "learning_rate": 4.822802864762488e-06, + "loss": 0.0053, + "step": 29280 + }, + { + "epoch": 1.7195021721263357, + "grad_norm": 0.3922010362148285, + "learning_rate": 4.80300709683747e-06, + "loss": 0.0045, + "step": 29290 + }, + { + "epoch": 1.7200892332981097, + "grad_norm": 1.6912459135055542, + "learning_rate": 4.7832499895758166e-06, + "loss": 0.0129, + "step": 29300 + }, + { + "epoch": 1.7206762944698837, + "grad_norm": 3.4761576652526855, + "learning_rate": 4.76353155987736e-06, + "loss": 0.0114, + "step": 29310 + }, + { + "epoch": 1.7212633556416579, + "grad_norm": 0.8199915885925293, + "learning_rate": 4.7438518246089245e-06, + "loss": 0.0038, + "step": 29320 + }, + { + "epoch": 1.721850416813432, + "grad_norm": 0.5764483213424683, + "learning_rate": 4.724210800604151e-06, + "loss": 0.0115, + "step": 29330 + }, + { + "epoch": 1.722437477985206, + "grad_norm": 0.8750811219215393, + "learning_rate": 4.704608504663627e-06, + "loss": 0.0288, + "step": 29340 + }, + { + "epoch": 1.72302453915698, + "grad_norm": 
0.31287306547164917, + "learning_rate": 4.685044953554768e-06, + "loss": 0.0036, + "step": 29350 + }, + { + "epoch": 1.7236116003287543, + "grad_norm": 1.7458490133285522, + "learning_rate": 4.6655201640118775e-06, + "loss": 0.0221, + "step": 29360 + }, + { + "epoch": 1.7241986615005285, + "grad_norm": 0.10908584296703339, + "learning_rate": 4.646034152736101e-06, + "loss": 0.0082, + "step": 29370 + }, + { + "epoch": 1.7247857226723025, + "grad_norm": 5.732516765594482, + "learning_rate": 4.626586936395411e-06, + "loss": 0.0164, + "step": 29380 + }, + { + "epoch": 1.7253727838440764, + "grad_norm": 0.14866457879543304, + "learning_rate": 4.607178531624595e-06, + "loss": 0.0053, + "step": 29390 + }, + { + "epoch": 1.7259598450158506, + "grad_norm": 0.7560992240905762, + "learning_rate": 4.5878089550252246e-06, + "loss": 0.0149, + "step": 29400 + }, + { + "epoch": 1.7265469061876249, + "grad_norm": 0.5187071561813354, + "learning_rate": 4.568478223165696e-06, + "loss": 0.0127, + "step": 29410 + }, + { + "epoch": 1.7271339673593988, + "grad_norm": 0.7606738805770874, + "learning_rate": 4.549186352581131e-06, + "loss": 0.005, + "step": 29420 + }, + { + "epoch": 1.7277210285311728, + "grad_norm": 0.011919623240828514, + "learning_rate": 4.529933359773447e-06, + "loss": 0.0138, + "step": 29430 + }, + { + "epoch": 1.728308089702947, + "grad_norm": 2.2456302642822266, + "learning_rate": 4.510719261211293e-06, + "loss": 0.0097, + "step": 29440 + }, + { + "epoch": 1.7288951508747212, + "grad_norm": 0.3139123320579529, + "learning_rate": 4.491544073330062e-06, + "loss": 0.0175, + "step": 29450 + }, + { + "epoch": 1.7294822120464952, + "grad_norm": 0.6753292083740234, + "learning_rate": 4.472407812531831e-06, + "loss": 0.0053, + "step": 29460 + }, + { + "epoch": 1.7300692732182692, + "grad_norm": 0.02748318947851658, + "learning_rate": 4.4533104951854255e-06, + "loss": 0.0076, + "step": 29470 + }, + { + "epoch": 1.7306563343900434, + "grad_norm": 0.0061375536024570465, + "learning_rate": 4.434252137626305e-06, + "loss": 0.0072, + "step": 29480 + }, + { + "epoch": 1.7312433955618176, + "grad_norm": 1.0382859706878662, + "learning_rate": 4.4152327561566455e-06, + "loss": 0.008, + "step": 29490 + }, + { + "epoch": 1.7318304567335916, + "grad_norm": 0.4101167321205139, + "learning_rate": 4.3962523670452725e-06, + "loss": 0.0043, + "step": 29500 + }, + { + "epoch": 1.7324175179053656, + "grad_norm": 1.11329984664917, + "learning_rate": 4.37731098652766e-06, + "loss": 0.0084, + "step": 29510 + }, + { + "epoch": 1.7330045790771398, + "grad_norm": 0.03493981435894966, + "learning_rate": 4.358408630805905e-06, + "loss": 0.0094, + "step": 29520 + }, + { + "epoch": 1.733591640248914, + "grad_norm": 0.4795690178871155, + "learning_rate": 4.339545316048721e-06, + "loss": 0.0064, + "step": 29530 + }, + { + "epoch": 1.7341787014206882, + "grad_norm": 0.4783584475517273, + "learning_rate": 4.320721058391453e-06, + "loss": 0.0092, + "step": 29540 + }, + { + "epoch": 1.7347657625924622, + "grad_norm": 0.235016867518425, + "learning_rate": 4.301935873936003e-06, + "loss": 0.0105, + "step": 29550 + }, + { + "epoch": 1.7353528237642362, + "grad_norm": 2.073943614959717, + "learning_rate": 4.28318977875089e-06, + "loss": 0.0114, + "step": 29560 + }, + { + "epoch": 1.7359398849360104, + "grad_norm": 0.09469784796237946, + "learning_rate": 4.264482788871149e-06, + "loss": 0.0085, + "step": 29570 + }, + { + "epoch": 1.7365269461077846, + "grad_norm": 0.6828670501708984, + "learning_rate": 4.245814920298402e-06, + "loss": 
0.0066, + "step": 29580 + }, + { + "epoch": 1.7371140072795586, + "grad_norm": 1.3681111335754395, + "learning_rate": 4.227186189000787e-06, + "loss": 0.0088, + "step": 29590 + }, + { + "epoch": 1.7377010684513325, + "grad_norm": 0.7485164999961853, + "learning_rate": 4.2085966109129796e-06, + "loss": 0.0149, + "step": 29600 + }, + { + "epoch": 1.7382881296231067, + "grad_norm": 0.06022016331553459, + "learning_rate": 4.190046201936154e-06, + "loss": 0.0017, + "step": 29610 + }, + { + "epoch": 1.738875190794881, + "grad_norm": 0.6397087574005127, + "learning_rate": 4.171534977937991e-06, + "loss": 0.008, + "step": 29620 + }, + { + "epoch": 1.739462251966655, + "grad_norm": 0.5424182415008545, + "learning_rate": 4.153062954752635e-06, + "loss": 0.0059, + "step": 29630 + }, + { + "epoch": 1.740049313138429, + "grad_norm": 0.2838616967201233, + "learning_rate": 4.134630148180724e-06, + "loss": 0.0057, + "step": 29640 + }, + { + "epoch": 1.7406363743102031, + "grad_norm": 0.9157633781433105, + "learning_rate": 4.1162365739893125e-06, + "loss": 0.0068, + "step": 29650 + }, + { + "epoch": 1.7412234354819773, + "grad_norm": 0.3759089708328247, + "learning_rate": 4.0978822479119325e-06, + "loss": 0.0123, + "step": 29660 + }, + { + "epoch": 1.7418104966537513, + "grad_norm": 0.3383537530899048, + "learning_rate": 4.0795671856485475e-06, + "loss": 0.0051, + "step": 29670 + }, + { + "epoch": 1.7423975578255253, + "grad_norm": 0.35818520188331604, + "learning_rate": 4.061291402865497e-06, + "loss": 0.0055, + "step": 29680 + }, + { + "epoch": 1.7429846189972995, + "grad_norm": 0.48298370838165283, + "learning_rate": 4.043054915195566e-06, + "loss": 0.006, + "step": 29690 + }, + { + "epoch": 1.7435716801690737, + "grad_norm": 0.020862950012087822, + "learning_rate": 4.024857738237875e-06, + "loss": 0.0111, + "step": 29700 + }, + { + "epoch": 1.7441587413408477, + "grad_norm": 0.2464524656534195, + "learning_rate": 4.006699887557974e-06, + "loss": 0.0035, + "step": 29710 + }, + { + "epoch": 1.7447458025126217, + "grad_norm": 0.36007267236709595, + "learning_rate": 3.988581378687739e-06, + "loss": 0.0125, + "step": 29720 + }, + { + "epoch": 1.7453328636843959, + "grad_norm": 0.5455897450447083, + "learning_rate": 3.970502227125417e-06, + "loss": 0.0073, + "step": 29730 + }, + { + "epoch": 1.74591992485617, + "grad_norm": 4.634277820587158, + "learning_rate": 3.952462448335553e-06, + "loss": 0.0136, + "step": 29740 + }, + { + "epoch": 1.746506986027944, + "grad_norm": 0.4345282018184662, + "learning_rate": 3.934462057749067e-06, + "loss": 0.0085, + "step": 29750 + }, + { + "epoch": 1.747094047199718, + "grad_norm": 0.7044300436973572, + "learning_rate": 3.916501070763124e-06, + "loss": 0.0047, + "step": 29760 + }, + { + "epoch": 1.7476811083714923, + "grad_norm": 1.710367202758789, + "learning_rate": 3.898579502741234e-06, + "loss": 0.008, + "step": 29770 + }, + { + "epoch": 1.7482681695432665, + "grad_norm": 1.0720694065093994, + "learning_rate": 3.88069736901317e-06, + "loss": 0.0053, + "step": 29780 + }, + { + "epoch": 1.7488552307150405, + "grad_norm": 0.8815639615058899, + "learning_rate": 3.8628546848749895e-06, + "loss": 0.0106, + "step": 29790 + }, + { + "epoch": 1.7494422918868147, + "grad_norm": 0.6148375868797302, + "learning_rate": 3.845051465588962e-06, + "loss": 0.0148, + "step": 29800 + }, + { + "epoch": 1.7500293530585886, + "grad_norm": 0.7402764558792114, + "learning_rate": 3.827287726383644e-06, + "loss": 0.0113, + "step": 29810 + }, + { + "epoch": 1.7506164142303628, + "grad_norm": 
0.008104000240564346, + "learning_rate": 3.809563482453815e-06, + "loss": 0.0087, + "step": 29820 + }, + { + "epoch": 1.751203475402137, + "grad_norm": 0.34709659218788147, + "learning_rate": 3.7918787489604477e-06, + "loss": 0.0105, + "step": 29830 + }, + { + "epoch": 1.751790536573911, + "grad_norm": 1.615087628364563, + "learning_rate": 3.7742335410307306e-06, + "loss": 0.0073, + "step": 29840 + }, + { + "epoch": 1.752377597745685, + "grad_norm": 0.2822687327861786, + "learning_rate": 3.7566278737580563e-06, + "loss": 0.0041, + "step": 29850 + }, + { + "epoch": 1.7529646589174592, + "grad_norm": 0.0552225224673748, + "learning_rate": 3.7390617622019897e-06, + "loss": 0.0033, + "step": 29860 + }, + { + "epoch": 1.7535517200892334, + "grad_norm": 0.9881157875061035, + "learning_rate": 3.7215352213882338e-06, + "loss": 0.0101, + "step": 29870 + }, + { + "epoch": 1.7541387812610074, + "grad_norm": 0.030385082587599754, + "learning_rate": 3.704048266308685e-06, + "loss": 0.0121, + "step": 29880 + }, + { + "epoch": 1.7547258424327814, + "grad_norm": 0.033014968037605286, + "learning_rate": 3.6866009119213283e-06, + "loss": 0.0086, + "step": 29890 + }, + { + "epoch": 1.7553129036045556, + "grad_norm": 0.3687360882759094, + "learning_rate": 3.6691931731503425e-06, + "loss": 0.0102, + "step": 29900 + }, + { + "epoch": 1.7558999647763298, + "grad_norm": 0.4449838101863861, + "learning_rate": 3.651825064885955e-06, + "loss": 0.0165, + "step": 29910 + }, + { + "epoch": 1.7564870259481038, + "grad_norm": 0.35303670167922974, + "learning_rate": 3.6344966019845385e-06, + "loss": 0.0044, + "step": 29920 + }, + { + "epoch": 1.7570740871198778, + "grad_norm": 0.14896978437900543, + "learning_rate": 3.6172077992685182e-06, + "loss": 0.0031, + "step": 29930 + }, + { + "epoch": 1.757661148291652, + "grad_norm": 0.06323855370283127, + "learning_rate": 3.5999586715264267e-06, + "loss": 0.0104, + "step": 29940 + }, + { + "epoch": 1.7582482094634262, + "grad_norm": 1.1001636981964111, + "learning_rate": 3.5827492335128333e-06, + "loss": 0.0045, + "step": 29950 + }, + { + "epoch": 1.7588352706352002, + "grad_norm": 0.923254132270813, + "learning_rate": 3.5655794999483847e-06, + "loss": 0.01, + "step": 29960 + }, + { + "epoch": 1.7594223318069742, + "grad_norm": 1.3027571439743042, + "learning_rate": 3.5484494855197505e-06, + "loss": 0.0047, + "step": 29970 + }, + { + "epoch": 1.7600093929787484, + "grad_norm": 0.7019350528717041, + "learning_rate": 3.5313592048796086e-06, + "loss": 0.0105, + "step": 29980 + }, + { + "epoch": 1.7605964541505226, + "grad_norm": 0.11337191611528397, + "learning_rate": 3.514308672646682e-06, + "loss": 0.0063, + "step": 29990 + }, + { + "epoch": 1.7611835153222966, + "grad_norm": 0.44551828503608704, + "learning_rate": 3.497297903405666e-06, + "loss": 0.0087, + "step": 30000 + }, + { + "epoch": 1.7611835153222966, + "eval_loss": 0.5214746594429016, + "eval_runtime": 269.6304, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 3.505, + "step": 30000 + }, + { + "epoch": 1.7617705764940705, + "grad_norm": 0.008360777050256729, + "learning_rate": 3.4803269117072546e-06, + "loss": 0.0062, + "step": 30010 + }, + { + "epoch": 1.7623576376658447, + "grad_norm": 0.7833142876625061, + "learning_rate": 3.4633957120681293e-06, + "loss": 0.011, + "step": 30020 + }, + { + "epoch": 1.762944698837619, + "grad_norm": 0.09515856206417084, + "learning_rate": 3.4465043189709168e-06, + "loss": 0.0099, + "step": 30030 + }, + { + "epoch": 1.763531760009393, + "grad_norm": 0.44629839062690735, + 
"learning_rate": 3.429652746864187e-06, + "loss": 0.0089, + "step": 30040 + }, + { + "epoch": 1.764118821181167, + "grad_norm": 0.09620211273431778, + "learning_rate": 3.4128410101624817e-06, + "loss": 0.0093, + "step": 30050 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 4.587845325469971, + "learning_rate": 3.396069123246226e-06, + "loss": 0.0083, + "step": 30060 + }, + { + "epoch": 1.7652929435247153, + "grad_norm": 1.1827245950698853, + "learning_rate": 3.379337100461788e-06, + "loss": 0.0076, + "step": 30070 + }, + { + "epoch": 1.7658800046964895, + "grad_norm": 0.8300924897193909, + "learning_rate": 3.3626449561214245e-06, + "loss": 0.0062, + "step": 30080 + }, + { + "epoch": 1.7664670658682635, + "grad_norm": 0.024658210575580597, + "learning_rate": 3.3459927045032867e-06, + "loss": 0.0094, + "step": 30090 + }, + { + "epoch": 1.7670541270400375, + "grad_norm": 0.7816444635391235, + "learning_rate": 3.3293803598514086e-06, + "loss": 0.0219, + "step": 30100 + }, + { + "epoch": 1.7676411882118117, + "grad_norm": 3.3857486248016357, + "learning_rate": 3.312807936375656e-06, + "loss": 0.0043, + "step": 30110 + }, + { + "epoch": 1.768228249383586, + "grad_norm": 0.03308376297354698, + "learning_rate": 3.29627544825179e-06, + "loss": 0.0066, + "step": 30120 + }, + { + "epoch": 1.76881531055536, + "grad_norm": 0.03391629457473755, + "learning_rate": 3.2797829096213818e-06, + "loss": 0.007, + "step": 30130 + }, + { + "epoch": 1.7694023717271339, + "grad_norm": 2.114734411239624, + "learning_rate": 3.263330334591852e-06, + "loss": 0.0058, + "step": 30140 + }, + { + "epoch": 1.769989432898908, + "grad_norm": 0.18660058081150055, + "learning_rate": 3.246917737236416e-06, + "loss": 0.0022, + "step": 30150 + }, + { + "epoch": 1.7705764940706823, + "grad_norm": 0.5085205435752869, + "learning_rate": 3.2305451315941095e-06, + "loss": 0.0037, + "step": 30160 + }, + { + "epoch": 1.7711635552424563, + "grad_norm": 0.520336389541626, + "learning_rate": 3.2142125316697467e-06, + "loss": 0.008, + "step": 30170 + }, + { + "epoch": 1.7717506164142303, + "grad_norm": 0.6636307239532471, + "learning_rate": 3.1979199514339307e-06, + "loss": 0.0051, + "step": 30180 + }, + { + "epoch": 1.7723376775860045, + "grad_norm": 0.32753074169158936, + "learning_rate": 3.18166740482303e-06, + "loss": 0.021, + "step": 30190 + }, + { + "epoch": 1.7729247387577787, + "grad_norm": 0.15476436913013458, + "learning_rate": 3.1654549057391737e-06, + "loss": 0.0064, + "step": 30200 + }, + { + "epoch": 1.7735117999295527, + "grad_norm": 0.4490971267223358, + "learning_rate": 3.1492824680502244e-06, + "loss": 0.0178, + "step": 30210 + }, + { + "epoch": 1.7740988611013266, + "grad_norm": 0.7970400452613831, + "learning_rate": 3.1331501055897883e-06, + "loss": 0.0055, + "step": 30220 + }, + { + "epoch": 1.7746859222731008, + "grad_norm": 0.03366592898964882, + "learning_rate": 3.1170578321571887e-06, + "loss": 0.0084, + "step": 30230 + }, + { + "epoch": 1.775272983444875, + "grad_norm": 0.8563421368598938, + "learning_rate": 3.1010056615174365e-06, + "loss": 0.0086, + "step": 30240 + }, + { + "epoch": 1.775860044616649, + "grad_norm": 0.33078980445861816, + "learning_rate": 3.084993607401293e-06, + "loss": 0.0028, + "step": 30250 + }, + { + "epoch": 1.776447105788423, + "grad_norm": 1.528594732284546, + "learning_rate": 3.069021683505141e-06, + "loss": 0.0122, + "step": 30260 + }, + { + "epoch": 1.7770341669601972, + "grad_norm": 0.8881283402442932, + "learning_rate": 3.05308990349108e-06, + "loss": 0.015, + "step": 30270 + 
}, + { + "epoch": 1.7776212281319714, + "grad_norm": 0.05420268699526787, + "learning_rate": 3.0371982809868527e-06, + "loss": 0.0049, + "step": 30280 + }, + { + "epoch": 1.7782082893037454, + "grad_norm": 0.8013023138046265, + "learning_rate": 3.021346829585847e-06, + "loss": 0.0043, + "step": 30290 + }, + { + "epoch": 1.7787953504755194, + "grad_norm": 1.3829023838043213, + "learning_rate": 3.005535562847117e-06, + "loss": 0.0059, + "step": 30300 + }, + { + "epoch": 1.7793824116472936, + "grad_norm": 0.25038641691207886, + "learning_rate": 2.9897644942953162e-06, + "loss": 0.0142, + "step": 30310 + }, + { + "epoch": 1.7799694728190678, + "grad_norm": 1.0973732471466064, + "learning_rate": 2.9740336374207147e-06, + "loss": 0.0059, + "step": 30320 + }, + { + "epoch": 1.7805565339908418, + "grad_norm": 3.689282178878784, + "learning_rate": 2.9583430056792096e-06, + "loss": 0.0098, + "step": 30330 + }, + { + "epoch": 1.7811435951626158, + "grad_norm": 1.0719764232635498, + "learning_rate": 2.9426926124922592e-06, + "loss": 0.012, + "step": 30340 + }, + { + "epoch": 1.78173065633439, + "grad_norm": 0.21206724643707275, + "learning_rate": 2.927082471246917e-06, + "loss": 0.0115, + "step": 30350 + }, + { + "epoch": 1.7823177175061642, + "grad_norm": 1.1170448064804077, + "learning_rate": 2.911512595295818e-06, + "loss": 0.0078, + "step": 30360 + }, + { + "epoch": 1.7829047786779384, + "grad_norm": 0.8349093198776245, + "learning_rate": 2.8959829979571306e-06, + "loss": 0.0055, + "step": 30370 + }, + { + "epoch": 1.7834918398497124, + "grad_norm": 2.214409351348877, + "learning_rate": 2.880493692514602e-06, + "loss": 0.0079, + "step": 30380 + }, + { + "epoch": 1.7840789010214864, + "grad_norm": 0.5047364234924316, + "learning_rate": 2.8650446922174723e-06, + "loss": 0.0042, + "step": 30390 + }, + { + "epoch": 1.7846659621932606, + "grad_norm": 1.0878037214279175, + "learning_rate": 2.849636010280543e-06, + "loss": 0.0113, + "step": 30400 + }, + { + "epoch": 1.7852530233650348, + "grad_norm": 0.621374785900116, + "learning_rate": 2.8342676598841044e-06, + "loss": 0.0053, + "step": 30410 + }, + { + "epoch": 1.7858400845368088, + "grad_norm": 1.328795313835144, + "learning_rate": 2.818939654173952e-06, + "loss": 0.016, + "step": 30420 + }, + { + "epoch": 1.7864271457085827, + "grad_norm": 0.5891567468643188, + "learning_rate": 2.803652006261387e-06, + "loss": 0.0082, + "step": 30430 + }, + { + "epoch": 1.787014206880357, + "grad_norm": 1.2182434797286987, + "learning_rate": 2.7884047292231817e-06, + "loss": 0.0109, + "step": 30440 + }, + { + "epoch": 1.7876012680521312, + "grad_norm": 1.2911088466644287, + "learning_rate": 2.7731978361015543e-06, + "loss": 0.0237, + "step": 30450 + }, + { + "epoch": 1.7881883292239051, + "grad_norm": 0.37283286452293396, + "learning_rate": 2.75803133990421e-06, + "loss": 0.0095, + "step": 30460 + }, + { + "epoch": 1.7887753903956791, + "grad_norm": 0.6658843159675598, + "learning_rate": 2.742905253604272e-06, + "loss": 0.0017, + "step": 30470 + }, + { + "epoch": 1.7893624515674533, + "grad_norm": 0.2888129949569702, + "learning_rate": 2.727819590140335e-06, + "loss": 0.0072, + "step": 30480 + }, + { + "epoch": 1.7899495127392275, + "grad_norm": 0.9133905172348022, + "learning_rate": 2.712774362416376e-06, + "loss": 0.0117, + "step": 30490 + }, + { + "epoch": 1.7905365739110015, + "grad_norm": 0.02645767293870449, + "learning_rate": 2.6977695833018014e-06, + "loss": 0.0066, + "step": 30500 + }, + { + "epoch": 1.7911236350827755, + "grad_norm": 0.7585176229476929, + 
"learning_rate": 2.6828052656314384e-06, + "loss": 0.004, + "step": 30510 + }, + { + "epoch": 1.7917106962545497, + "grad_norm": 0.5185384750366211, + "learning_rate": 2.6678814222054593e-06, + "loss": 0.0022, + "step": 30520 + }, + { + "epoch": 1.792297757426324, + "grad_norm": 1.7649006843566895, + "learning_rate": 2.652998065789453e-06, + "loss": 0.003, + "step": 30530 + }, + { + "epoch": 1.792884818598098, + "grad_norm": 0.07767531275749207, + "learning_rate": 2.638155209114368e-06, + "loss": 0.0062, + "step": 30540 + }, + { + "epoch": 1.7934718797698719, + "grad_norm": 1.7642700672149658, + "learning_rate": 2.623352864876505e-06, + "loss": 0.0042, + "step": 30550 + }, + { + "epoch": 1.794058940941646, + "grad_norm": 2.1818125247955322, + "learning_rate": 2.6085910457375073e-06, + "loss": 0.0127, + "step": 30560 + }, + { + "epoch": 1.7946460021134203, + "grad_norm": 0.2745802402496338, + "learning_rate": 2.5938697643243635e-06, + "loss": 0.0067, + "step": 30570 + }, + { + "epoch": 1.7952330632851943, + "grad_norm": 0.0070999846793711185, + "learning_rate": 2.5791890332293788e-06, + "loss": 0.0073, + "step": 30580 + }, + { + "epoch": 1.7958201244569683, + "grad_norm": 0.15600904822349548, + "learning_rate": 2.56454886501018e-06, + "loss": 0.0079, + "step": 30590 + }, + { + "epoch": 1.7964071856287425, + "grad_norm": 0.30416813492774963, + "learning_rate": 2.5499492721896887e-06, + "loss": 0.0103, + "step": 30600 + }, + { + "epoch": 1.7969942468005167, + "grad_norm": 0.14762407541275024, + "learning_rate": 2.535390267256138e-06, + "loss": 0.0072, + "step": 30610 + }, + { + "epoch": 1.7975813079722909, + "grad_norm": 0.7133505940437317, + "learning_rate": 2.5208718626630045e-06, + "loss": 0.0128, + "step": 30620 + }, + { + "epoch": 1.7981683691440649, + "grad_norm": 0.5686302185058594, + "learning_rate": 2.5063940708290823e-06, + "loss": 0.0163, + "step": 30630 + }, + { + "epoch": 1.7987554303158388, + "grad_norm": 1.847533941268921, + "learning_rate": 2.491956904138393e-06, + "loss": 0.0067, + "step": 30640 + }, + { + "epoch": 1.799342491487613, + "grad_norm": 1.142364740371704, + "learning_rate": 2.4775603749402187e-06, + "loss": 0.0143, + "step": 30650 + }, + { + "epoch": 1.7999295526593873, + "grad_norm": 1.1415369510650635, + "learning_rate": 2.4632044955490983e-06, + "loss": 0.0041, + "step": 30660 + }, + { + "epoch": 1.8005166138311612, + "grad_norm": 1.596919298171997, + "learning_rate": 2.4488892782447593e-06, + "loss": 0.0059, + "step": 30670 + }, + { + "epoch": 1.8011036750029352, + "grad_norm": 1.4813252687454224, + "learning_rate": 2.4346147352721836e-06, + "loss": 0.0076, + "step": 30680 + }, + { + "epoch": 1.8016907361747094, + "grad_norm": 0.3757873475551605, + "learning_rate": 2.4203808788415438e-06, + "loss": 0.0068, + "step": 30690 + }, + { + "epoch": 1.8022777973464836, + "grad_norm": 1.8511054515838623, + "learning_rate": 2.406187721128217e-06, + "loss": 0.0167, + "step": 30700 + }, + { + "epoch": 1.8028648585182576, + "grad_norm": 0.3293643295764923, + "learning_rate": 2.3920352742727636e-06, + "loss": 0.0035, + "step": 30710 + }, + { + "epoch": 1.8034519196900316, + "grad_norm": 3.0010180473327637, + "learning_rate": 2.377923550380934e-06, + "loss": 0.013, + "step": 30720 + }, + { + "epoch": 1.8040389808618058, + "grad_norm": 0.07963775843381882, + "learning_rate": 2.3638525615236164e-06, + "loss": 0.0118, + "step": 30730 + }, + { + "epoch": 1.80462604203358, + "grad_norm": 0.4447624683380127, + "learning_rate": 2.3498223197368828e-06, + "loss": 0.0094, + "step": 
30740 + }, + { + "epoch": 1.805213103205354, + "grad_norm": 2.473619222640991, + "learning_rate": 2.3358328370219286e-06, + "loss": 0.0083, + "step": 30750 + }, + { + "epoch": 1.805800164377128, + "grad_norm": 0.6622852683067322, + "learning_rate": 2.3218841253451084e-06, + "loss": 0.0042, + "step": 30760 + }, + { + "epoch": 1.8063872255489022, + "grad_norm": 1.3558241128921509, + "learning_rate": 2.3079761966378787e-06, + "loss": 0.0063, + "step": 30770 + }, + { + "epoch": 1.8069742867206764, + "grad_norm": 0.33775874972343445, + "learning_rate": 2.2941090627968287e-06, + "loss": 0.002, + "step": 30780 + }, + { + "epoch": 1.8075613478924504, + "grad_norm": 0.08536079525947571, + "learning_rate": 2.280282735683653e-06, + "loss": 0.0056, + "step": 30790 + }, + { + "epoch": 1.8081484090642244, + "grad_norm": 0.5035672187805176, + "learning_rate": 2.266497227125114e-06, + "loss": 0.01, + "step": 30800 + }, + { + "epoch": 1.8087354702359986, + "grad_norm": 0.8908909559249878, + "learning_rate": 2.2527525489131008e-06, + "loss": 0.0125, + "step": 30810 + }, + { + "epoch": 1.8093225314077728, + "grad_norm": 0.0050659808330237865, + "learning_rate": 2.2390487128045256e-06, + "loss": 0.0136, + "step": 30820 + }, + { + "epoch": 1.8099095925795468, + "grad_norm": 1.6174921989440918, + "learning_rate": 2.2253857305214233e-06, + "loss": 0.0143, + "step": 30830 + }, + { + "epoch": 1.8104966537513207, + "grad_norm": 1.1043347120285034, + "learning_rate": 2.211763613750839e-06, + "loss": 0.0058, + "step": 30840 + }, + { + "epoch": 1.811083714923095, + "grad_norm": 0.1730615496635437, + "learning_rate": 2.1981823741448805e-06, + "loss": 0.0077, + "step": 30850 + }, + { + "epoch": 1.8116707760948692, + "grad_norm": 0.040801744908094406, + "learning_rate": 2.1846420233206823e-06, + "loss": 0.0056, + "step": 30860 + }, + { + "epoch": 1.8122578372666431, + "grad_norm": 0.040723178535699844, + "learning_rate": 2.1711425728604073e-06, + "loss": 0.0087, + "step": 30870 + }, + { + "epoch": 1.8128448984384171, + "grad_norm": 0.3218896985054016, + "learning_rate": 2.1576840343112414e-06, + "loss": 0.006, + "step": 30880 + }, + { + "epoch": 1.8134319596101913, + "grad_norm": 1.7537883520126343, + "learning_rate": 2.1442664191853645e-06, + "loss": 0.0051, + "step": 30890 + }, + { + "epoch": 1.8140190207819655, + "grad_norm": 1.382644772529602, + "learning_rate": 2.130889738959946e-06, + "loss": 0.0103, + "step": 30900 + }, + { + "epoch": 1.8146060819537397, + "grad_norm": 0.9474086165428162, + "learning_rate": 2.1175540050771492e-06, + "loss": 0.0128, + "step": 30910 + }, + { + "epoch": 1.8151931431255137, + "grad_norm": 0.175063356757164, + "learning_rate": 2.1042592289441277e-06, + "loss": 0.0038, + "step": 30920 + }, + { + "epoch": 1.8157802042972877, + "grad_norm": 1.8063702583312988, + "learning_rate": 2.0910054219329624e-06, + "loss": 0.0079, + "step": 30930 + }, + { + "epoch": 1.816367265469062, + "grad_norm": 0.24421249330043793, + "learning_rate": 2.0777925953807288e-06, + "loss": 0.0115, + "step": 30940 + }, + { + "epoch": 1.8169543266408361, + "grad_norm": 0.6284900307655334, + "learning_rate": 2.0646207605894198e-06, + "loss": 0.0171, + "step": 30950 + }, + { + "epoch": 1.81754138781261, + "grad_norm": 0.24449436366558075, + "learning_rate": 2.051489928825995e-06, + "loss": 0.0069, + "step": 30960 + }, + { + "epoch": 1.818128448984384, + "grad_norm": 0.020610427483916283, + "learning_rate": 2.0384001113222972e-06, + "loss": 0.0046, + "step": 30970 + }, + { + "epoch": 1.8187155101561583, + "grad_norm": 
0.03185024484992027, + "learning_rate": 2.0253513192751373e-06, + "loss": 0.0107, + "step": 30980 + }, + { + "epoch": 1.8193025713279325, + "grad_norm": 0.3091789186000824, + "learning_rate": 2.0123435638461863e-06, + "loss": 0.0079, + "step": 30990 + }, + { + "epoch": 1.8198896324997065, + "grad_norm": 0.3415491282939911, + "learning_rate": 1.999376856162044e-06, + "loss": 0.0087, + "step": 31000 + }, + { + "epoch": 1.8204766936714805, + "grad_norm": 0.35340416431427, + "learning_rate": 1.986451207314194e-06, + "loss": 0.0058, + "step": 31010 + }, + { + "epoch": 1.8210637548432547, + "grad_norm": 0.31529369950294495, + "learning_rate": 1.9735666283589972e-06, + "loss": 0.009, + "step": 31020 + }, + { + "epoch": 1.8216508160150289, + "grad_norm": 0.44021913409233093, + "learning_rate": 1.9607231303176653e-06, + "loss": 0.0021, + "step": 31030 + }, + { + "epoch": 1.8222378771868029, + "grad_norm": 0.567622721195221, + "learning_rate": 1.9479207241763055e-06, + "loss": 0.0079, + "step": 31040 + }, + { + "epoch": 1.8228249383585768, + "grad_norm": 0.35683673620224, + "learning_rate": 1.9351594208858405e-06, + "loss": 0.0025, + "step": 31050 + }, + { + "epoch": 1.823411999530351, + "grad_norm": 0.4013507664203644, + "learning_rate": 1.9224392313620665e-06, + "loss": 0.0101, + "step": 31060 + }, + { + "epoch": 1.8239990607021253, + "grad_norm": 0.07524754852056503, + "learning_rate": 1.909760166485586e-06, + "loss": 0.0078, + "step": 31070 + }, + { + "epoch": 1.8245861218738992, + "grad_norm": 0.257982075214386, + "learning_rate": 1.8971222371018393e-06, + "loss": 0.0039, + "step": 31080 + }, + { + "epoch": 1.8251731830456732, + "grad_norm": 0.06671323627233505, + "learning_rate": 1.8845254540210743e-06, + "loss": 0.0076, + "step": 31090 + }, + { + "epoch": 1.8257602442174474, + "grad_norm": 0.3007630407810211, + "learning_rate": 1.8719698280183328e-06, + "loss": 0.0098, + "step": 31100 + }, + { + "epoch": 1.8263473053892216, + "grad_norm": 0.9302312135696411, + "learning_rate": 1.8594553698334793e-06, + "loss": 0.0077, + "step": 31110 + }, + { + "epoch": 1.8269343665609956, + "grad_norm": 1.2461236715316772, + "learning_rate": 1.8469820901711344e-06, + "loss": 0.0079, + "step": 31120 + }, + { + "epoch": 1.8275214277327696, + "grad_norm": 0.025279074907302856, + "learning_rate": 1.8345499997007243e-06, + "loss": 0.0033, + "step": 31130 + }, + { + "epoch": 1.8281084889045438, + "grad_norm": 0.33775296807289124, + "learning_rate": 1.8221591090564038e-06, + "loss": 0.0105, + "step": 31140 + }, + { + "epoch": 1.828695550076318, + "grad_norm": 0.12173739075660706, + "learning_rate": 1.8098094288371336e-06, + "loss": 0.0077, + "step": 31150 + }, + { + "epoch": 1.8292826112480922, + "grad_norm": 0.01120838150382042, + "learning_rate": 1.7975009696065859e-06, + "loss": 0.0254, + "step": 31160 + }, + { + "epoch": 1.8298696724198662, + "grad_norm": 0.2868688106536865, + "learning_rate": 1.785233741893183e-06, + "loss": 0.003, + "step": 31170 + }, + { + "epoch": 1.8304567335916402, + "grad_norm": 0.06971792131662369, + "learning_rate": 1.7730077561900926e-06, + "loss": 0.027, + "step": 31180 + }, + { + "epoch": 1.8310437947634144, + "grad_norm": 0.09698063135147095, + "learning_rate": 1.760823022955188e-06, + "loss": 0.01, + "step": 31190 + }, + { + "epoch": 1.8316308559351886, + "grad_norm": 0.0902014970779419, + "learning_rate": 1.748679552611071e-06, + "loss": 0.0241, + "step": 31200 + }, + { + "epoch": 1.8322179171069626, + "grad_norm": 1.3793613910675049, + "learning_rate": 1.736577355545027e-06, + 
"loss": 0.0075, + "step": 31210 + }, + { + "epoch": 1.8328049782787366, + "grad_norm": 1.8001060485839844, + "learning_rate": 1.7245164421090533e-06, + "loss": 0.0266, + "step": 31220 + }, + { + "epoch": 1.8333920394505108, + "grad_norm": 0.11762313544750214, + "learning_rate": 1.7124968226198357e-06, + "loss": 0.0093, + "step": 31230 + }, + { + "epoch": 1.833979100622285, + "grad_norm": 0.47632017731666565, + "learning_rate": 1.7005185073587337e-06, + "loss": 0.0097, + "step": 31240 + }, + { + "epoch": 1.834566161794059, + "grad_norm": 0.12678590416908264, + "learning_rate": 1.6885815065717625e-06, + "loss": 0.0081, + "step": 31250 + }, + { + "epoch": 1.835153222965833, + "grad_norm": 0.2040066421031952, + "learning_rate": 1.676685830469621e-06, + "loss": 0.0047, + "step": 31260 + }, + { + "epoch": 1.8357402841376071, + "grad_norm": 1.1790368556976318, + "learning_rate": 1.6648314892276362e-06, + "loss": 0.0102, + "step": 31270 + }, + { + "epoch": 1.8363273453093814, + "grad_norm": 0.33532023429870605, + "learning_rate": 1.6530184929857973e-06, + "loss": 0.0092, + "step": 31280 + }, + { + "epoch": 1.8369144064811553, + "grad_norm": 1.6581460237503052, + "learning_rate": 1.6412468518487212e-06, + "loss": 0.0061, + "step": 31290 + }, + { + "epoch": 1.8375014676529293, + "grad_norm": 0.6531566381454468, + "learning_rate": 1.629516575885659e-06, + "loss": 0.0061, + "step": 31300 + }, + { + "epoch": 1.8380885288247035, + "grad_norm": 0.014209272339940071, + "learning_rate": 1.617827675130451e-06, + "loss": 0.0124, + "step": 31310 + }, + { + "epoch": 1.8386755899964777, + "grad_norm": 0.8777631521224976, + "learning_rate": 1.6061801595815774e-06, + "loss": 0.0073, + "step": 31320 + }, + { + "epoch": 1.8392626511682517, + "grad_norm": 0.019044015556573868, + "learning_rate": 1.5945740392021013e-06, + "loss": 0.0037, + "step": 31330 + }, + { + "epoch": 1.8398497123400257, + "grad_norm": 0.193769633769989, + "learning_rate": 1.5830093239196764e-06, + "loss": 0.0148, + "step": 31340 + }, + { + "epoch": 1.8404367735118, + "grad_norm": 1.4542549848556519, + "learning_rate": 1.5714860236265506e-06, + "loss": 0.0049, + "step": 31350 + }, + { + "epoch": 1.8410238346835741, + "grad_norm": 0.3220404088497162, + "learning_rate": 1.5600041481795336e-06, + "loss": 0.0086, + "step": 31360 + }, + { + "epoch": 1.841610895855348, + "grad_norm": 1.2098355293273926, + "learning_rate": 1.5485637074000247e-06, + "loss": 0.0236, + "step": 31370 + }, + { + "epoch": 1.842197957027122, + "grad_norm": 1.4953199625015259, + "learning_rate": 1.5371647110739408e-06, + "loss": 0.0068, + "step": 31380 + }, + { + "epoch": 1.8427850181988963, + "grad_norm": 2.258049249649048, + "learning_rate": 1.5258071689517872e-06, + "loss": 0.0106, + "step": 31390 + }, + { + "epoch": 1.8433720793706705, + "grad_norm": 2.7513246536254883, + "learning_rate": 1.514491090748571e-06, + "loss": 0.0147, + "step": 31400 + }, + { + "epoch": 1.8439591405424445, + "grad_norm": 0.9112816452980042, + "learning_rate": 1.5032164861438825e-06, + "loss": 0.0022, + "step": 31410 + }, + { + "epoch": 1.8445462017142185, + "grad_norm": 0.37844619154930115, + "learning_rate": 1.4919833647817905e-06, + "loss": 0.0032, + "step": 31420 + }, + { + "epoch": 1.8451332628859927, + "grad_norm": 1.8407471179962158, + "learning_rate": 1.4807917362709033e-06, + "loss": 0.0159, + "step": 31430 + }, + { + "epoch": 1.8457203240577669, + "grad_norm": 1.7563832998275757, + "learning_rate": 1.4696416101843246e-06, + "loss": 0.0042, + "step": 31440 + }, + { + "epoch": 
1.846307385229541, + "grad_norm": 1.1273623704910278, + "learning_rate": 1.4585329960596639e-06, + "loss": 0.0227, + "step": 31450 + }, + { + "epoch": 1.846894446401315, + "grad_norm": 0.02268887870013714, + "learning_rate": 1.4474659033990313e-06, + "loss": 0.0028, + "step": 31460 + }, + { + "epoch": 1.847481507573089, + "grad_norm": 0.5669854879379272, + "learning_rate": 1.4364403416690042e-06, + "loss": 0.0084, + "step": 31470 + }, + { + "epoch": 1.8480685687448632, + "grad_norm": 0.17324286699295044, + "learning_rate": 1.42545632030065e-06, + "loss": 0.0087, + "step": 31480 + }, + { + "epoch": 1.8486556299166375, + "grad_norm": 0.06148910894989967, + "learning_rate": 1.4145138486894804e-06, + "loss": 0.0085, + "step": 31490 + }, + { + "epoch": 1.8492426910884114, + "grad_norm": 2.422051191329956, + "learning_rate": 1.4036129361954974e-06, + "loss": 0.0189, + "step": 31500 + }, + { + "epoch": 1.8498297522601854, + "grad_norm": 0.03764136880636215, + "learning_rate": 1.3927535921431255e-06, + "loss": 0.0038, + "step": 31510 + }, + { + "epoch": 1.8504168134319596, + "grad_norm": 0.2697269320487976, + "learning_rate": 1.381935825821251e-06, + "loss": 0.0096, + "step": 31520 + }, + { + "epoch": 1.8510038746037338, + "grad_norm": 0.9931702613830566, + "learning_rate": 1.371159646483189e-06, + "loss": 0.0193, + "step": 31530 + }, + { + "epoch": 1.8515909357755078, + "grad_norm": 0.14079973101615906, + "learning_rate": 1.360425063346682e-06, + "loss": 0.0106, + "step": 31540 + }, + { + "epoch": 1.8521779969472818, + "grad_norm": 0.838455855846405, + "learning_rate": 1.3497320855938855e-06, + "loss": 0.0138, + "step": 31550 + }, + { + "epoch": 1.852765058119056, + "grad_norm": 0.6983908414840698, + "learning_rate": 1.3390807223713886e-06, + "loss": 0.0088, + "step": 31560 + }, + { + "epoch": 1.8533521192908302, + "grad_norm": 0.18073470890522003, + "learning_rate": 1.328470982790142e-06, + "loss": 0.0028, + "step": 31570 + }, + { + "epoch": 1.8539391804626042, + "grad_norm": 0.08848128467798233, + "learning_rate": 1.3179028759255475e-06, + "loss": 0.004, + "step": 31580 + }, + { + "epoch": 1.8545262416343782, + "grad_norm": 0.6067091226577759, + "learning_rate": 1.3073764108173459e-06, + "loss": 0.0178, + "step": 31590 + }, + { + "epoch": 1.8551133028061524, + "grad_norm": 0.6836615204811096, + "learning_rate": 1.2968915964696904e-06, + "loss": 0.0018, + "step": 31600 + }, + { + "epoch": 1.8557003639779266, + "grad_norm": 0.6487021446228027, + "learning_rate": 1.2864484418510959e-06, + "loss": 0.0193, + "step": 31610 + }, + { + "epoch": 1.8562874251497006, + "grad_norm": 0.6123121976852417, + "learning_rate": 1.2760469558944277e-06, + "loss": 0.0094, + "step": 31620 + }, + { + "epoch": 1.8568744863214746, + "grad_norm": 2.8882272243499756, + "learning_rate": 1.2656871474969357e-06, + "loss": 0.0119, + "step": 31630 + }, + { + "epoch": 1.8574615474932488, + "grad_norm": 1.7780054807662964, + "learning_rate": 1.2553690255201977e-06, + "loss": 0.0137, + "step": 31640 + }, + { + "epoch": 1.858048608665023, + "grad_norm": 0.5469017028808594, + "learning_rate": 1.2450925987901595e-06, + "loss": 0.0081, + "step": 31650 + }, + { + "epoch": 1.858635669836797, + "grad_norm": 0.0939243733882904, + "learning_rate": 1.234857876097062e-06, + "loss": 0.0096, + "step": 31660 + }, + { + "epoch": 1.859222731008571, + "grad_norm": 0.1904410570859909, + "learning_rate": 1.224664866195513e-06, + "loss": 0.0021, + "step": 31670 + }, + { + "epoch": 1.8598097921803451, + "grad_norm": 0.26256752014160156, + 
"learning_rate": 1.214513577804416e-06, + "loss": 0.0267, + "step": 31680 + }, + { + "epoch": 1.8603968533521194, + "grad_norm": 0.22087575495243073, + "learning_rate": 1.204404019606986e-06, + "loss": 0.0191, + "step": 31690 + }, + { + "epoch": 1.8609839145238933, + "grad_norm": 0.507247805595398, + "learning_rate": 1.194336200250762e-06, + "loss": 0.0023, + "step": 31700 + }, + { + "epoch": 1.8615709756956675, + "grad_norm": 1.6338211297988892, + "learning_rate": 1.1843101283475655e-06, + "loss": 0.0091, + "step": 31710 + }, + { + "epoch": 1.8621580368674415, + "grad_norm": 0.023288412019610405, + "learning_rate": 1.174325812473509e-06, + "loss": 0.0101, + "step": 31720 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 3.4685113430023193, + "learning_rate": 1.1643832611689943e-06, + "loss": 0.0055, + "step": 31730 + }, + { + "epoch": 1.86333215921099, + "grad_norm": 0.18406684696674347, + "learning_rate": 1.1544824829386846e-06, + "loss": 0.01, + "step": 31740 + }, + { + "epoch": 1.863919220382764, + "grad_norm": 0.006672476418316364, + "learning_rate": 1.1446234862515225e-06, + "loss": 0.0034, + "step": 31750 + }, + { + "epoch": 1.864506281554538, + "grad_norm": 0.5018321871757507, + "learning_rate": 1.1348062795407233e-06, + "loss": 0.0067, + "step": 31760 + }, + { + "epoch": 1.865093342726312, + "grad_norm": 1.099258303642273, + "learning_rate": 1.1250308712037306e-06, + "loss": 0.0115, + "step": 31770 + }, + { + "epoch": 1.8656804038980863, + "grad_norm": 0.03937417268753052, + "learning_rate": 1.1152972696022445e-06, + "loss": 0.0028, + "step": 31780 + }, + { + "epoch": 1.8662674650698603, + "grad_norm": 0.09545544534921646, + "learning_rate": 1.105605483062211e-06, + "loss": 0.0215, + "step": 31790 + }, + { + "epoch": 1.8668545262416343, + "grad_norm": 0.21313613653182983, + "learning_rate": 1.0959555198738037e-06, + "loss": 0.0022, + "step": 31800 + }, + { + "epoch": 1.8674415874134085, + "grad_norm": 0.9208429455757141, + "learning_rate": 1.0863473882914143e-06, + "loss": 0.0061, + "step": 31810 + }, + { + "epoch": 1.8680286485851827, + "grad_norm": 0.745741605758667, + "learning_rate": 1.076781096533669e-06, + "loss": 0.0231, + "step": 31820 + }, + { + "epoch": 1.8686157097569567, + "grad_norm": 0.42319658398628235, + "learning_rate": 1.0672566527833827e-06, + "loss": 0.0133, + "step": 31830 + }, + { + "epoch": 1.8692027709287307, + "grad_norm": 0.4425826072692871, + "learning_rate": 1.0577740651876001e-06, + "loss": 0.0118, + "step": 31840 + }, + { + "epoch": 1.8697898321005049, + "grad_norm": 0.39463743567466736, + "learning_rate": 1.048333341857538e-06, + "loss": 0.0017, + "step": 31850 + }, + { + "epoch": 1.870376893272279, + "grad_norm": 0.3169250190258026, + "learning_rate": 1.0389344908686205e-06, + "loss": 0.0086, + "step": 31860 + }, + { + "epoch": 1.870963954444053, + "grad_norm": 1.0209932327270508, + "learning_rate": 1.0295775202604495e-06, + "loss": 0.0105, + "step": 31870 + }, + { + "epoch": 1.871551015615827, + "grad_norm": 0.009926820173859596, + "learning_rate": 1.020262438036801e-06, + "loss": 0.0028, + "step": 31880 + }, + { + "epoch": 1.8721380767876012, + "grad_norm": 1.3148220777511597, + "learning_rate": 1.0109892521656283e-06, + "loss": 0.0065, + "step": 31890 + }, + { + "epoch": 1.8727251379593755, + "grad_norm": 1.018448829650879, + "learning_rate": 1.0017579705790314e-06, + "loss": 0.0122, + "step": 31900 + }, + { + "epoch": 1.8733121991311494, + "grad_norm": 0.7915276885032654, + "learning_rate": 9.925686011732826e-07, + "loss": 0.012, + "step": 
31910 + }, + { + "epoch": 1.8738992603029234, + "grad_norm": 0.25855815410614014, + "learning_rate": 9.834211518087887e-07, + "loss": 0.0028, + "step": 31920 + }, + { + "epoch": 1.8744863214746976, + "grad_norm": 0.4287143647670746, + "learning_rate": 9.743156303101185e-07, + "loss": 0.008, + "step": 31930 + }, + { + "epoch": 1.8750733826464718, + "grad_norm": 1.286320447921753, + "learning_rate": 9.652520444659585e-07, + "loss": 0.0135, + "step": 31940 + }, + { + "epoch": 1.8756604438182458, + "grad_norm": 0.2922452688217163, + "learning_rate": 9.562304020291346e-07, + "loss": 0.0097, + "step": 31950 + }, + { + "epoch": 1.8762475049900198, + "grad_norm": 0.023812199011445045, + "learning_rate": 9.472507107165852e-07, + "loss": 0.0032, + "step": 31960 + }, + { + "epoch": 1.876834566161794, + "grad_norm": 0.014829243533313274, + "learning_rate": 9.383129782093713e-07, + "loss": 0.0047, + "step": 31970 + }, + { + "epoch": 1.8774216273335682, + "grad_norm": 0.3453643023967743, + "learning_rate": 9.294172121526668e-07, + "loss": 0.0108, + "step": 31980 + }, + { + "epoch": 1.8780086885053424, + "grad_norm": 0.00518003711476922, + "learning_rate": 9.205634201557456e-07, + "loss": 0.0043, + "step": 31990 + }, + { + "epoch": 1.8785957496771164, + "grad_norm": 0.12136801332235336, + "learning_rate": 9.11751609791972e-07, + "loss": 0.0088, + "step": 32000 + }, + { + "epoch": 1.8791828108488904, + "grad_norm": 1.9268333911895752, + "learning_rate": 9.029817885988001e-07, + "loss": 0.006, + "step": 32010 + }, + { + "epoch": 1.8797698720206646, + "grad_norm": 0.05153471976518631, + "learning_rate": 8.942539640777792e-07, + "loss": 0.0064, + "step": 32020 + }, + { + "epoch": 1.8803569331924388, + "grad_norm": 1.7470871210098267, + "learning_rate": 8.855681436945206e-07, + "loss": 0.0045, + "step": 32030 + }, + { + "epoch": 1.8809439943642128, + "grad_norm": 0.8505917191505432, + "learning_rate": 8.769243348787148e-07, + "loss": 0.0111, + "step": 32040 + }, + { + "epoch": 1.8815310555359868, + "grad_norm": 0.028677623718976974, + "learning_rate": 8.683225450241139e-07, + "loss": 0.0073, + "step": 32050 + }, + { + "epoch": 1.882118116707761, + "grad_norm": 0.007410742342472076, + "learning_rate": 8.597627814885323e-07, + "loss": 0.0085, + "step": 32060 + }, + { + "epoch": 1.8827051778795352, + "grad_norm": 1.685816764831543, + "learning_rate": 8.512450515938298e-07, + "loss": 0.0139, + "step": 32070 + }, + { + "epoch": 1.8832922390513092, + "grad_norm": 2.025792360305786, + "learning_rate": 8.427693626259114e-07, + "loss": 0.0088, + "step": 32080 + }, + { + "epoch": 1.8838793002230831, + "grad_norm": 0.26576316356658936, + "learning_rate": 8.343357218347226e-07, + "loss": 0.0056, + "step": 32090 + }, + { + "epoch": 1.8844663613948573, + "grad_norm": 0.031764790415763855, + "learning_rate": 8.25944136434248e-07, + "loss": 0.0025, + "step": 32100 + }, + { + "epoch": 1.8850534225666316, + "grad_norm": 0.006499218754470348, + "learning_rate": 8.175946136024792e-07, + "loss": 0.0046, + "step": 32110 + }, + { + "epoch": 1.8856404837384055, + "grad_norm": 1.3531373739242554, + "learning_rate": 8.092871604814645e-07, + "loss": 0.0119, + "step": 32120 + }, + { + "epoch": 1.8862275449101795, + "grad_norm": 0.14757950603961945, + "learning_rate": 8.01021784177225e-07, + "loss": 0.006, + "step": 32130 + }, + { + "epoch": 1.8868146060819537, + "grad_norm": 0.24395480751991272, + "learning_rate": 7.927984917598164e-07, + "loss": 0.0121, + "step": 32140 + }, + { + "epoch": 1.887401667253728, + "grad_norm": 
3.652184247970581, + "learning_rate": 7.846172902632842e-07, + "loss": 0.009, + "step": 32150 + }, + { + "epoch": 1.887988728425502, + "grad_norm": 3.0004146099090576, + "learning_rate": 7.764781866856808e-07, + "loss": 0.0117, + "step": 32160 + }, + { + "epoch": 1.888575789597276, + "grad_norm": 0.6791203022003174, + "learning_rate": 7.683811879890479e-07, + "loss": 0.0074, + "step": 32170 + }, + { + "epoch": 1.88916285076905, + "grad_norm": 0.012861160561442375, + "learning_rate": 7.603263010993955e-07, + "loss": 0.0058, + "step": 32180 + }, + { + "epoch": 1.8897499119408243, + "grad_norm": 0.12536686658859253, + "learning_rate": 7.523135329067343e-07, + "loss": 0.0023, + "step": 32190 + }, + { + "epoch": 1.8903369731125983, + "grad_norm": 0.11532700061798096, + "learning_rate": 7.443428902650262e-07, + "loss": 0.0074, + "step": 32200 + }, + { + "epoch": 1.8909240342843723, + "grad_norm": 0.24618609249591827, + "learning_rate": 7.364143799922119e-07, + "loss": 0.0063, + "step": 32210 + }, + { + "epoch": 1.8915110954561465, + "grad_norm": 1.8910354375839233, + "learning_rate": 7.285280088701996e-07, + "loss": 0.0099, + "step": 32220 + }, + { + "epoch": 1.8920981566279207, + "grad_norm": 0.39975517988204956, + "learning_rate": 7.206837836448377e-07, + "loss": 0.0044, + "step": 32230 + }, + { + "epoch": 1.8926852177996947, + "grad_norm": 0.0536993108689785, + "learning_rate": 7.128817110259312e-07, + "loss": 0.0094, + "step": 32240 + }, + { + "epoch": 1.8932722789714689, + "grad_norm": 0.5875653624534607, + "learning_rate": 7.051217976872248e-07, + "loss": 0.0022, + "step": 32250 + }, + { + "epoch": 1.8938593401432429, + "grad_norm": 0.9675849676132202, + "learning_rate": 6.974040502664092e-07, + "loss": 0.0065, + "step": 32260 + }, + { + "epoch": 1.894446401315017, + "grad_norm": 0.06338284909725189, + "learning_rate": 6.897284753650924e-07, + "loss": 0.0193, + "step": 32270 + }, + { + "epoch": 1.8950334624867913, + "grad_norm": 1.0650804042816162, + "learning_rate": 6.820950795488223e-07, + "loss": 0.0064, + "step": 32280 + }, + { + "epoch": 1.8956205236585653, + "grad_norm": 0.032856181263923645, + "learning_rate": 6.745038693470651e-07, + "loss": 0.0124, + "step": 32290 + }, + { + "epoch": 1.8962075848303392, + "grad_norm": 0.3160998821258545, + "learning_rate": 6.669548512531986e-07, + "loss": 0.0074, + "step": 32300 + }, + { + "epoch": 1.8967946460021134, + "grad_norm": 0.4787406325340271, + "learning_rate": 6.594480317245133e-07, + "loss": 0.0076, + "step": 32310 + }, + { + "epoch": 1.8973817071738877, + "grad_norm": 0.12755951285362244, + "learning_rate": 6.519834171822003e-07, + "loss": 0.0068, + "step": 32320 + }, + { + "epoch": 1.8979687683456616, + "grad_norm": 0.119247205555439, + "learning_rate": 6.445610140113467e-07, + "loss": 0.004, + "step": 32330 + }, + { + "epoch": 1.8985558295174356, + "grad_norm": 0.19741986691951752, + "learning_rate": 6.371808285609515e-07, + "loss": 0.0078, + "step": 32340 + }, + { + "epoch": 1.8991428906892098, + "grad_norm": 2.4395265579223633, + "learning_rate": 6.298428671438705e-07, + "loss": 0.0024, + "step": 32350 + }, + { + "epoch": 1.899729951860984, + "grad_norm": 0.23129431903362274, + "learning_rate": 6.225471360368773e-07, + "loss": 0.007, + "step": 32360 + }, + { + "epoch": 1.900317013032758, + "grad_norm": 0.9480741620063782, + "learning_rate": 6.152936414805854e-07, + "loss": 0.0132, + "step": 32370 + }, + { + "epoch": 1.900904074204532, + "grad_norm": 2.785883903503418, + "learning_rate": 6.080823896795095e-07, + "loss": 0.0114, + 
"step": 32380 + }, + { + "epoch": 1.9014911353763062, + "grad_norm": 3.1118991374969482, + "learning_rate": 6.009133868020156e-07, + "loss": 0.006, + "step": 32390 + }, + { + "epoch": 1.9020781965480804, + "grad_norm": 0.6544510126113892, + "learning_rate": 5.93786638980337e-07, + "loss": 0.0062, + "step": 32400 + }, + { + "epoch": 1.9026652577198544, + "grad_norm": 0.31970447301864624, + "learning_rate": 5.867021523105587e-07, + "loss": 0.0047, + "step": 32410 + }, + { + "epoch": 1.9032523188916284, + "grad_norm": 0.0359501838684082, + "learning_rate": 5.796599328526219e-07, + "loss": 0.0112, + "step": 32420 + }, + { + "epoch": 1.9038393800634026, + "grad_norm": 1.2342957258224487, + "learning_rate": 5.726599866303084e-07, + "loss": 0.005, + "step": 32430 + }, + { + "epoch": 1.9044264412351768, + "grad_norm": 1.5653058290481567, + "learning_rate": 5.657023196312394e-07, + "loss": 0.005, + "step": 32440 + }, + { + "epoch": 1.9050135024069508, + "grad_norm": 0.22834111750125885, + "learning_rate": 5.587869378068711e-07, + "loss": 0.0074, + "step": 32450 + }, + { + "epoch": 1.9056005635787248, + "grad_norm": 1.2102018594741821, + "learning_rate": 5.519138470724938e-07, + "loss": 0.0196, + "step": 32460 + }, + { + "epoch": 1.906187624750499, + "grad_norm": 0.04645948112010956, + "learning_rate": 5.450830533072271e-07, + "loss": 0.0037, + "step": 32470 + }, + { + "epoch": 1.9067746859222732, + "grad_norm": 0.4139319062232971, + "learning_rate": 5.38294562353997e-07, + "loss": 0.0063, + "step": 32480 + }, + { + "epoch": 1.9073617470940472, + "grad_norm": 1.47207772731781, + "learning_rate": 5.315483800195531e-07, + "loss": 0.0055, + "step": 32490 + }, + { + "epoch": 1.9079488082658211, + "grad_norm": 3.1884827613830566, + "learning_rate": 5.248445120744516e-07, + "loss": 0.0332, + "step": 32500 + }, + { + "epoch": 1.9085358694375953, + "grad_norm": 0.4252760708332062, + "learning_rate": 5.181829642530667e-07, + "loss": 0.0095, + "step": 32510 + }, + { + "epoch": 1.9091229306093696, + "grad_norm": 1.024160623550415, + "learning_rate": 5.115637422535513e-07, + "loss": 0.005, + "step": 32520 + }, + { + "epoch": 1.9097099917811438, + "grad_norm": 1.1204752922058105, + "learning_rate": 5.049868517378653e-07, + "loss": 0.0119, + "step": 32530 + }, + { + "epoch": 1.9102970529529177, + "grad_norm": 0.904366135597229, + "learning_rate": 4.984522983317641e-07, + "loss": 0.0161, + "step": 32540 + }, + { + "epoch": 1.9108841141246917, + "grad_norm": 0.027965368703007698, + "learning_rate": 4.919600876247709e-07, + "loss": 0.0086, + "step": 32550 + }, + { + "epoch": 1.911471175296466, + "grad_norm": 1.2174148559570312, + "learning_rate": 4.855102251702159e-07, + "loss": 0.0053, + "step": 32560 + }, + { + "epoch": 1.9120582364682401, + "grad_norm": 0.274185448884964, + "learning_rate": 4.791027164851803e-07, + "loss": 0.0091, + "step": 32570 + }, + { + "epoch": 1.9126452976400141, + "grad_norm": 0.632271409034729, + "learning_rate": 4.727375670505352e-07, + "loss": 0.0193, + "step": 32580 + }, + { + "epoch": 1.913232358811788, + "grad_norm": 0.3477608263492584, + "learning_rate": 4.6641478231090327e-07, + "loss": 0.008, + "step": 32590 + }, + { + "epoch": 1.9138194199835623, + "grad_norm": 0.49516561627388, + "learning_rate": 4.6013436767468053e-07, + "loss": 0.0051, + "step": 32600 + }, + { + "epoch": 1.9144064811553365, + "grad_norm": 2.593196392059326, + "learning_rate": 4.538963285140141e-07, + "loss": 0.0116, + "step": 32610 + }, + { + "epoch": 1.9149935423271105, + "grad_norm": 0.1109221875667572, + 
"learning_rate": 4.477006701648079e-07, + "loss": 0.0056, + "step": 32620 + }, + { + "epoch": 1.9155806034988845, + "grad_norm": 0.20114627480506897, + "learning_rate": 4.4154739792670594e-07, + "loss": 0.0055, + "step": 32630 + }, + { + "epoch": 1.9161676646706587, + "grad_norm": 2.1865549087524414, + "learning_rate": 4.3543651706312026e-07, + "loss": 0.0217, + "step": 32640 + }, + { + "epoch": 1.916754725842433, + "grad_norm": 2.8717215061187744, + "learning_rate": 4.29368032801164e-07, + "loss": 0.0127, + "step": 32650 + }, + { + "epoch": 1.9173417870142069, + "grad_norm": 0.0360056534409523, + "learning_rate": 4.233419503317182e-07, + "loss": 0.0053, + "step": 32660 + }, + { + "epoch": 1.9179288481859809, + "grad_norm": 0.5166193246841431, + "learning_rate": 4.1735827480937075e-07, + "loss": 0.0122, + "step": 32670 + }, + { + "epoch": 1.918515909357755, + "grad_norm": 0.6271770000457764, + "learning_rate": 4.114170113524496e-07, + "loss": 0.0115, + "step": 32680 + }, + { + "epoch": 1.9191029705295293, + "grad_norm": 0.7508412003517151, + "learning_rate": 4.055181650430062e-07, + "loss": 0.009, + "step": 32690 + }, + { + "epoch": 1.9196900317013033, + "grad_norm": 0.10411766171455383, + "learning_rate": 3.996617409268044e-07, + "loss": 0.0024, + "step": 32700 + }, + { + "epoch": 1.9202770928730772, + "grad_norm": 1.5734776258468628, + "learning_rate": 3.9384774401330924e-07, + "loss": 0.0068, + "step": 32710 + }, + { + "epoch": 1.9208641540448514, + "grad_norm": 2.1649329662323, + "learning_rate": 3.880761792757148e-07, + "loss": 0.0201, + "step": 32720 + }, + { + "epoch": 1.9214512152166257, + "grad_norm": 0.21282173693180084, + "learning_rate": 3.823470516508998e-07, + "loss": 0.0051, + "step": 32730 + }, + { + "epoch": 1.9220382763883996, + "grad_norm": 2.166048288345337, + "learning_rate": 3.766603660394663e-07, + "loss": 0.0073, + "step": 32740 + }, + { + "epoch": 1.9226253375601736, + "grad_norm": 3.5161662101745605, + "learning_rate": 3.7101612730569004e-07, + "loss": 0.0165, + "step": 32750 + }, + { + "epoch": 1.9232123987319478, + "grad_norm": 1.045634388923645, + "learning_rate": 3.654143402775478e-07, + "loss": 0.0428, + "step": 32760 + }, + { + "epoch": 1.923799459903722, + "grad_norm": 0.6151355504989624, + "learning_rate": 3.598550097467068e-07, + "loss": 0.0121, + "step": 32770 + }, + { + "epoch": 1.924386521075496, + "grad_norm": 0.0393298976123333, + "learning_rate": 3.543381404685131e-07, + "loss": 0.0062, + "step": 32780 + }, + { + "epoch": 1.9249735822472702, + "grad_norm": 0.01743028312921524, + "learning_rate": 3.4886373716199184e-07, + "loss": 0.004, + "step": 32790 + }, + { + "epoch": 1.9255606434190442, + "grad_norm": 0.00625281548127532, + "learning_rate": 3.434318045098417e-07, + "loss": 0.0077, + "step": 32800 + }, + { + "epoch": 1.9261477045908184, + "grad_norm": 0.21253876388072968, + "learning_rate": 3.380423471584515e-07, + "loss": 0.0055, + "step": 32810 + }, + { + "epoch": 1.9267347657625926, + "grad_norm": 1.02682363986969, + "learning_rate": 3.3269536971784474e-07, + "loss": 0.0161, + "step": 32820 + }, + { + "epoch": 1.9273218269343666, + "grad_norm": 0.9867899417877197, + "learning_rate": 3.2739087676173506e-07, + "loss": 0.0134, + "step": 32830 + }, + { + "epoch": 1.9279088881061406, + "grad_norm": 0.4015611708164215, + "learning_rate": 3.2212887282748737e-07, + "loss": 0.0141, + "step": 32840 + }, + { + "epoch": 1.9284959492779148, + "grad_norm": 0.18810132145881653, + "learning_rate": 3.169093624161179e-07, + "loss": 0.0056, + "step": 32850 + }, 
+ { + "epoch": 1.929083010449689, + "grad_norm": 0.024899575859308243, + "learning_rate": 3.1173234999229973e-07, + "loss": 0.009, + "step": 32860 + }, + { + "epoch": 1.929670071621463, + "grad_norm": 0.06882507354021072, + "learning_rate": 3.0659783998435165e-07, + "loss": 0.0177, + "step": 32870 + }, + { + "epoch": 1.930257132793237, + "grad_norm": 0.6683951616287231, + "learning_rate": 3.0150583678423825e-07, + "loss": 0.0113, + "step": 32880 + }, + { + "epoch": 1.9308441939650112, + "grad_norm": 0.11474218219518661, + "learning_rate": 2.9645634474756435e-07, + "loss": 0.0071, + "step": 32890 + }, + { + "epoch": 1.9314312551367854, + "grad_norm": 0.10880117863416672, + "learning_rate": 2.914493681935693e-07, + "loss": 0.0058, + "step": 32900 + }, + { + "epoch": 1.9320183163085594, + "grad_norm": 0.4817119240760803, + "learning_rate": 2.8648491140513266e-07, + "loss": 0.0046, + "step": 32910 + }, + { + "epoch": 1.9326053774803333, + "grad_norm": 0.23212800920009613, + "learning_rate": 2.815629786287577e-07, + "loss": 0.0118, + "step": 32920 + }, + { + "epoch": 1.9331924386521075, + "grad_norm": 0.7403205037117004, + "learning_rate": 2.766835740745599e-07, + "loss": 0.0082, + "step": 32930 + }, + { + "epoch": 1.9337794998238818, + "grad_norm": 0.22013318538665771, + "learning_rate": 2.718467019163118e-07, + "loss": 0.0074, + "step": 32940 + }, + { + "epoch": 1.9343665609956557, + "grad_norm": 1.4403728246688843, + "learning_rate": 2.670523662913649e-07, + "loss": 0.008, + "step": 32950 + }, + { + "epoch": 1.9349536221674297, + "grad_norm": 0.07458072155714035, + "learning_rate": 2.623005713007165e-07, + "loss": 0.0063, + "step": 32960 + }, + { + "epoch": 1.935540683339204, + "grad_norm": 1.189927339553833, + "learning_rate": 2.5759132100895975e-07, + "loss": 0.0115, + "step": 32970 + }, + { + "epoch": 1.9361277445109781, + "grad_norm": 0.8814135193824768, + "learning_rate": 2.529246194443002e-07, + "loss": 0.0055, + "step": 32980 + }, + { + "epoch": 1.9367148056827521, + "grad_norm": 0.20815473794937134, + "learning_rate": 2.4830047059853924e-07, + "loss": 0.0141, + "step": 32990 + }, + { + "epoch": 1.937301866854526, + "grad_norm": 1.3255386352539062, + "learning_rate": 2.4371887842709606e-07, + "loss": 0.0047, + "step": 33000 + }, + { + "epoch": 1.937301866854526, + "eval_loss": 0.5217077732086182, + "eval_runtime": 269.5892, + "eval_samples_per_second": 3.505, + "eval_steps_per_second": 3.505, + "step": 33000 + }, + { + "epoch": 1.9378889280263003, + "grad_norm": 0.8180147409439087, + "learning_rate": 2.391798468489803e-07, + "loss": 0.0045, + "step": 33010 + }, + { + "epoch": 1.9384759891980745, + "grad_norm": 0.17384904623031616, + "learning_rate": 2.3468337974678624e-07, + "loss": 0.0173, + "step": 33020 + }, + { + "epoch": 1.9390630503698485, + "grad_norm": 0.11741580069065094, + "learning_rate": 2.3022948096672049e-07, + "loss": 0.0051, + "step": 33030 + }, + { + "epoch": 1.9396501115416225, + "grad_norm": 0.29323628544807434, + "learning_rate": 2.258181543185467e-07, + "loss": 0.0137, + "step": 33040 + }, + { + "epoch": 1.9402371727133967, + "grad_norm": 0.5180500745773315, + "learning_rate": 2.2144940357565203e-07, + "loss": 0.0094, + "step": 33050 + }, + { + "epoch": 1.940824233885171, + "grad_norm": 0.010030320845544338, + "learning_rate": 2.1712323247496946e-07, + "loss": 0.0012, + "step": 33060 + }, + { + "epoch": 1.941411295056945, + "grad_norm": 0.08542950451374054, + "learning_rate": 2.1283964471703332e-07, + "loss": 0.0043, + "step": 33070 + }, + { + "epoch": 
1.941998356228719, + "grad_norm": 0.8036894798278809, + "learning_rate": 2.0859864396593488e-07, + "loss": 0.0155, + "step": 33080 + }, + { + "epoch": 1.942585417400493, + "grad_norm": 2.722127914428711, + "learning_rate": 2.044002338493556e-07, + "loss": 0.0113, + "step": 33090 + }, + { + "epoch": 1.9431724785722673, + "grad_norm": 0.0801001489162445, + "learning_rate": 2.0024441795853388e-07, + "loss": 0.0035, + "step": 33100 + }, + { + "epoch": 1.9437595397440415, + "grad_norm": 1.31703519821167, + "learning_rate": 1.961311998482762e-07, + "loss": 0.0126, + "step": 33110 + }, + { + "epoch": 1.9443466009158155, + "grad_norm": 0.5500128865242004, + "learning_rate": 1.9206058303695706e-07, + "loss": 0.0056, + "step": 33120 + }, + { + "epoch": 1.9449336620875894, + "grad_norm": 0.6608492136001587, + "learning_rate": 1.8803257100649675e-07, + "loss": 0.021, + "step": 33130 + }, + { + "epoch": 1.9455207232593636, + "grad_norm": 1.5913540124893188, + "learning_rate": 1.840471672023947e-07, + "loss": 0.0244, + "step": 33140 + }, + { + "epoch": 1.9461077844311379, + "grad_norm": 0.25986772775650024, + "learning_rate": 1.801043750336795e-07, + "loss": 0.0091, + "step": 33150 + }, + { + "epoch": 1.9466948456029118, + "grad_norm": 0.010232365690171719, + "learning_rate": 1.7620419787294785e-07, + "loss": 0.0074, + "step": 33160 + }, + { + "epoch": 1.9472819067746858, + "grad_norm": 2.475867509841919, + "learning_rate": 1.723466390563311e-07, + "loss": 0.0249, + "step": 33170 + }, + { + "epoch": 1.94786896794646, + "grad_norm": 2.9302568435668945, + "learning_rate": 1.6853170188352306e-07, + "loss": 0.0077, + "step": 33180 + }, + { + "epoch": 1.9484560291182342, + "grad_norm": 0.9175553321838379, + "learning_rate": 1.6475938961774683e-07, + "loss": 0.0048, + "step": 33190 + }, + { + "epoch": 1.9490430902900082, + "grad_norm": 1.592864990234375, + "learning_rate": 1.610297054857657e-07, + "loss": 0.0061, + "step": 33200 + }, + { + "epoch": 1.9496301514617822, + "grad_norm": 0.8533622622489929, + "learning_rate": 1.5734265267787763e-07, + "loss": 0.0179, + "step": 33210 + }, + { + "epoch": 1.9502172126335564, + "grad_norm": 2.1654791831970215, + "learning_rate": 1.5369823434792652e-07, + "loss": 0.0068, + "step": 33220 + }, + { + "epoch": 1.9508042738053306, + "grad_norm": 0.7100071310997009, + "learning_rate": 1.5009645361327983e-07, + "loss": 0.0105, + "step": 33230 + }, + { + "epoch": 1.9513913349771046, + "grad_norm": 0.04918665811419487, + "learning_rate": 1.465373135548287e-07, + "loss": 0.0081, + "step": 33240 + }, + { + "epoch": 1.9519783961488786, + "grad_norm": 0.3652174770832062, + "learning_rate": 1.4302081721699334e-07, + "loss": 0.0035, + "step": 33250 + }, + { + "epoch": 1.9525654573206528, + "grad_norm": 0.0118443313986063, + "learning_rate": 1.3954696760772323e-07, + "loss": 0.0059, + "step": 33260 + }, + { + "epoch": 1.953152518492427, + "grad_norm": 0.4527686834335327, + "learning_rate": 1.3611576769848034e-07, + "loss": 0.0133, + "step": 33270 + }, + { + "epoch": 1.953739579664201, + "grad_norm": 0.05000549182295799, + "learning_rate": 1.3272722042425577e-07, + "loss": 0.0113, + "step": 33280 + }, + { + "epoch": 1.954326640835975, + "grad_norm": 0.4897474944591522, + "learning_rate": 1.2938132868354768e-07, + "loss": 0.0089, + "step": 33290 + }, + { + "epoch": 1.9549137020077492, + "grad_norm": 0.08548981696367264, + "learning_rate": 1.2607809533836669e-07, + "loss": 0.0053, + "step": 33300 + }, + { + "epoch": 1.9555007631795234, + "grad_norm": 0.01599929668009281, + 
"learning_rate": 1.2281752321423589e-07, + "loss": 0.0104, + "step": 33310 + }, + { + "epoch": 1.9560878243512974, + "grad_norm": 0.350115031003952, + "learning_rate": 1.1959961510018546e-07, + "loss": 0.0083, + "step": 33320 + }, + { + "epoch": 1.9566748855230713, + "grad_norm": 1.249320387840271, + "learning_rate": 1.1642437374876913e-07, + "loss": 0.0139, + "step": 33330 + }, + { + "epoch": 1.9572619466948455, + "grad_norm": 0.07248983532190323, + "learning_rate": 1.1329180187600874e-07, + "loss": 0.0046, + "step": 33340 + }, + { + "epoch": 1.9578490078666198, + "grad_norm": 0.5684255361557007, + "learning_rate": 1.1020190216146086e-07, + "loss": 0.0042, + "step": 33350 + }, + { + "epoch": 1.958436069038394, + "grad_norm": 1.957879900932312, + "learning_rate": 1.071546772481613e-07, + "loss": 0.0054, + "step": 33360 + }, + { + "epoch": 1.959023130210168, + "grad_norm": 0.04956020042300224, + "learning_rate": 1.0415012974265281e-07, + "loss": 0.0033, + "step": 33370 + }, + { + "epoch": 1.959610191381942, + "grad_norm": 0.04298626631498337, + "learning_rate": 1.0118826221497401e-07, + "loss": 0.0118, + "step": 33380 + }, + { + "epoch": 1.9601972525537161, + "grad_norm": 0.26160338521003723, + "learning_rate": 9.82690771986372e-08, + "loss": 0.0045, + "step": 33390 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 0.9826566576957703, + "learning_rate": 9.539257719067274e-08, + "loss": 0.0057, + "step": 33400 + }, + { + "epoch": 1.9613713748972643, + "grad_norm": 0.45765307545661926, + "learning_rate": 9.25587646515791e-08, + "loss": 0.0039, + "step": 33410 + }, + { + "epoch": 1.9619584360690383, + "grad_norm": 0.016645647585392, + "learning_rate": 8.976764200534504e-08, + "loss": 0.0173, + "step": 33420 + }, + { + "epoch": 1.9625454972408125, + "grad_norm": 0.41816627979278564, + "learning_rate": 8.701921163944415e-08, + "loss": 0.0132, + "step": 33430 + }, + { + "epoch": 1.9631325584125867, + "grad_norm": 0.17216813564300537, + "learning_rate": 8.431347590483474e-08, + "loss": 0.0094, + "step": 33440 + }, + { + "epoch": 1.9637196195843607, + "grad_norm": 1.3329750299453735, + "learning_rate": 8.165043711595987e-08, + "loss": 0.0156, + "step": 33450 + }, + { + "epoch": 1.9643066807561347, + "grad_norm": 0.669681966304779, + "learning_rate": 7.903009755071967e-08, + "loss": 0.0057, + "step": 33460 + }, + { + "epoch": 1.9648937419279089, + "grad_norm": 1.701711654663086, + "learning_rate": 7.645245945051005e-08, + "loss": 0.012, + "step": 33470 + }, + { + "epoch": 1.965480803099683, + "grad_norm": 0.06459162384271622, + "learning_rate": 7.391752502019512e-08, + "loss": 0.0029, + "step": 33480 + }, + { + "epoch": 1.966067864271457, + "grad_norm": 0.01970849744975567, + "learning_rate": 7.142529642810703e-08, + "loss": 0.0077, + "step": 33490 + }, + { + "epoch": 1.966654925443231, + "grad_norm": 0.1700696051120758, + "learning_rate": 6.897577580606273e-08, + "loss": 0.017, + "step": 33500 + }, + { + "epoch": 1.9672419866150053, + "grad_norm": 0.00029024932882748544, + "learning_rate": 6.656896524931955e-08, + "loss": 0.0058, + "step": 33510 + }, + { + "epoch": 1.9678290477867795, + "grad_norm": 1.4260274171829224, + "learning_rate": 6.420486681663062e-08, + "loss": 0.0051, + "step": 33520 + }, + { + "epoch": 1.9684161089585535, + "grad_norm": 0.2738833427429199, + "learning_rate": 6.188348253019505e-08, + "loss": 0.0147, + "step": 33530 + }, + { + "epoch": 1.9690031701303274, + "grad_norm": 0.014176618307828903, + "learning_rate": 5.960481437568555e-08, + "loss": 0.0044, + "step": 33540 + 
}, + { + "epoch": 1.9695902313021016, + "grad_norm": 1.1728260517120361, + "learning_rate": 5.7368864302226324e-08, + "loss": 0.0115, + "step": 33550 + }, + { + "epoch": 1.9701772924738759, + "grad_norm": 1.0428615808486938, + "learning_rate": 5.517563422241523e-08, + "loss": 0.0034, + "step": 33560 + }, + { + "epoch": 1.9707643536456498, + "grad_norm": 0.09201149642467499, + "learning_rate": 5.3025126012301586e-08, + "loss": 0.0054, + "step": 33570 + }, + { + "epoch": 1.9713514148174238, + "grad_norm": 0.08367258310317993, + "learning_rate": 5.091734151138061e-08, + "loss": 0.0045, + "step": 33580 + }, + { + "epoch": 1.971938475989198, + "grad_norm": 0.30681926012039185, + "learning_rate": 4.8852282522615646e-08, + "loss": 0.0025, + "step": 33590 + }, + { + "epoch": 1.9725255371609722, + "grad_norm": 0.10407868772745132, + "learning_rate": 4.6829950812421474e-08, + "loss": 0.0081, + "step": 33600 + }, + { + "epoch": 1.9731125983327464, + "grad_norm": 0.018392860889434814, + "learning_rate": 4.48503481106588e-08, + "loss": 0.0034, + "step": 33610 + }, + { + "epoch": 1.9736996595045204, + "grad_norm": 0.4253391921520233, + "learning_rate": 4.2913476110650887e-08, + "loss": 0.0051, + "step": 33620 + }, + { + "epoch": 1.9742867206762944, + "grad_norm": 0.7737009525299072, + "learning_rate": 4.101933646915024e-08, + "loss": 0.0093, + "step": 33630 + }, + { + "epoch": 1.9748737818480686, + "grad_norm": 2.204073429107666, + "learning_rate": 3.9167930806377485e-08, + "loss": 0.0109, + "step": 33640 + }, + { + "epoch": 1.9754608430198428, + "grad_norm": 0.09515777230262756, + "learning_rate": 3.7359260705993604e-08, + "loss": 0.001, + "step": 33650 + }, + { + "epoch": 1.9760479041916168, + "grad_norm": 0.03296930715441704, + "learning_rate": 3.559332771508883e-08, + "loss": 0.004, + "step": 33660 + }, + { + "epoch": 1.9766349653633908, + "grad_norm": 0.04820576310157776, + "learning_rate": 3.387013334421596e-08, + "loss": 0.0018, + "step": 33670 + }, + { + "epoch": 1.977222026535165, + "grad_norm": 0.07194074988365173, + "learning_rate": 3.2189679067368136e-08, + "loss": 0.0103, + "step": 33680 + }, + { + "epoch": 1.9778090877069392, + "grad_norm": 0.06059109419584274, + "learning_rate": 3.055196632196222e-08, + "loss": 0.0052, + "step": 33690 + }, + { + "epoch": 1.9783961488787132, + "grad_norm": 0.038826677948236465, + "learning_rate": 2.8956996508883172e-08, + "loss": 0.0078, + "step": 33700 + }, + { + "epoch": 1.9789832100504872, + "grad_norm": 0.8903669118881226, + "learning_rate": 2.7404770992423002e-08, + "loss": 0.0051, + "step": 33710 + }, + { + "epoch": 1.9795702712222614, + "grad_norm": 0.08376505970954895, + "learning_rate": 2.5895291100336282e-08, + "loss": 0.0159, + "step": 33720 + }, + { + "epoch": 1.9801573323940356, + "grad_norm": 1.5218383073806763, + "learning_rate": 2.4428558123795743e-08, + "loss": 0.011, + "step": 33730 + }, + { + "epoch": 1.9807443935658096, + "grad_norm": 1.9102550745010376, + "learning_rate": 2.3004573317431112e-08, + "loss": 0.0141, + "step": 33740 + }, + { + "epoch": 1.9813314547375835, + "grad_norm": 0.3464989960193634, + "learning_rate": 2.1623337899279173e-08, + "loss": 0.0045, + "step": 33750 + }, + { + "epoch": 1.9819185159093577, + "grad_norm": 2.3063037395477295, + "learning_rate": 2.0284853050828166e-08, + "loss": 0.0075, + "step": 33760 + }, + { + "epoch": 1.982505577081132, + "grad_norm": 0.6410544514656067, + "learning_rate": 1.898911991699004e-08, + "loss": 0.0098, + "step": 33770 + }, + { + "epoch": 1.983092638252906, + "grad_norm": 
0.12403683364391327, + "learning_rate": 1.7736139606111534e-08, + "loss": 0.009, + "step": 33780 + }, + { + "epoch": 1.98367969942468, + "grad_norm": 0.5402272939682007, + "learning_rate": 1.6525913189974208e-08, + "loss": 0.0042, + "step": 33790 + }, + { + "epoch": 1.9842667605964541, + "grad_norm": 0.9701083898544312, + "learning_rate": 1.5358441703777758e-08, + "loss": 0.0221, + "step": 33800 + }, + { + "epoch": 1.9848538217682283, + "grad_norm": 0.9332861304283142, + "learning_rate": 1.42337261461567e-08, + "loss": 0.0076, + "step": 33810 + }, + { + "epoch": 1.9854408829400023, + "grad_norm": 0.9768741726875305, + "learning_rate": 1.3151767479169241e-08, + "loss": 0.0121, + "step": 33820 + }, + { + "epoch": 1.9860279441117763, + "grad_norm": 0.5883607864379883, + "learning_rate": 1.2112566628302846e-08, + "loss": 0.0124, + "step": 33830 + }, + { + "epoch": 1.9866150052835505, + "grad_norm": 0.31778550148010254, + "learning_rate": 1.1116124482479784e-08, + "loss": 0.0088, + "step": 33840 + }, + { + "epoch": 1.9872020664553247, + "grad_norm": 0.05427484214305878, + "learning_rate": 1.0162441894023822e-08, + "loss": 0.0152, + "step": 33850 + }, + { + "epoch": 1.9877891276270987, + "grad_norm": 0.04062940552830696, + "learning_rate": 9.251519678710186e-09, + "loss": 0.0044, + "step": 33860 + }, + { + "epoch": 1.9883761887988727, + "grad_norm": 0.5054002404212952, + "learning_rate": 8.383358615715598e-09, + "loss": 0.0101, + "step": 33870 + }, + { + "epoch": 1.9889632499706469, + "grad_norm": 0.013953852467238903, + "learning_rate": 7.557959447657137e-09, + "loss": 0.0291, + "step": 33880 + }, + { + "epoch": 1.989550311142421, + "grad_norm": 0.09007053822278976, + "learning_rate": 6.775322880553381e-09, + "loss": 0.0073, + "step": 33890 + }, + { + "epoch": 1.9901373723141953, + "grad_norm": 2.0966174602508545, + "learning_rate": 6.035449583868813e-09, + "loss": 0.0091, + "step": 33900 + }, + { + "epoch": 1.9907244334859693, + "grad_norm": 0.2234669178724289, + "learning_rate": 5.338340190469415e-09, + "loss": 0.0063, + "step": 33910 + }, + { + "epoch": 1.9913114946577433, + "grad_norm": 1.178725242614746, + "learning_rate": 4.6839952966559744e-09, + "loss": 0.0087, + "step": 33920 + }, + { + "epoch": 1.9918985558295175, + "grad_norm": 0.22748713195323944, + "learning_rate": 4.0724154621418766e-09, + "loss": 0.014, + "step": 33930 + }, + { + "epoch": 1.9924856170012917, + "grad_norm": 0.9043611884117126, + "learning_rate": 3.503601210053109e-09, + "loss": 0.0053, + "step": 33940 + }, + { + "epoch": 1.9930726781730657, + "grad_norm": 0.05220215767621994, + "learning_rate": 2.9775530269560146e-09, + "loss": 0.006, + "step": 33950 + }, + { + "epoch": 1.9936597393448396, + "grad_norm": 0.29552602767944336, + "learning_rate": 2.494271362807332e-09, + "loss": 0.0067, + "step": 33960 + }, + { + "epoch": 1.9942468005166138, + "grad_norm": 0.5266906023025513, + "learning_rate": 2.0537566310097065e-09, + "loss": 0.0043, + "step": 33970 + }, + { + "epoch": 1.994833861688388, + "grad_norm": 0.13338631391525269, + "learning_rate": 1.6560092083672817e-09, + "loss": 0.0062, + "step": 33980 + }, + { + "epoch": 1.995420922860162, + "grad_norm": 0.04648716747760773, + "learning_rate": 1.3010294351023523e-09, + "loss": 0.0031, + "step": 33990 + }, + { + "epoch": 1.996007984031936, + "grad_norm": 0.14625805616378784, + "learning_rate": 9.88817614860915e-10, + "loss": 0.0054, + "step": 34000 + }, + { + "epoch": 1.9965950452037102, + "grad_norm": 0.029483526945114136, + "learning_rate": 7.193740147015682e-10, + 
"loss": 0.0043, + "step": 34010 + }, + { + "epoch": 1.9971821063754844, + "grad_norm": 5.174992561340332, + "learning_rate": 4.926988651066111e-10, + "loss": 0.0135, + "step": 34020 + }, + { + "epoch": 1.9977691675472584, + "grad_norm": 0.21607564389705658, + "learning_rate": 3.087923599598419e-10, + "loss": 0.002, + "step": 34030 + }, + { + "epoch": 1.9983562287190324, + "grad_norm": 0.7728718519210815, + "learning_rate": 1.6765465658541424e-10, + "loss": 0.0118, + "step": 34040 + }, + { + "epoch": 1.9989432898908066, + "grad_norm": 0.17154139280319214, + "learning_rate": 6.928587569232647e-11, + "loss": 0.0162, + "step": 34050 + }, + { + "epoch": 1.9995303510625808, + "grad_norm": 1.6425248384475708, + "learning_rate": 1.3686101441034992e-11, + "loss": 0.006, + "step": 34060 + }, + { + "epoch": 2.0, + "step": 34068, + "total_flos": 4.396799291960525e+17, + "train_loss": 0.02699765918700505, + "train_runtime": 22406.0806, + "train_samples_per_second": 1.52, + "train_steps_per_second": 1.52 + } + ], + "logging_steps": 10, + "max_steps": 34068, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.396799291960525e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}