{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.372652927706902, "eval_steps": 1000, "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026169447170428526, "grad_norm": 0.5513588786125183, "learning_rate": 2.25e-07, "loss": 0.051, "step": 10 }, { "epoch": 0.005233889434085705, "grad_norm": 0.45345941185951233, "learning_rate": 4.75e-07, "loss": 0.0552, "step": 20 }, { "epoch": 0.007850834151128557, "grad_norm": 0.5479740500450134, "learning_rate": 7.25e-07, "loss": 0.0607, "step": 30 }, { "epoch": 0.01046777886817141, "grad_norm": 0.36328455805778503, "learning_rate": 9.75e-07, "loss": 0.0478, "step": 40 }, { "epoch": 0.013084723585214262, "grad_norm": 0.6090478301048279, "learning_rate": 1.2250000000000001e-06, "loss": 0.05, "step": 50 }, { "epoch": 0.015701668302257114, "grad_norm": 0.4246404469013214, "learning_rate": 1.475e-06, "loss": 0.0514, "step": 60 }, { "epoch": 0.01831861301929997, "grad_norm": 0.3821658194065094, "learning_rate": 1.7250000000000002e-06, "loss": 0.0576, "step": 70 }, { "epoch": 0.02093555773634282, "grad_norm": 0.4736561179161072, "learning_rate": 1.975e-06, "loss": 0.0503, "step": 80 }, { "epoch": 0.023552502453385672, "grad_norm": 0.3493232727050781, "learning_rate": 2.225e-06, "loss": 0.0521, "step": 90 }, { "epoch": 0.026169447170428524, "grad_norm": 0.3130541145801544, "learning_rate": 2.4750000000000004e-06, "loss": 0.049, "step": 100 }, { "epoch": 0.028786391887471376, "grad_norm": 0.40587979555130005, "learning_rate": 2.725e-06, "loss": 0.0569, "step": 110 }, { "epoch": 0.03140333660451423, "grad_norm": 0.2859586179256439, "learning_rate": 2.975e-06, "loss": 0.0504, "step": 120 }, { "epoch": 0.03402028132155708, "grad_norm": 0.38783594965934753, "learning_rate": 3.225e-06, "loss": 0.0447, "step": 130 }, { "epoch": 0.03663722603859994, "grad_norm": 0.24415196478366852, "learning_rate": 3.4750000000000006e-06, "loss": 0.0534, "step": 140 }, { "epoch": 0.03925417075564279, "grad_norm": 0.4478183388710022, "learning_rate": 3.725e-06, "loss": 0.057, "step": 150 }, { "epoch": 0.04187111547268564, "grad_norm": 0.4929760694503784, "learning_rate": 3.975e-06, "loss": 0.0522, "step": 160 }, { "epoch": 0.04448806018972849, "grad_norm": 0.4541402757167816, "learning_rate": 4.225e-06, "loss": 0.0484, "step": 170 }, { "epoch": 0.047105004906771344, "grad_norm": 0.34477248787879944, "learning_rate": 4.475e-06, "loss": 0.044, "step": 180 }, { "epoch": 0.049721949623814196, "grad_norm": 0.4181000292301178, "learning_rate": 4.7250000000000005e-06, "loss": 0.0489, "step": 190 }, { "epoch": 0.05233889434085705, "grad_norm": 0.38417401909828186, "learning_rate": 4.975000000000001e-06, "loss": 0.0479, "step": 200 }, { "epoch": 0.0549558390578999, "grad_norm": 0.34942829608917236, "learning_rate": 5.225e-06, "loss": 0.0481, "step": 210 }, { "epoch": 0.05757278377494275, "grad_norm": 0.5058625936508179, "learning_rate": 5.475e-06, "loss": 0.0508, "step": 220 }, { "epoch": 0.06018972849198561, "grad_norm": 0.39895838499069214, "learning_rate": 5.725e-06, "loss": 0.0518, "step": 230 }, { "epoch": 0.06280667320902845, "grad_norm": 0.32901522517204285, "learning_rate": 5.975e-06, "loss": 0.0552, "step": 240 }, { "epoch": 0.0654236179260713, "grad_norm": 0.3974279761314392, "learning_rate": 6.2250000000000005e-06, "loss": 0.0468, "step": 250 }, { "epoch": 0.06804056264311416, "grad_norm": 0.4717833697795868, "learning_rate": 6.475000000000001e-06, "loss": 0.05, "step": 260 }, { "epoch": 0.07065750736015702, "grad_norm": 0.3713611960411072, "learning_rate": 6.725000000000001e-06, "loss": 0.0486, "step": 270 }, { "epoch": 0.07327445207719988, "grad_norm": 0.3582926094532013, "learning_rate": 6.975000000000001e-06, "loss": 0.0451, "step": 280 }, { "epoch": 0.07589139679424273, "grad_norm": 0.3125925660133362, "learning_rate": 7.2249999999999994e-06, "loss": 0.0463, "step": 290 }, { "epoch": 0.07850834151128558, "grad_norm": 0.33565473556518555, "learning_rate": 7.4750000000000004e-06, "loss": 0.0512, "step": 300 }, { "epoch": 0.08112528622832843, "grad_norm": 0.6160638928413391, "learning_rate": 7.725e-06, "loss": 0.0529, "step": 310 }, { "epoch": 0.08374223094537128, "grad_norm": 0.3819540739059448, "learning_rate": 7.975e-06, "loss": 0.0417, "step": 320 }, { "epoch": 0.08635917566241413, "grad_norm": 0.34822750091552734, "learning_rate": 8.225e-06, "loss": 0.0464, "step": 330 }, { "epoch": 0.08897612037945699, "grad_norm": 0.47799152135849, "learning_rate": 8.475000000000001e-06, "loss": 0.0559, "step": 340 }, { "epoch": 0.09159306509649984, "grad_norm": 0.33403629064559937, "learning_rate": 8.725e-06, "loss": 0.0473, "step": 350 }, { "epoch": 0.09421000981354269, "grad_norm": 0.4290122091770172, "learning_rate": 8.975e-06, "loss": 0.0512, "step": 360 }, { "epoch": 0.09682695453058554, "grad_norm": 0.46314916014671326, "learning_rate": 9.225e-06, "loss": 0.0451, "step": 370 }, { "epoch": 0.09944389924762839, "grad_norm": 0.5091361403465271, "learning_rate": 9.475e-06, "loss": 0.0451, "step": 380 }, { "epoch": 0.10206084396467124, "grad_norm": 0.4810822308063507, "learning_rate": 9.725000000000001e-06, "loss": 0.0435, "step": 390 }, { "epoch": 0.1046777886817141, "grad_norm": 0.5177111625671387, "learning_rate": 9.975e-06, "loss": 0.0476, "step": 400 }, { "epoch": 0.10729473339875695, "grad_norm": 0.4763142764568329, "learning_rate": 1.0225e-05, "loss": 0.0493, "step": 410 }, { "epoch": 0.1099116781157998, "grad_norm": 0.4892769753932953, "learning_rate": 1.0475e-05, "loss": 0.048, "step": 420 }, { "epoch": 0.11252862283284265, "grad_norm": 0.41679662466049194, "learning_rate": 1.0725e-05, "loss": 0.052, "step": 430 }, { "epoch": 0.1151455675498855, "grad_norm": 0.5526682734489441, "learning_rate": 1.0975e-05, "loss": 0.046, "step": 440 }, { "epoch": 0.11776251226692837, "grad_norm": 0.46023502945899963, "learning_rate": 1.1225e-05, "loss": 0.0478, "step": 450 }, { "epoch": 0.12037945698397122, "grad_norm": 0.41873809695243835, "learning_rate": 1.1475000000000001e-05, "loss": 0.0439, "step": 460 }, { "epoch": 0.12299640170101407, "grad_norm": 0.3857043981552124, "learning_rate": 1.1725e-05, "loss": 0.0409, "step": 470 }, { "epoch": 0.1256133464180569, "grad_norm": 0.4027094841003418, "learning_rate": 1.1975e-05, "loss": 0.045, "step": 480 }, { "epoch": 0.12823029113509976, "grad_norm": 0.45145806670188904, "learning_rate": 1.2225e-05, "loss": 0.0453, "step": 490 }, { "epoch": 0.1308472358521426, "grad_norm": 0.3627549409866333, "learning_rate": 1.2475e-05, "loss": 0.045, "step": 500 }, { "epoch": 0.13346418056918546, "grad_norm": 0.4891456067562103, "learning_rate": 1.2725000000000001e-05, "loss": 0.0415, "step": 510 }, { "epoch": 0.13608112528622832, "grad_norm": 0.4617407023906708, "learning_rate": 1.2975e-05, "loss": 0.0454, "step": 520 }, { "epoch": 0.13869807000327117, "grad_norm": 0.5147842764854431, "learning_rate": 1.3225000000000001e-05, "loss": 0.047, "step": 530 }, { "epoch": 0.14131501472031405, "grad_norm": 0.47616279125213623, "learning_rate": 1.3475000000000002e-05, "loss": 0.0481, "step": 540 }, { "epoch": 0.1439319594373569, "grad_norm": 0.4826867878437042, "learning_rate": 1.3725000000000002e-05, "loss": 0.0381, "step": 550 }, { "epoch": 0.14654890415439975, "grad_norm": 0.4853934943675995, "learning_rate": 1.3975000000000003e-05, "loss": 0.0479, "step": 560 }, { "epoch": 0.1491658488714426, "grad_norm": 0.7997105121612549, "learning_rate": 1.4225e-05, "loss": 0.0519, "step": 570 }, { "epoch": 0.15178279358848545, "grad_norm": 0.352081298828125, "learning_rate": 1.4475e-05, "loss": 0.0469, "step": 580 }, { "epoch": 0.1543997383055283, "grad_norm": 0.5584960579872131, "learning_rate": 1.4725e-05, "loss": 0.045, "step": 590 }, { "epoch": 0.15701668302257116, "grad_norm": 0.5723614692687988, "learning_rate": 1.4975e-05, "loss": 0.0506, "step": 600 }, { "epoch": 0.159633627739614, "grad_norm": 0.608233630657196, "learning_rate": 1.5225e-05, "loss": 0.047, "step": 610 }, { "epoch": 0.16225057245665686, "grad_norm": 0.42129021883010864, "learning_rate": 1.5475e-05, "loss": 0.0462, "step": 620 }, { "epoch": 0.1648675171736997, "grad_norm": 0.2876960039138794, "learning_rate": 1.5725e-05, "loss": 0.0504, "step": 630 }, { "epoch": 0.16748446189074256, "grad_norm": 1.045732021331787, "learning_rate": 1.5975000000000002e-05, "loss": 0.0519, "step": 640 }, { "epoch": 0.17010140660778542, "grad_norm": 0.707153856754303, "learning_rate": 1.6225e-05, "loss": 0.0486, "step": 650 }, { "epoch": 0.17271835132482827, "grad_norm": 0.5502765774726868, "learning_rate": 1.6475e-05, "loss": 0.0515, "step": 660 }, { "epoch": 0.17533529604187112, "grad_norm": 0.6845604181289673, "learning_rate": 1.6725000000000003e-05, "loss": 0.0443, "step": 670 }, { "epoch": 0.17795224075891397, "grad_norm": 0.7459421157836914, "learning_rate": 1.6975000000000003e-05, "loss": 0.0428, "step": 680 }, { "epoch": 0.18056918547595682, "grad_norm": 0.4575180411338806, "learning_rate": 1.7225e-05, "loss": 0.042, "step": 690 }, { "epoch": 0.18318613019299967, "grad_norm": 0.8352943062782288, "learning_rate": 1.7475e-05, "loss": 0.0448, "step": 700 }, { "epoch": 0.18580307491004253, "grad_norm": 0.6489511728286743, "learning_rate": 1.7725e-05, "loss": 0.0463, "step": 710 }, { "epoch": 0.18842001962708538, "grad_norm": 0.5922821164131165, "learning_rate": 1.7975e-05, "loss": 0.0487, "step": 720 }, { "epoch": 0.19103696434412823, "grad_norm": 0.6881849765777588, "learning_rate": 1.8225e-05, "loss": 0.0492, "step": 730 }, { "epoch": 0.19365390906117108, "grad_norm": 0.6800899505615234, "learning_rate": 1.8475000000000002e-05, "loss": 0.0521, "step": 740 }, { "epoch": 0.19627085377821393, "grad_norm": 0.578558623790741, "learning_rate": 1.8725e-05, "loss": 0.0489, "step": 750 }, { "epoch": 0.19888779849525678, "grad_norm": 0.5541056990623474, "learning_rate": 1.8975e-05, "loss": 0.0487, "step": 760 }, { "epoch": 0.20150474321229964, "grad_norm": 0.6745200157165527, "learning_rate": 1.9225e-05, "loss": 0.0542, "step": 770 }, { "epoch": 0.2041216879293425, "grad_norm": 0.7663201093673706, "learning_rate": 1.9475000000000002e-05, "loss": 0.0502, "step": 780 }, { "epoch": 0.20673863264638534, "grad_norm": 0.5854843854904175, "learning_rate": 1.9725000000000002e-05, "loss": 0.0441, "step": 790 }, { "epoch": 0.2093555773634282, "grad_norm": 0.580001175403595, "learning_rate": 1.9975e-05, "loss": 0.0496, "step": 800 }, { "epoch": 0.21197252208047104, "grad_norm": 0.6295838356018066, "learning_rate": 2.0225000000000004e-05, "loss": 0.0415, "step": 810 }, { "epoch": 0.2145894667975139, "grad_norm": 0.656004786491394, "learning_rate": 2.0475e-05, "loss": 0.0472, "step": 820 }, { "epoch": 0.21720641151455675, "grad_norm": 0.5602745413780212, "learning_rate": 2.0725e-05, "loss": 0.0469, "step": 830 }, { "epoch": 0.2198233562315996, "grad_norm": 0.7688363790512085, "learning_rate": 2.0975e-05, "loss": 0.0503, "step": 840 }, { "epoch": 0.22244030094864245, "grad_norm": 0.5913244485855103, "learning_rate": 2.1225e-05, "loss": 0.0493, "step": 850 }, { "epoch": 0.2250572456656853, "grad_norm": 0.7088301777839661, "learning_rate": 2.1475e-05, "loss": 0.0498, "step": 860 }, { "epoch": 0.22767419038272815, "grad_norm": 0.5226425528526306, "learning_rate": 2.1725e-05, "loss": 0.0456, "step": 870 }, { "epoch": 0.230291135099771, "grad_norm": 0.5275906324386597, "learning_rate": 2.1975000000000002e-05, "loss": 0.0508, "step": 880 }, { "epoch": 0.23290807981681386, "grad_norm": 0.36874884366989136, "learning_rate": 2.2225e-05, "loss": 0.0506, "step": 890 }, { "epoch": 0.23552502453385674, "grad_norm": 0.44395896792411804, "learning_rate": 2.2475e-05, "loss": 0.0524, "step": 900 }, { "epoch": 0.2381419692508996, "grad_norm": 0.5883681178092957, "learning_rate": 2.2725000000000003e-05, "loss": 0.0465, "step": 910 }, { "epoch": 0.24075891396794244, "grad_norm": 0.48441359400749207, "learning_rate": 2.2975000000000003e-05, "loss": 0.0477, "step": 920 }, { "epoch": 0.2433758586849853, "grad_norm": 0.6230258345603943, "learning_rate": 2.3225000000000002e-05, "loss": 0.0535, "step": 930 }, { "epoch": 0.24599280340202814, "grad_norm": 0.7106624245643616, "learning_rate": 2.3475e-05, "loss": 0.0471, "step": 940 }, { "epoch": 0.248609748119071, "grad_norm": 0.7645530700683594, "learning_rate": 2.3725e-05, "loss": 0.0485, "step": 950 }, { "epoch": 0.2512266928361138, "grad_norm": 0.6872208714485168, "learning_rate": 2.3975e-05, "loss": 0.0516, "step": 960 }, { "epoch": 0.2538436375531567, "grad_norm": 0.705649733543396, "learning_rate": 2.4225e-05, "loss": 0.0535, "step": 970 }, { "epoch": 0.2564605822701995, "grad_norm": 0.533315896987915, "learning_rate": 2.4475000000000002e-05, "loss": 0.0469, "step": 980 }, { "epoch": 0.2590775269872424, "grad_norm": 0.5918316841125488, "learning_rate": 2.4725e-05, "loss": 0.0575, "step": 990 }, { "epoch": 0.2616944717042852, "grad_norm": 0.5549744963645935, "learning_rate": 2.4975e-05, "loss": 0.0475, "step": 1000 }, { "epoch": 0.2616944717042852, "eval_loss": 0.05487126120450602, "eval_runtime": 8.7333, "eval_samples_per_second": 117.252, "eval_steps_per_second": 1.832, "step": 1000 }, { "epoch": 0.2643114164213281, "grad_norm": 0.82178795337677, "learning_rate": 2.5225e-05, "loss": 0.0504, "step": 1010 }, { "epoch": 0.26692836113837093, "grad_norm": 0.7854476571083069, "learning_rate": 2.5475e-05, "loss": 0.0549, "step": 1020 }, { "epoch": 0.2695453058554138, "grad_norm": 0.43605050444602966, "learning_rate": 2.5725e-05, "loss": 0.0482, "step": 1030 }, { "epoch": 0.27216225057245663, "grad_norm": 0.4977242350578308, "learning_rate": 2.5974999999999998e-05, "loss": 0.0432, "step": 1040 }, { "epoch": 0.2747791952894995, "grad_norm": 0.5008284449577332, "learning_rate": 2.6225e-05, "loss": 0.0445, "step": 1050 }, { "epoch": 0.27739614000654234, "grad_norm": 0.5448734164237976, "learning_rate": 2.6475e-05, "loss": 0.0456, "step": 1060 }, { "epoch": 0.2800130847235852, "grad_norm": 0.6299842596054077, "learning_rate": 2.6725e-05, "loss": 0.0525, "step": 1070 }, { "epoch": 0.2826300294406281, "grad_norm": 0.5707956552505493, "learning_rate": 2.6975000000000002e-05, "loss": 0.0545, "step": 1080 }, { "epoch": 0.2852469741576709, "grad_norm": 0.9552313685417175, "learning_rate": 2.7225e-05, "loss": 0.0496, "step": 1090 }, { "epoch": 0.2878639188747138, "grad_norm": 0.6557886600494385, "learning_rate": 2.7475e-05, "loss": 0.0447, "step": 1100 }, { "epoch": 0.2904808635917566, "grad_norm": 0.6832493543624878, "learning_rate": 2.7725e-05, "loss": 0.0461, "step": 1110 }, { "epoch": 0.2930978083087995, "grad_norm": 0.38393524289131165, "learning_rate": 2.7975000000000002e-05, "loss": 0.0486, "step": 1120 }, { "epoch": 0.2957147530258423, "grad_norm": 0.894242525100708, "learning_rate": 2.8225e-05, "loss": 0.0555, "step": 1130 }, { "epoch": 0.2983316977428852, "grad_norm": 0.8547376990318298, "learning_rate": 2.8475e-05, "loss": 0.0515, "step": 1140 }, { "epoch": 0.30094864245992803, "grad_norm": 0.48917195200920105, "learning_rate": 2.8725e-05, "loss": 0.0479, "step": 1150 }, { "epoch": 0.3035655871769709, "grad_norm": 0.6105035543441772, "learning_rate": 2.8975000000000003e-05, "loss": 0.0527, "step": 1160 }, { "epoch": 0.30618253189401373, "grad_norm": 0.6663628816604614, "learning_rate": 2.9225000000000002e-05, "loss": 0.0526, "step": 1170 }, { "epoch": 0.3087994766110566, "grad_norm": 0.6550115942955017, "learning_rate": 2.9475e-05, "loss": 0.0506, "step": 1180 }, { "epoch": 0.31141642132809944, "grad_norm": 0.5986365675926208, "learning_rate": 2.9725000000000004e-05, "loss": 0.0459, "step": 1190 }, { "epoch": 0.3140333660451423, "grad_norm": 0.6375890970230103, "learning_rate": 2.9975000000000004e-05, "loss": 0.0543, "step": 1200 }, { "epoch": 0.31665031076218514, "grad_norm": 0.7755595445632935, "learning_rate": 3.0225000000000003e-05, "loss": 0.0466, "step": 1210 }, { "epoch": 0.319267255479228, "grad_norm": 0.8417050838470459, "learning_rate": 3.0475000000000002e-05, "loss": 0.0516, "step": 1220 }, { "epoch": 0.32188420019627084, "grad_norm": 0.6634261608123779, "learning_rate": 3.0725e-05, "loss": 0.0544, "step": 1230 }, { "epoch": 0.3245011449133137, "grad_norm": 0.8552030920982361, "learning_rate": 3.0975e-05, "loss": 0.0538, "step": 1240 }, { "epoch": 0.32711808963035655, "grad_norm": 0.6499819755554199, "learning_rate": 3.122500000000001e-05, "loss": 0.0507, "step": 1250 }, { "epoch": 0.3297350343473994, "grad_norm": 0.8389486074447632, "learning_rate": 3.1475e-05, "loss": 0.0513, "step": 1260 }, { "epoch": 0.33235197906444225, "grad_norm": 0.7545788288116455, "learning_rate": 3.1725e-05, "loss": 0.0508, "step": 1270 }, { "epoch": 0.33496892378148513, "grad_norm": 0.739088773727417, "learning_rate": 3.1975e-05, "loss": 0.0592, "step": 1280 }, { "epoch": 0.33758586849852795, "grad_norm": 0.629649817943573, "learning_rate": 3.2225e-05, "loss": 0.0537, "step": 1290 }, { "epoch": 0.34020281321557083, "grad_norm": 0.6471114158630371, "learning_rate": 3.2474999999999997e-05, "loss": 0.0569, "step": 1300 }, { "epoch": 0.34281975793261366, "grad_norm": 0.5596538782119751, "learning_rate": 3.2725e-05, "loss": 0.052, "step": 1310 }, { "epoch": 0.34543670264965654, "grad_norm": 0.46181052923202515, "learning_rate": 3.2975e-05, "loss": 0.0451, "step": 1320 }, { "epoch": 0.34805364736669936, "grad_norm": 0.6883523464202881, "learning_rate": 3.3225e-05, "loss": 0.0634, "step": 1330 }, { "epoch": 0.35067059208374224, "grad_norm": 0.7209715247154236, "learning_rate": 3.3475e-05, "loss": 0.0586, "step": 1340 }, { "epoch": 0.35328753680078506, "grad_norm": 0.7685849666595459, "learning_rate": 3.3725e-05, "loss": 0.0572, "step": 1350 }, { "epoch": 0.35590448151782794, "grad_norm": 0.538578450679779, "learning_rate": 3.3975e-05, "loss": 0.0545, "step": 1360 }, { "epoch": 0.35852142623487077, "grad_norm": 0.6326640844345093, "learning_rate": 3.4225e-05, "loss": 0.0589, "step": 1370 }, { "epoch": 0.36113837095191365, "grad_norm": 0.8394850492477417, "learning_rate": 3.4475000000000005e-05, "loss": 0.049, "step": 1380 }, { "epoch": 0.36375531566895647, "grad_norm": 0.7793498635292053, "learning_rate": 3.4725000000000004e-05, "loss": 0.0543, "step": 1390 }, { "epoch": 0.36637226038599935, "grad_norm": 0.8102224469184875, "learning_rate": 3.4975e-05, "loss": 0.0524, "step": 1400 }, { "epoch": 0.3689892051030422, "grad_norm": 0.6351240873336792, "learning_rate": 3.5225e-05, "loss": 0.0514, "step": 1410 }, { "epoch": 0.37160614982008505, "grad_norm": 0.7605561017990112, "learning_rate": 3.5475e-05, "loss": 0.0566, "step": 1420 }, { "epoch": 0.3742230945371279, "grad_norm": 0.5509055852890015, "learning_rate": 3.5725e-05, "loss": 0.0546, "step": 1430 }, { "epoch": 0.37684003925417076, "grad_norm": 0.5754609107971191, "learning_rate": 3.5975e-05, "loss": 0.0564, "step": 1440 }, { "epoch": 0.37945698397121363, "grad_norm": 0.6741311550140381, "learning_rate": 3.6225000000000006e-05, "loss": 0.058, "step": 1450 }, { "epoch": 0.38207392868825646, "grad_norm": 0.7794651985168457, "learning_rate": 3.6475000000000006e-05, "loss": 0.069, "step": 1460 }, { "epoch": 0.38469087340529934, "grad_norm": 0.6684126853942871, "learning_rate": 3.6725000000000005e-05, "loss": 0.0588, "step": 1470 }, { "epoch": 0.38730781812234216, "grad_norm": 0.6147798299789429, "learning_rate": 3.6975000000000004e-05, "loss": 0.0614, "step": 1480 }, { "epoch": 0.38992476283938504, "grad_norm": 0.5154690146446228, "learning_rate": 3.7225000000000004e-05, "loss": 0.0595, "step": 1490 }, { "epoch": 0.39254170755642787, "grad_norm": 0.6523647308349609, "learning_rate": 3.7475e-05, "loss": 0.0597, "step": 1500 }, { "epoch": 0.39515865227347075, "grad_norm": 0.6172918081283569, "learning_rate": 3.7725e-05, "loss": 0.0626, "step": 1510 }, { "epoch": 0.39777559699051357, "grad_norm": 0.41199785470962524, "learning_rate": 3.7975e-05, "loss": 0.0551, "step": 1520 }, { "epoch": 0.40039254170755645, "grad_norm": 0.4674221873283386, "learning_rate": 3.8225e-05, "loss": 0.0631, "step": 1530 }, { "epoch": 0.4030094864245993, "grad_norm": 0.7040983438491821, "learning_rate": 3.8475e-05, "loss": 0.0647, "step": 1540 }, { "epoch": 0.40562643114164215, "grad_norm": 0.912530243396759, "learning_rate": 3.8725e-05, "loss": 0.0643, "step": 1550 }, { "epoch": 0.408243375858685, "grad_norm": 0.6816163063049316, "learning_rate": 3.8975e-05, "loss": 0.0657, "step": 1560 }, { "epoch": 0.41086032057572786, "grad_norm": 0.9752461910247803, "learning_rate": 3.9225e-05, "loss": 0.0705, "step": 1570 }, { "epoch": 0.4134772652927707, "grad_norm": 0.910144031047821, "learning_rate": 3.9475000000000004e-05, "loss": 0.0653, "step": 1580 }, { "epoch": 0.41609421000981356, "grad_norm": 0.611270010471344, "learning_rate": 3.9725e-05, "loss": 0.0612, "step": 1590 }, { "epoch": 0.4187111547268564, "grad_norm": 0.6135270595550537, "learning_rate": 3.9975e-05, "loss": 0.0697, "step": 1600 }, { "epoch": 0.42132809944389926, "grad_norm": 0.6670326590538025, "learning_rate": 4.0225e-05, "loss": 0.0632, "step": 1610 }, { "epoch": 0.4239450441609421, "grad_norm": 0.6105315089225769, "learning_rate": 4.0475e-05, "loss": 0.0604, "step": 1620 }, { "epoch": 0.42656198887798497, "grad_norm": 0.8678605556488037, "learning_rate": 4.0725e-05, "loss": 0.0642, "step": 1630 }, { "epoch": 0.4291789335950278, "grad_norm": 0.7068043351173401, "learning_rate": 4.0975e-05, "loss": 0.0598, "step": 1640 }, { "epoch": 0.43179587831207067, "grad_norm": 0.9618980288505554, "learning_rate": 4.1225e-05, "loss": 0.0617, "step": 1650 }, { "epoch": 0.4344128230291135, "grad_norm": 0.6300309300422668, "learning_rate": 4.1475000000000005e-05, "loss": 0.0567, "step": 1660 }, { "epoch": 0.4370297677461564, "grad_norm": 0.7122251391410828, "learning_rate": 4.1725000000000005e-05, "loss": 0.0617, "step": 1670 }, { "epoch": 0.4396467124631992, "grad_norm": 0.5705268383026123, "learning_rate": 4.1975000000000004e-05, "loss": 0.0634, "step": 1680 }, { "epoch": 0.4422636571802421, "grad_norm": 0.9508737921714783, "learning_rate": 4.2225e-05, "loss": 0.07, "step": 1690 }, { "epoch": 0.4448806018972849, "grad_norm": 0.9834522604942322, "learning_rate": 4.2475e-05, "loss": 0.0727, "step": 1700 }, { "epoch": 0.4474975466143278, "grad_norm": 1.0555498600006104, "learning_rate": 4.2725e-05, "loss": 0.066, "step": 1710 }, { "epoch": 0.4501144913313706, "grad_norm": 1.0131648778915405, "learning_rate": 4.2975e-05, "loss": 0.0628, "step": 1720 }, { "epoch": 0.4527314360484135, "grad_norm": 0.7705113291740417, "learning_rate": 4.322500000000001e-05, "loss": 0.0645, "step": 1730 }, { "epoch": 0.4553483807654563, "grad_norm": 0.6077147722244263, "learning_rate": 4.3475000000000006e-05, "loss": 0.0626, "step": 1740 }, { "epoch": 0.4579653254824992, "grad_norm": 1.067004919052124, "learning_rate": 4.3725000000000006e-05, "loss": 0.0692, "step": 1750 }, { "epoch": 0.460582270199542, "grad_norm": 1.199952483177185, "learning_rate": 4.3975e-05, "loss": 0.065, "step": 1760 }, { "epoch": 0.4631992149165849, "grad_norm": 1.3196426630020142, "learning_rate": 4.4225e-05, "loss": 0.0672, "step": 1770 }, { "epoch": 0.4658161596336277, "grad_norm": 1.0404473543167114, "learning_rate": 4.4475e-05, "loss": 0.0719, "step": 1780 }, { "epoch": 0.4684331043506706, "grad_norm": 0.7192089557647705, "learning_rate": 4.4725e-05, "loss": 0.0709, "step": 1790 }, { "epoch": 0.47105004906771347, "grad_norm": 0.7947105765342712, "learning_rate": 4.4975e-05, "loss": 0.0662, "step": 1800 }, { "epoch": 0.4736669937847563, "grad_norm": 0.8048547506332397, "learning_rate": 4.5225e-05, "loss": 0.0637, "step": 1810 }, { "epoch": 0.4762839385017992, "grad_norm": 0.5961185097694397, "learning_rate": 4.5475e-05, "loss": 0.069, "step": 1820 }, { "epoch": 0.478900883218842, "grad_norm": 0.5182297825813293, "learning_rate": 4.5725e-05, "loss": 0.0621, "step": 1830 }, { "epoch": 0.4815178279358849, "grad_norm": 0.7439857125282288, "learning_rate": 4.5975e-05, "loss": 0.0676, "step": 1840 }, { "epoch": 0.4841347726529277, "grad_norm": 0.5764941573143005, "learning_rate": 4.6225e-05, "loss": 0.0685, "step": 1850 }, { "epoch": 0.4867517173699706, "grad_norm": 0.8225212693214417, "learning_rate": 4.6475000000000005e-05, "loss": 0.0691, "step": 1860 }, { "epoch": 0.4893686620870134, "grad_norm": 0.6442469954490662, "learning_rate": 4.6725000000000004e-05, "loss": 0.0639, "step": 1870 }, { "epoch": 0.4919856068040563, "grad_norm": 0.5695465803146362, "learning_rate": 4.6975000000000003e-05, "loss": 0.0659, "step": 1880 }, { "epoch": 0.4946025515210991, "grad_norm": 0.7405710220336914, "learning_rate": 4.7225e-05, "loss": 0.075, "step": 1890 }, { "epoch": 0.497219496238142, "grad_norm": 0.7905144691467285, "learning_rate": 4.7475e-05, "loss": 0.0655, "step": 1900 }, { "epoch": 0.4998364409551848, "grad_norm": 0.3834249973297119, "learning_rate": 4.7725e-05, "loss": 0.0664, "step": 1910 }, { "epoch": 0.5024533856722276, "grad_norm": 0.7903843522071838, "learning_rate": 4.7975e-05, "loss": 0.0699, "step": 1920 }, { "epoch": 0.5050703303892705, "grad_norm": 1.1027508974075317, "learning_rate": 4.822500000000001e-05, "loss": 0.0717, "step": 1930 }, { "epoch": 0.5076872751063134, "grad_norm": 0.6772246360778809, "learning_rate": 4.8475000000000006e-05, "loss": 0.0742, "step": 1940 }, { "epoch": 0.5103042198233563, "grad_norm": 0.8637296557426453, "learning_rate": 4.8725000000000005e-05, "loss": 0.0719, "step": 1950 }, { "epoch": 0.512921164540399, "grad_norm": 0.7059396505355835, "learning_rate": 4.8975000000000005e-05, "loss": 0.0722, "step": 1960 }, { "epoch": 0.5155381092574419, "grad_norm": 0.6927570104598999, "learning_rate": 4.9225000000000004e-05, "loss": 0.0765, "step": 1970 }, { "epoch": 0.5181550539744848, "grad_norm": 0.8184845447540283, "learning_rate": 4.9475e-05, "loss": 0.0751, "step": 1980 }, { "epoch": 0.5207719986915277, "grad_norm": 0.6966920495033264, "learning_rate": 4.9725e-05, "loss": 0.0743, "step": 1990 }, { "epoch": 0.5233889434085705, "grad_norm": 0.6769987940788269, "learning_rate": 4.9975e-05, "loss": 0.0731, "step": 2000 }, { "epoch": 0.5233889434085705, "eval_loss": 0.07937794011851888, "eval_runtime": 8.4909, "eval_samples_per_second": 120.6, "eval_steps_per_second": 1.884, "step": 2000 }, { "epoch": 0.5260058881256133, "grad_norm": 0.6470193862915039, "learning_rate": 4.99999930796579e-05, "loss": 0.0711, "step": 2010 }, { "epoch": 0.5286228328426562, "grad_norm": 0.8470696210861206, "learning_rate": 4.999996915749259e-05, "loss": 0.077, "step": 2020 }, { "epoch": 0.5312397775596991, "grad_norm": 1.0474553108215332, "learning_rate": 4.99999281480841e-05, "loss": 0.0727, "step": 2030 }, { "epoch": 0.5338567222767419, "grad_norm": 0.7926612496376038, "learning_rate": 4.999987005146045e-05, "loss": 0.071, "step": 2040 }, { "epoch": 0.5364736669937847, "grad_norm": 0.593268096446991, "learning_rate": 4.9999794867661356e-05, "loss": 0.0732, "step": 2050 }, { "epoch": 0.5390906117108276, "grad_norm": 0.7809154391288757, "learning_rate": 4.999970259673821e-05, "loss": 0.076, "step": 2060 }, { "epoch": 0.5417075564278705, "grad_norm": 0.7197739481925964, "learning_rate": 4.999959323875406e-05, "loss": 0.0766, "step": 2070 }, { "epoch": 0.5443245011449133, "grad_norm": 0.5898247361183167, "learning_rate": 4.999946679378368e-05, "loss": 0.0639, "step": 2080 }, { "epoch": 0.5469414458619561, "grad_norm": 1.1134157180786133, "learning_rate": 4.999932326191346e-05, "loss": 0.0764, "step": 2090 }, { "epoch": 0.549558390578999, "grad_norm": 0.8103418946266174, "learning_rate": 4.999916264324153e-05, "loss": 0.0727, "step": 2100 }, { "epoch": 0.5521753352960419, "grad_norm": 0.7031409740447998, "learning_rate": 4.999898493787766e-05, "loss": 0.0673, "step": 2110 }, { "epoch": 0.5547922800130847, "grad_norm": 0.8002007603645325, "learning_rate": 4.999879014594331e-05, "loss": 0.0722, "step": 2120 }, { "epoch": 0.5574092247301276, "grad_norm": 0.6864317655563354, "learning_rate": 4.999857826757162e-05, "loss": 0.0746, "step": 2130 }, { "epoch": 0.5600261694471704, "grad_norm": 0.5315617322921753, "learning_rate": 4.999834930290741e-05, "loss": 0.0682, "step": 2140 }, { "epoch": 0.5626431141642133, "grad_norm": 0.6081879734992981, "learning_rate": 4.9998103252107166e-05, "loss": 0.0693, "step": 2150 }, { "epoch": 0.5652600588812562, "grad_norm": 0.44701310992240906, "learning_rate": 4.999784011533907e-05, "loss": 0.0672, "step": 2160 }, { "epoch": 0.567877003598299, "grad_norm": 0.8048236966133118, "learning_rate": 4.999755989278298e-05, "loss": 0.0681, "step": 2170 }, { "epoch": 0.5704939483153418, "grad_norm": 0.7597357034683228, "learning_rate": 4.99972625846304e-05, "loss": 0.0681, "step": 2180 }, { "epoch": 0.5731108930323847, "grad_norm": 0.6289070248603821, "learning_rate": 4.999694819108456e-05, "loss": 0.0705, "step": 2190 }, { "epoch": 0.5757278377494276, "grad_norm": 1.0394102334976196, "learning_rate": 4.999661671236034e-05, "loss": 0.0823, "step": 2200 }, { "epoch": 0.5783447824664704, "grad_norm": 0.9317861199378967, "learning_rate": 4.99962681486843e-05, "loss": 0.0725, "step": 2210 }, { "epoch": 0.5809617271835132, "grad_norm": 1.0117568969726562, "learning_rate": 4.9995902500294676e-05, "loss": 0.0751, "step": 2220 }, { "epoch": 0.5835786719005561, "grad_norm": 0.7560598254203796, "learning_rate": 4.99955197674414e-05, "loss": 0.0723, "step": 2230 }, { "epoch": 0.586195616617599, "grad_norm": 0.7627548575401306, "learning_rate": 4.999511995038605e-05, "loss": 0.0744, "step": 2240 }, { "epoch": 0.5888125613346418, "grad_norm": 0.5133088827133179, "learning_rate": 4.99947030494019e-05, "loss": 0.071, "step": 2250 }, { "epoch": 0.5914295060516847, "grad_norm": 0.8383373022079468, "learning_rate": 4.9994269064773916e-05, "loss": 0.0816, "step": 2260 }, { "epoch": 0.5940464507687275, "grad_norm": 0.4755108058452606, "learning_rate": 4.9993817996798695e-05, "loss": 0.0709, "step": 2270 }, { "epoch": 0.5966633954857704, "grad_norm": 0.8120405077934265, "learning_rate": 4.999334984578456e-05, "loss": 0.0648, "step": 2280 }, { "epoch": 0.5992803402028132, "grad_norm": 0.67301344871521, "learning_rate": 4.999286461205147e-05, "loss": 0.0734, "step": 2290 }, { "epoch": 0.6018972849198561, "grad_norm": 1.0979734659194946, "learning_rate": 4.9992362295931094e-05, "loss": 0.075, "step": 2300 }, { "epoch": 0.6045142296368989, "grad_norm": 0.9093306660652161, "learning_rate": 4.999184289776675e-05, "loss": 0.0762, "step": 2310 }, { "epoch": 0.6071311743539418, "grad_norm": 1.5600154399871826, "learning_rate": 4.999130641791344e-05, "loss": 0.0744, "step": 2320 }, { "epoch": 0.6097481190709846, "grad_norm": 0.7541422247886658, "learning_rate": 4.9990752856737856e-05, "loss": 0.0827, "step": 2330 }, { "epoch": 0.6123650637880275, "grad_norm": 0.8271247744560242, "learning_rate": 4.9990182214618334e-05, "loss": 0.0688, "step": 2340 }, { "epoch": 0.6149820085050703, "grad_norm": 0.7597739100456238, "learning_rate": 4.9989594491944915e-05, "loss": 0.0743, "step": 2350 }, { "epoch": 0.6175989532221132, "grad_norm": 0.8055959343910217, "learning_rate": 4.9988989689119296e-05, "loss": 0.079, "step": 2360 }, { "epoch": 0.620215897939156, "grad_norm": 1.167317271232605, "learning_rate": 4.9988367806554856e-05, "loss": 0.0697, "step": 2370 }, { "epoch": 0.6228328426561989, "grad_norm": 0.5930063128471375, "learning_rate": 4.9987728844676657e-05, "loss": 0.0755, "step": 2380 }, { "epoch": 0.6254497873732418, "grad_norm": 0.9787731170654297, "learning_rate": 4.998707280392141e-05, "loss": 0.0759, "step": 2390 }, { "epoch": 0.6280667320902846, "grad_norm": 0.7472729682922363, "learning_rate": 4.998639968473751e-05, "loss": 0.0741, "step": 2400 }, { "epoch": 0.6306836768073274, "grad_norm": 0.6840028166770935, "learning_rate": 4.998570948758503e-05, "loss": 0.0692, "step": 2410 }, { "epoch": 0.6333006215243703, "grad_norm": 0.7901447415351868, "learning_rate": 4.998500221293572e-05, "loss": 0.0843, "step": 2420 }, { "epoch": 0.6359175662414132, "grad_norm": 0.6261860132217407, "learning_rate": 4.9984277861273e-05, "loss": 0.0709, "step": 2430 }, { "epoch": 0.638534510958456, "grad_norm": 0.6743974089622498, "learning_rate": 4.9983536433091936e-05, "loss": 0.0739, "step": 2440 }, { "epoch": 0.6411514556754988, "grad_norm": 0.9257513880729675, "learning_rate": 4.998277792889931e-05, "loss": 0.0783, "step": 2450 }, { "epoch": 0.6437684003925417, "grad_norm": 0.5694191455841064, "learning_rate": 4.998200234921354e-05, "loss": 0.0694, "step": 2460 }, { "epoch": 0.6463853451095846, "grad_norm": 0.8103319406509399, "learning_rate": 4.9981209694564726e-05, "loss": 0.076, "step": 2470 }, { "epoch": 0.6490022898266274, "grad_norm": 0.5619904398918152, "learning_rate": 4.998039996549465e-05, "loss": 0.0706, "step": 2480 }, { "epoch": 0.6516192345436702, "grad_norm": 1.2966636419296265, "learning_rate": 4.997957316255675e-05, "loss": 0.0772, "step": 2490 }, { "epoch": 0.6542361792607131, "grad_norm": 0.7027465105056763, "learning_rate": 4.997872928631613e-05, "loss": 0.0746, "step": 2500 }, { "epoch": 0.656853123977756, "grad_norm": 0.8370850682258606, "learning_rate": 4.997786833734959e-05, "loss": 0.0762, "step": 2510 }, { "epoch": 0.6594700686947988, "grad_norm": 0.8642465472221375, "learning_rate": 4.997699031624556e-05, "loss": 0.0747, "step": 2520 }, { "epoch": 0.6620870134118417, "grad_norm": 0.7573094964027405, "learning_rate": 4.9976095223604166e-05, "loss": 0.0742, "step": 2530 }, { "epoch": 0.6647039581288845, "grad_norm": 0.8384932279586792, "learning_rate": 4.99751830600372e-05, "loss": 0.0675, "step": 2540 }, { "epoch": 0.6673209028459274, "grad_norm": 0.8319125771522522, "learning_rate": 4.997425382616812e-05, "loss": 0.0792, "step": 2550 }, { "epoch": 0.6699378475629703, "grad_norm": 0.8607365489006042, "learning_rate": 4.997330752263203e-05, "loss": 0.0796, "step": 2560 }, { "epoch": 0.6725547922800131, "grad_norm": 0.699475884437561, "learning_rate": 4.997234415007574e-05, "loss": 0.067, "step": 2570 }, { "epoch": 0.6751717369970559, "grad_norm": 0.9081335067749023, "learning_rate": 4.99713637091577e-05, "loss": 0.0774, "step": 2580 }, { "epoch": 0.6777886817140988, "grad_norm": 0.5712231993675232, "learning_rate": 4.997036620054803e-05, "loss": 0.0644, "step": 2590 }, { "epoch": 0.6804056264311417, "grad_norm": 0.9103120565414429, "learning_rate": 4.996935162492852e-05, "loss": 0.0769, "step": 2600 }, { "epoch": 0.6830225711481845, "grad_norm": 0.6050474643707275, "learning_rate": 4.996831998299262e-05, "loss": 0.0784, "step": 2610 }, { "epoch": 0.6856395158652273, "grad_norm": 0.5857220888137817, "learning_rate": 4.9967271275445444e-05, "loss": 0.0736, "step": 2620 }, { "epoch": 0.6882564605822702, "grad_norm": 0.5730482339859009, "learning_rate": 4.996620550300378e-05, "loss": 0.0727, "step": 2630 }, { "epoch": 0.6908734052993131, "grad_norm": 0.791704535484314, "learning_rate": 4.996512266639608e-05, "loss": 0.0809, "step": 2640 }, { "epoch": 0.693490350016356, "grad_norm": 0.6655763983726501, "learning_rate": 4.9964022766362436e-05, "loss": 0.0793, "step": 2650 }, { "epoch": 0.6961072947333987, "grad_norm": 0.9879754185676575, "learning_rate": 4.996290580365463e-05, "loss": 0.0751, "step": 2660 }, { "epoch": 0.6987242394504416, "grad_norm": 0.8520436882972717, "learning_rate": 4.996177177903609e-05, "loss": 0.0739, "step": 2670 }, { "epoch": 0.7013411841674845, "grad_norm": 0.7003749012947083, "learning_rate": 4.9960620693281924e-05, "loss": 0.0761, "step": 2680 }, { "epoch": 0.7039581288845274, "grad_norm": 0.8380037546157837, "learning_rate": 4.995945254717887e-05, "loss": 0.0759, "step": 2690 }, { "epoch": 0.7065750736015701, "grad_norm": 0.756373941898346, "learning_rate": 4.9958267341525353e-05, "loss": 0.073, "step": 2700 }, { "epoch": 0.709192018318613, "grad_norm": 0.6666063070297241, "learning_rate": 4.995706507713146e-05, "loss": 0.0712, "step": 2710 }, { "epoch": 0.7118089630356559, "grad_norm": 0.743940532207489, "learning_rate": 4.9955845754818906e-05, "loss": 0.0746, "step": 2720 }, { "epoch": 0.7144259077526988, "grad_norm": 0.5994930267333984, "learning_rate": 4.9954609375421105e-05, "loss": 0.0733, "step": 2730 }, { "epoch": 0.7170428524697415, "grad_norm": 0.6286001205444336, "learning_rate": 4.995335593978311e-05, "loss": 0.0684, "step": 2740 }, { "epoch": 0.7196597971867844, "grad_norm": 0.6213799715042114, "learning_rate": 4.995208544876162e-05, "loss": 0.0684, "step": 2750 }, { "epoch": 0.7222767419038273, "grad_norm": 0.5822627544403076, "learning_rate": 4.9950797903225006e-05, "loss": 0.0684, "step": 2760 }, { "epoch": 0.7248936866208702, "grad_norm": 0.7613430619239807, "learning_rate": 4.99494933040533e-05, "loss": 0.0731, "step": 2770 }, { "epoch": 0.7275106313379129, "grad_norm": 1.0139579772949219, "learning_rate": 4.994817165213818e-05, "loss": 0.0701, "step": 2780 }, { "epoch": 0.7301275760549558, "grad_norm": 0.5469045639038086, "learning_rate": 4.994683294838298e-05, "loss": 0.0716, "step": 2790 }, { "epoch": 0.7327445207719987, "grad_norm": 0.6303797960281372, "learning_rate": 4.99454771937027e-05, "loss": 0.0744, "step": 2800 }, { "epoch": 0.7353614654890416, "grad_norm": 0.5600771307945251, "learning_rate": 4.994410438902398e-05, "loss": 0.081, "step": 2810 }, { "epoch": 0.7379784102060843, "grad_norm": 0.6783274412155151, "learning_rate": 4.994271453528511e-05, "loss": 0.0695, "step": 2820 }, { "epoch": 0.7405953549231272, "grad_norm": 0.7181297540664673, "learning_rate": 4.994130763343606e-05, "loss": 0.0704, "step": 2830 }, { "epoch": 0.7432122996401701, "grad_norm": 0.6178589463233948, "learning_rate": 4.993988368443843e-05, "loss": 0.0711, "step": 2840 }, { "epoch": 0.745829244357213, "grad_norm": 0.40977856516838074, "learning_rate": 4.9938442689265456e-05, "loss": 0.0726, "step": 2850 }, { "epoch": 0.7484461890742558, "grad_norm": 0.6271493434906006, "learning_rate": 4.9936984648902064e-05, "loss": 0.0821, "step": 2860 }, { "epoch": 0.7510631337912986, "grad_norm": 0.5872370600700378, "learning_rate": 4.993550956434481e-05, "loss": 0.0742, "step": 2870 }, { "epoch": 0.7536800785083415, "grad_norm": 0.5435254573822021, "learning_rate": 4.99340174366019e-05, "loss": 0.0703, "step": 2880 }, { "epoch": 0.7562970232253844, "grad_norm": 0.6672695875167847, "learning_rate": 4.993250826669318e-05, "loss": 0.0786, "step": 2890 }, { "epoch": 0.7589139679424273, "grad_norm": 0.6329795718193054, "learning_rate": 4.993098205565016e-05, "loss": 0.0695, "step": 2900 }, { "epoch": 0.76153091265947, "grad_norm": 0.47968053817749023, "learning_rate": 4.992943880451599e-05, "loss": 0.0672, "step": 2910 }, { "epoch": 0.7641478573765129, "grad_norm": 0.5885211825370789, "learning_rate": 4.992787851434546e-05, "loss": 0.0786, "step": 2920 }, { "epoch": 0.7667648020935558, "grad_norm": 0.8792977929115295, "learning_rate": 4.992630118620504e-05, "loss": 0.0769, "step": 2930 }, { "epoch": 0.7693817468105987, "grad_norm": 1.083542823791504, "learning_rate": 4.9924706821172784e-05, "loss": 0.0802, "step": 2940 }, { "epoch": 0.7719986915276414, "grad_norm": 0.811312198638916, "learning_rate": 4.992309542033845e-05, "loss": 0.0735, "step": 2950 }, { "epoch": 0.7746156362446843, "grad_norm": 0.9727795124053955, "learning_rate": 4.99214669848034e-05, "loss": 0.073, "step": 2960 }, { "epoch": 0.7772325809617272, "grad_norm": 0.8784989714622498, "learning_rate": 4.9919821515680665e-05, "loss": 0.0699, "step": 2970 }, { "epoch": 0.7798495256787701, "grad_norm": 0.744993269443512, "learning_rate": 4.9918159014094906e-05, "loss": 0.0714, "step": 2980 }, { "epoch": 0.7824664703958129, "grad_norm": 0.7093546390533447, "learning_rate": 4.991647948118242e-05, "loss": 0.0703, "step": 2990 }, { "epoch": 0.7850834151128557, "grad_norm": 0.6334063410758972, "learning_rate": 4.991478291809116e-05, "loss": 0.0683, "step": 3000 }, { "epoch": 0.7850834151128557, "eval_loss": 0.07849323315593094, "eval_runtime": 8.5756, "eval_samples_per_second": 119.408, "eval_steps_per_second": 1.866, "step": 3000 }, { "epoch": 0.7877003598298986, "grad_norm": 0.7186859846115112, "learning_rate": 4.991306932598071e-05, "loss": 0.0701, "step": 3010 }, { "epoch": 0.7903173045469415, "grad_norm": 0.7815176844596863, "learning_rate": 4.991133870602229e-05, "loss": 0.0702, "step": 3020 }, { "epoch": 0.7929342492639843, "grad_norm": 0.8299248814582825, "learning_rate": 4.9909591059398764e-05, "loss": 0.0637, "step": 3030 }, { "epoch": 0.7955511939810271, "grad_norm": 0.6321790814399719, "learning_rate": 4.990782638730464e-05, "loss": 0.0734, "step": 3040 }, { "epoch": 0.79816813869807, "grad_norm": 0.7430514097213745, "learning_rate": 4.990604469094603e-05, "loss": 0.0657, "step": 3050 }, { "epoch": 0.8007850834151129, "grad_norm": 0.9191706776618958, "learning_rate": 4.9904245971540745e-05, "loss": 0.0718, "step": 3060 }, { "epoch": 0.8034020281321557, "grad_norm": 0.6048433184623718, "learning_rate": 4.990243023031815e-05, "loss": 0.0688, "step": 3070 }, { "epoch": 0.8060189728491985, "grad_norm": 0.5070809721946716, "learning_rate": 4.990059746851932e-05, "loss": 0.0687, "step": 3080 }, { "epoch": 0.8086359175662414, "grad_norm": 0.8282752633094788, "learning_rate": 4.9898747687396916e-05, "loss": 0.0788, "step": 3090 }, { "epoch": 0.8112528622832843, "grad_norm": 0.6389812231063843, "learning_rate": 4.9896880888215254e-05, "loss": 0.0782, "step": 3100 }, { "epoch": 0.8138698070003271, "grad_norm": 1.0264029502868652, "learning_rate": 4.989499707225026e-05, "loss": 0.0738, "step": 3110 }, { "epoch": 0.81648675171737, "grad_norm": 0.8186160326004028, "learning_rate": 4.989309624078952e-05, "loss": 0.0732, "step": 3120 }, { "epoch": 0.8191036964344128, "grad_norm": 0.7047196626663208, "learning_rate": 4.9891178395132224e-05, "loss": 0.0764, "step": 3130 }, { "epoch": 0.8217206411514557, "grad_norm": 0.7498276233673096, "learning_rate": 4.98892435365892e-05, "loss": 0.0755, "step": 3140 }, { "epoch": 0.8243375858684985, "grad_norm": 0.8340325951576233, "learning_rate": 4.988729166648292e-05, "loss": 0.073, "step": 3150 }, { "epoch": 0.8269545305855414, "grad_norm": 0.7537565231323242, "learning_rate": 4.988532278614746e-05, "loss": 0.0693, "step": 3160 }, { "epoch": 0.8295714753025842, "grad_norm": 0.5033385157585144, "learning_rate": 4.988333689692852e-05, "loss": 0.0693, "step": 3170 }, { "epoch": 0.8321884200196271, "grad_norm": 0.5596148371696472, "learning_rate": 4.988133400018345e-05, "loss": 0.0698, "step": 3180 }, { "epoch": 0.8348053647366699, "grad_norm": 0.9153453707695007, "learning_rate": 4.987931409728121e-05, "loss": 0.0697, "step": 3190 }, { "epoch": 0.8374223094537128, "grad_norm": 0.8965064287185669, "learning_rate": 4.9877277189602384e-05, "loss": 0.0772, "step": 3200 }, { "epoch": 0.8400392541707556, "grad_norm": 0.4553247392177582, "learning_rate": 4.987522327853917e-05, "loss": 0.0744, "step": 3210 }, { "epoch": 0.8426561988877985, "grad_norm": 0.9072389006614685, "learning_rate": 4.987315236549541e-05, "loss": 0.0706, "step": 3220 }, { "epoch": 0.8452731436048413, "grad_norm": 0.7078306674957275, "learning_rate": 4.9871064451886554e-05, "loss": 0.0627, "step": 3230 }, { "epoch": 0.8478900883218842, "grad_norm": 0.5441957712173462, "learning_rate": 4.986895953913966e-05, "loss": 0.0692, "step": 3240 }, { "epoch": 0.850507033038927, "grad_norm": 0.4400569796562195, "learning_rate": 4.9866837628693416e-05, "loss": 0.0669, "step": 3250 }, { "epoch": 0.8531239777559699, "grad_norm": 0.5146138072013855, "learning_rate": 4.9864698721998136e-05, "loss": 0.0729, "step": 3260 }, { "epoch": 0.8557409224730128, "grad_norm": 0.6510049700737, "learning_rate": 4.986254282051575e-05, "loss": 0.0712, "step": 3270 }, { "epoch": 0.8583578671900556, "grad_norm": 0.698809027671814, "learning_rate": 4.986036992571978e-05, "loss": 0.0736, "step": 3280 }, { "epoch": 0.8609748119070985, "grad_norm": 0.6784025430679321, "learning_rate": 4.985818003909537e-05, "loss": 0.0825, "step": 3290 }, { "epoch": 0.8635917566241413, "grad_norm": 0.7913259267807007, "learning_rate": 4.9855973162139316e-05, "loss": 0.0721, "step": 3300 }, { "epoch": 0.8662087013411842, "grad_norm": 0.7243340611457825, "learning_rate": 4.985374929635998e-05, "loss": 0.0707, "step": 3310 }, { "epoch": 0.868825646058227, "grad_norm": 0.5413879156112671, "learning_rate": 4.985150844327736e-05, "loss": 0.0779, "step": 3320 }, { "epoch": 0.8714425907752699, "grad_norm": 0.87245112657547, "learning_rate": 4.984925060442306e-05, "loss": 0.0687, "step": 3330 }, { "epoch": 0.8740595354923127, "grad_norm": 0.6047831773757935, "learning_rate": 4.9846975781340274e-05, "loss": 0.0728, "step": 3340 }, { "epoch": 0.8766764802093556, "grad_norm": 0.5928580164909363, "learning_rate": 4.984468397558384e-05, "loss": 0.0729, "step": 3350 }, { "epoch": 0.8792934249263984, "grad_norm": 0.68870609998703, "learning_rate": 4.984237518872018e-05, "loss": 0.072, "step": 3360 }, { "epoch": 0.8819103696434413, "grad_norm": 0.6248053908348083, "learning_rate": 4.9840049422327325e-05, "loss": 0.0787, "step": 3370 }, { "epoch": 0.8845273143604842, "grad_norm": 0.589297890663147, "learning_rate": 4.983770667799492e-05, "loss": 0.066, "step": 3380 }, { "epoch": 0.887144259077527, "grad_norm": 0.7025781273841858, "learning_rate": 4.9835346957324206e-05, "loss": 0.0675, "step": 3390 }, { "epoch": 0.8897612037945698, "grad_norm": 0.7033712267875671, "learning_rate": 4.983297026192804e-05, "loss": 0.081, "step": 3400 }, { "epoch": 0.8923781485116127, "grad_norm": 0.5622373223304749, "learning_rate": 4.983057659343085e-05, "loss": 0.0617, "step": 3410 }, { "epoch": 0.8949950932286556, "grad_norm": 0.8491391539573669, "learning_rate": 4.98281659534687e-05, "loss": 0.0623, "step": 3420 }, { "epoch": 0.8976120379456984, "grad_norm": 0.6941475868225098, "learning_rate": 4.982573834368923e-05, "loss": 0.0722, "step": 3430 }, { "epoch": 0.9002289826627412, "grad_norm": 0.6664544939994812, "learning_rate": 4.98232937657517e-05, "loss": 0.0772, "step": 3440 }, { "epoch": 0.9028459273797841, "grad_norm": 0.6295298933982849, "learning_rate": 4.982083222132695e-05, "loss": 0.0746, "step": 3450 }, { "epoch": 0.905462872096827, "grad_norm": 0.5968911051750183, "learning_rate": 4.981835371209742e-05, "loss": 0.0769, "step": 3460 }, { "epoch": 0.9080798168138698, "grad_norm": 0.5266377925872803, "learning_rate": 4.981585823975715e-05, "loss": 0.0645, "step": 3470 }, { "epoch": 0.9106967615309126, "grad_norm": 0.965641975402832, "learning_rate": 4.981334580601178e-05, "loss": 0.0795, "step": 3480 }, { "epoch": 0.9133137062479555, "grad_norm": 0.6652019619941711, "learning_rate": 4.9810816412578525e-05, "loss": 0.0669, "step": 3490 }, { "epoch": 0.9159306509649984, "grad_norm": 0.921328067779541, "learning_rate": 4.9808270061186204e-05, "loss": 0.0722, "step": 3500 }, { "epoch": 0.9185475956820413, "grad_norm": 0.6966074705123901, "learning_rate": 4.980570675357522e-05, "loss": 0.0663, "step": 3510 }, { "epoch": 0.921164540399084, "grad_norm": 0.7490382194519043, "learning_rate": 4.980312649149758e-05, "loss": 0.0764, "step": 3520 }, { "epoch": 0.9237814851161269, "grad_norm": 0.6709177494049072, "learning_rate": 4.980052927671686e-05, "loss": 0.0743, "step": 3530 }, { "epoch": 0.9263984298331698, "grad_norm": 0.81973797082901, "learning_rate": 4.9797915111008236e-05, "loss": 0.0705, "step": 3540 }, { "epoch": 0.9290153745502127, "grad_norm": 0.8974931240081787, "learning_rate": 4.979528399615846e-05, "loss": 0.0703, "step": 3550 }, { "epoch": 0.9316323192672554, "grad_norm": 1.0583653450012207, "learning_rate": 4.979263593396588e-05, "loss": 0.0726, "step": 3560 }, { "epoch": 0.9342492639842983, "grad_norm": 0.7668379545211792, "learning_rate": 4.978997092624043e-05, "loss": 0.0683, "step": 3570 }, { "epoch": 0.9368662087013412, "grad_norm": 0.8339925408363342, "learning_rate": 4.978728897480359e-05, "loss": 0.0753, "step": 3580 }, { "epoch": 0.9394831534183841, "grad_norm": 0.6908124685287476, "learning_rate": 4.978459008148847e-05, "loss": 0.0719, "step": 3590 }, { "epoch": 0.9421000981354269, "grad_norm": 0.6138983964920044, "learning_rate": 4.978187424813974e-05, "loss": 0.0767, "step": 3600 }, { "epoch": 0.9447170428524697, "grad_norm": 0.6764789819717407, "learning_rate": 4.977914147661364e-05, "loss": 0.07, "step": 3610 }, { "epoch": 0.9473339875695126, "grad_norm": 0.5771927833557129, "learning_rate": 4.977639176877799e-05, "loss": 0.0647, "step": 3620 }, { "epoch": 0.9499509322865555, "grad_norm": 0.5599648952484131, "learning_rate": 4.977362512651219e-05, "loss": 0.0663, "step": 3630 }, { "epoch": 0.9525678770035984, "grad_norm": 0.7062062621116638, "learning_rate": 4.9770841551707226e-05, "loss": 0.0699, "step": 3640 }, { "epoch": 0.9551848217206411, "grad_norm": 0.4688286781311035, "learning_rate": 4.976804104626563e-05, "loss": 0.0721, "step": 3650 }, { "epoch": 0.957801766437684, "grad_norm": 0.5337697863578796, "learning_rate": 4.9765223612101534e-05, "loss": 0.0656, "step": 3660 }, { "epoch": 0.9604187111547269, "grad_norm": 0.619052529335022, "learning_rate": 4.976238925114062e-05, "loss": 0.0707, "step": 3670 }, { "epoch": 0.9630356558717698, "grad_norm": 0.6267631649971008, "learning_rate": 4.975953796532015e-05, "loss": 0.0655, "step": 3680 }, { "epoch": 0.9656526005888125, "grad_norm": 0.5680310726165771, "learning_rate": 4.9756669756588944e-05, "loss": 0.0677, "step": 3690 }, { "epoch": 0.9682695453058554, "grad_norm": 0.6664942502975464, "learning_rate": 4.9753784626907395e-05, "loss": 0.0721, "step": 3700 }, { "epoch": 0.9708864900228983, "grad_norm": 0.7929763197898865, "learning_rate": 4.975088257824748e-05, "loss": 0.0685, "step": 3710 }, { "epoch": 0.9735034347399412, "grad_norm": 0.6787649393081665, "learning_rate": 4.974796361259271e-05, "loss": 0.0681, "step": 3720 }, { "epoch": 0.9761203794569839, "grad_norm": 0.7067198753356934, "learning_rate": 4.974502773193816e-05, "loss": 0.066, "step": 3730 }, { "epoch": 0.9787373241740268, "grad_norm": 0.6574423313140869, "learning_rate": 4.974207493829049e-05, "loss": 0.0697, "step": 3740 }, { "epoch": 0.9813542688910697, "grad_norm": 0.6540109515190125, "learning_rate": 4.97391052336679e-05, "loss": 0.0734, "step": 3750 }, { "epoch": 0.9839712136081126, "grad_norm": 0.6842678785324097, "learning_rate": 4.973611862010017e-05, "loss": 0.0699, "step": 3760 }, { "epoch": 0.9865881583251553, "grad_norm": 0.6524052619934082, "learning_rate": 4.97331150996286e-05, "loss": 0.0724, "step": 3770 }, { "epoch": 0.9892051030421982, "grad_norm": 0.7590875625610352, "learning_rate": 4.973009467430608e-05, "loss": 0.0681, "step": 3780 }, { "epoch": 0.9918220477592411, "grad_norm": 1.2194914817810059, "learning_rate": 4.9727057346197046e-05, "loss": 0.0757, "step": 3790 }, { "epoch": 0.994438992476284, "grad_norm": 0.7816717624664307, "learning_rate": 4.9724003117377484e-05, "loss": 0.0743, "step": 3800 }, { "epoch": 0.9970559371933267, "grad_norm": 0.5530228614807129, "learning_rate": 4.972093198993492e-05, "loss": 0.063, "step": 3810 }, { "epoch": 0.9996728819103696, "grad_norm": 0.6032941341400146, "learning_rate": 4.971784396596843e-05, "loss": 0.07, "step": 3820 }, { "epoch": 1.0020935557736343, "grad_norm": 0.7758064866065979, "learning_rate": 4.971473904758868e-05, "loss": 0.0711, "step": 3830 }, { "epoch": 1.0047105004906771, "grad_norm": 1.45048189163208, "learning_rate": 4.971161723691784e-05, "loss": 0.0701, "step": 3840 }, { "epoch": 1.0073274452077199, "grad_norm": 1.0233992338180542, "learning_rate": 4.9708478536089626e-05, "loss": 0.0687, "step": 3850 }, { "epoch": 1.0099443899247629, "grad_norm": 1.0815229415893555, "learning_rate": 4.9705322947249325e-05, "loss": 0.0716, "step": 3860 }, { "epoch": 1.0125613346418056, "grad_norm": 0.981233537197113, "learning_rate": 4.970215047255374e-05, "loss": 0.0733, "step": 3870 }, { "epoch": 1.0151782793588486, "grad_norm": 1.0590592622756958, "learning_rate": 4.969896111417124e-05, "loss": 0.0668, "step": 3880 }, { "epoch": 1.0177952240758914, "grad_norm": 1.0366718769073486, "learning_rate": 4.969575487428171e-05, "loss": 0.0696, "step": 3890 }, { "epoch": 1.0204121687929342, "grad_norm": 0.6779992580413818, "learning_rate": 4.96925317550766e-05, "loss": 0.0688, "step": 3900 }, { "epoch": 1.0230291135099772, "grad_norm": 0.4735548496246338, "learning_rate": 4.9689291758758874e-05, "loss": 0.0703, "step": 3910 }, { "epoch": 1.02564605822702, "grad_norm": 0.6989257335662842, "learning_rate": 4.968603488754302e-05, "loss": 0.0771, "step": 3920 }, { "epoch": 1.0282630029440627, "grad_norm": 0.7250983119010925, "learning_rate": 4.968276114365511e-05, "loss": 0.0706, "step": 3930 }, { "epoch": 1.0308799476611057, "grad_norm": 0.9411719441413879, "learning_rate": 4.96794705293327e-05, "loss": 0.0697, "step": 3940 }, { "epoch": 1.0334968923781485, "grad_norm": 0.7651953101158142, "learning_rate": 4.96761630468249e-05, "loss": 0.0711, "step": 3950 }, { "epoch": 1.0361138370951914, "grad_norm": 0.4986521303653717, "learning_rate": 4.967283869839233e-05, "loss": 0.0749, "step": 3960 }, { "epoch": 1.0387307818122342, "grad_norm": 0.8632932901382446, "learning_rate": 4.966949748630716e-05, "loss": 0.0715, "step": 3970 }, { "epoch": 1.041347726529277, "grad_norm": 0.6957460641860962, "learning_rate": 4.966613941285308e-05, "loss": 0.0827, "step": 3980 }, { "epoch": 1.04396467124632, "grad_norm": 0.61822909116745, "learning_rate": 4.966276448032531e-05, "loss": 0.0744, "step": 3990 }, { "epoch": 1.0465816159633627, "grad_norm": 0.9637846946716309, "learning_rate": 4.965937269103057e-05, "loss": 0.0755, "step": 4000 }, { "epoch": 1.0465816159633627, "eval_loss": 0.07746633274647106, "eval_runtime": 8.6612, "eval_samples_per_second": 118.229, "eval_steps_per_second": 1.847, "step": 4000 }, { "epoch": 1.0491985606804057, "grad_norm": 0.7809803485870361, "learning_rate": 4.9655964047287114e-05, "loss": 0.0753, "step": 4010 }, { "epoch": 1.0518155053974485, "grad_norm": 0.6005727052688599, "learning_rate": 4.965253855142472e-05, "loss": 0.0677, "step": 4020 }, { "epoch": 1.0544324501144913, "grad_norm": 0.8859987258911133, "learning_rate": 4.96490962057847e-05, "loss": 0.0701, "step": 4030 }, { "epoch": 1.0570493948315343, "grad_norm": 0.49039119482040405, "learning_rate": 4.964563701271984e-05, "loss": 0.0684, "step": 4040 }, { "epoch": 1.059666339548577, "grad_norm": 0.5293424725532532, "learning_rate": 4.964216097459448e-05, "loss": 0.0603, "step": 4050 }, { "epoch": 1.0622832842656198, "grad_norm": 0.535614550113678, "learning_rate": 4.9638668093784445e-05, "loss": 0.0701, "step": 4060 }, { "epoch": 1.0649002289826628, "grad_norm": 0.7680052518844604, "learning_rate": 4.96351583726771e-05, "loss": 0.0644, "step": 4070 }, { "epoch": 1.0675171736997056, "grad_norm": 0.6915044784545898, "learning_rate": 4.9631631813671314e-05, "loss": 0.0728, "step": 4080 }, { "epoch": 1.0701341184167485, "grad_norm": 0.5453565120697021, "learning_rate": 4.962808841917744e-05, "loss": 0.0691, "step": 4090 }, { "epoch": 1.0727510631337913, "grad_norm": 0.5886152982711792, "learning_rate": 4.962452819161736e-05, "loss": 0.0719, "step": 4100 }, { "epoch": 1.075368007850834, "grad_norm": 0.747417151927948, "learning_rate": 4.962095113342445e-05, "loss": 0.0719, "step": 4110 }, { "epoch": 1.077984952567877, "grad_norm": 0.9508110284805298, "learning_rate": 4.9617357247043616e-05, "loss": 0.0723, "step": 4120 }, { "epoch": 1.0806018972849198, "grad_norm": 0.8973356485366821, "learning_rate": 4.961374653493122e-05, "loss": 0.0643, "step": 4130 }, { "epoch": 1.0832188420019626, "grad_norm": 0.740419328212738, "learning_rate": 4.9610118999555165e-05, "loss": 0.0765, "step": 4140 }, { "epoch": 1.0858357867190056, "grad_norm": 0.7163695096969604, "learning_rate": 4.960647464339484e-05, "loss": 0.0688, "step": 4150 }, { "epoch": 1.0884527314360484, "grad_norm": 0.5579230189323425, "learning_rate": 4.960281346894111e-05, "loss": 0.0732, "step": 4160 }, { "epoch": 1.0910696761530914, "grad_norm": 0.8381157517433167, "learning_rate": 4.959913547869637e-05, "loss": 0.0756, "step": 4170 }, { "epoch": 1.0936866208701341, "grad_norm": 0.8548111319541931, "learning_rate": 4.959544067517449e-05, "loss": 0.0694, "step": 4180 }, { "epoch": 1.096303565587177, "grad_norm": 0.5482849478721619, "learning_rate": 4.959172906090082e-05, "loss": 0.0672, "step": 4190 }, { "epoch": 1.0989205103042199, "grad_norm": 0.6070835590362549, "learning_rate": 4.958800063841223e-05, "loss": 0.0639, "step": 4200 }, { "epoch": 1.1015374550212627, "grad_norm": 0.7139147520065308, "learning_rate": 4.958425541025705e-05, "loss": 0.0659, "step": 4210 }, { "epoch": 1.1041543997383054, "grad_norm": 0.49353158473968506, "learning_rate": 4.958049337899512e-05, "loss": 0.058, "step": 4220 }, { "epoch": 1.1067713444553484, "grad_norm": 0.7460055351257324, "learning_rate": 4.957671454719774e-05, "loss": 0.0652, "step": 4230 }, { "epoch": 1.1093882891723912, "grad_norm": 0.8104005455970764, "learning_rate": 4.9572918917447715e-05, "loss": 0.0669, "step": 4240 }, { "epoch": 1.1120052338894342, "grad_norm": 0.6421072483062744, "learning_rate": 4.956910649233931e-05, "loss": 0.066, "step": 4250 }, { "epoch": 1.114622178606477, "grad_norm": 0.5011855363845825, "learning_rate": 4.9565277274478304e-05, "loss": 0.0734, "step": 4260 }, { "epoch": 1.1172391233235197, "grad_norm": 0.46482545137405396, "learning_rate": 4.9561431266481906e-05, "loss": 0.0621, "step": 4270 }, { "epoch": 1.1198560680405627, "grad_norm": 0.545685887336731, "learning_rate": 4.955756847097884e-05, "loss": 0.0695, "step": 4280 }, { "epoch": 1.1224730127576055, "grad_norm": 0.6811407208442688, "learning_rate": 4.9553688890609296e-05, "loss": 0.0691, "step": 4290 }, { "epoch": 1.1250899574746485, "grad_norm": 0.8716004490852356, "learning_rate": 4.954979252802492e-05, "loss": 0.0733, "step": 4300 }, { "epoch": 1.1277069021916912, "grad_norm": 0.8074702024459839, "learning_rate": 4.9545879385888836e-05, "loss": 0.0739, "step": 4310 }, { "epoch": 1.130323846908734, "grad_norm": 0.9537233710289001, "learning_rate": 4.9541949466875644e-05, "loss": 0.0687, "step": 4320 }, { "epoch": 1.132940791625777, "grad_norm": 0.8007683753967285, "learning_rate": 4.9538002773671415e-05, "loss": 0.0681, "step": 4330 }, { "epoch": 1.1355577363428198, "grad_norm": 0.8221684098243713, "learning_rate": 4.953403930897367e-05, "loss": 0.0741, "step": 4340 }, { "epoch": 1.1381746810598625, "grad_norm": 0.4814731180667877, "learning_rate": 4.9530059075491395e-05, "loss": 0.0714, "step": 4350 }, { "epoch": 1.1407916257769055, "grad_norm": 0.6261118054389954, "learning_rate": 4.952606207594505e-05, "loss": 0.0674, "step": 4360 }, { "epoch": 1.1434085704939483, "grad_norm": 0.6069873571395874, "learning_rate": 4.952204831306654e-05, "loss": 0.0666, "step": 4370 }, { "epoch": 1.146025515210991, "grad_norm": 0.5223729610443115, "learning_rate": 4.9518017789599244e-05, "loss": 0.0745, "step": 4380 }, { "epoch": 1.148642459928034, "grad_norm": 0.5413695573806763, "learning_rate": 4.951397050829797e-05, "loss": 0.0696, "step": 4390 }, { "epoch": 1.1512594046450768, "grad_norm": 0.7739647626876831, "learning_rate": 4.9509906471929016e-05, "loss": 0.0755, "step": 4400 }, { "epoch": 1.1538763493621198, "grad_norm": 0.8038027286529541, "learning_rate": 4.950582568327009e-05, "loss": 0.0686, "step": 4410 }, { "epoch": 1.1564932940791626, "grad_norm": 0.5534743070602417, "learning_rate": 4.9501728145110395e-05, "loss": 0.0751, "step": 4420 }, { "epoch": 1.1591102387962053, "grad_norm": 0.5017138123512268, "learning_rate": 4.949761386025055e-05, "loss": 0.0745, "step": 4430 }, { "epoch": 1.1617271835132483, "grad_norm": 0.5460235476493835, "learning_rate": 4.949348283150263e-05, "loss": 0.067, "step": 4440 }, { "epoch": 1.164344128230291, "grad_norm": 0.7793023586273193, "learning_rate": 4.948933506169016e-05, "loss": 0.0689, "step": 4450 }, { "epoch": 1.166961072947334, "grad_norm": 0.7321499586105347, "learning_rate": 4.9485170553648086e-05, "loss": 0.0679, "step": 4460 }, { "epoch": 1.1695780176643769, "grad_norm": 0.793841540813446, "learning_rate": 4.948098931022282e-05, "loss": 0.066, "step": 4470 }, { "epoch": 1.1721949623814196, "grad_norm": 0.8915519118309021, "learning_rate": 4.947679133427221e-05, "loss": 0.0672, "step": 4480 }, { "epoch": 1.1748119070984626, "grad_norm": 0.5249608159065247, "learning_rate": 4.9472576628665515e-05, "loss": 0.0669, "step": 4490 }, { "epoch": 1.1774288518155054, "grad_norm": 0.5474559664726257, "learning_rate": 4.9468345196283465e-05, "loss": 0.0711, "step": 4500 }, { "epoch": 1.1800457965325482, "grad_norm": 0.8466582894325256, "learning_rate": 4.94640970400182e-05, "loss": 0.0619, "step": 4510 }, { "epoch": 1.1826627412495911, "grad_norm": 0.7379885315895081, "learning_rate": 4.9459832162773276e-05, "loss": 0.0643, "step": 4520 }, { "epoch": 1.185279685966634, "grad_norm": 0.6102809309959412, "learning_rate": 4.9455550567463724e-05, "loss": 0.0654, "step": 4530 }, { "epoch": 1.187896630683677, "grad_norm": 0.6338376998901367, "learning_rate": 4.945125225701595e-05, "loss": 0.0639, "step": 4540 }, { "epoch": 1.1905135754007197, "grad_norm": 0.6943968534469604, "learning_rate": 4.9446937234367834e-05, "loss": 0.0669, "step": 4550 }, { "epoch": 1.1931305201177624, "grad_norm": 0.5532180070877075, "learning_rate": 4.944260550246863e-05, "loss": 0.0717, "step": 4560 }, { "epoch": 1.1957474648348054, "grad_norm": 0.727317750453949, "learning_rate": 4.9438257064279046e-05, "loss": 0.061, "step": 4570 }, { "epoch": 1.1983644095518482, "grad_norm": 0.6897532343864441, "learning_rate": 4.94338919227712e-05, "loss": 0.0716, "step": 4580 }, { "epoch": 1.2009813542688912, "grad_norm": 0.9191898703575134, "learning_rate": 4.9429510080928624e-05, "loss": 0.0751, "step": 4590 }, { "epoch": 1.203598298985934, "grad_norm": 0.6276187896728516, "learning_rate": 4.942511154174626e-05, "loss": 0.0696, "step": 4600 }, { "epoch": 1.2062152437029767, "grad_norm": 0.7370391488075256, "learning_rate": 4.942069630823047e-05, "loss": 0.0706, "step": 4610 }, { "epoch": 1.2088321884200197, "grad_norm": 1.0085150003433228, "learning_rate": 4.941626438339903e-05, "loss": 0.0699, "step": 4620 }, { "epoch": 1.2114491331370625, "grad_norm": 0.8513169884681702, "learning_rate": 4.9411815770281125e-05, "loss": 0.0723, "step": 4630 }, { "epoch": 1.2140660778541053, "grad_norm": 0.7845053672790527, "learning_rate": 4.9407350471917326e-05, "loss": 0.0691, "step": 4640 }, { "epoch": 1.2166830225711482, "grad_norm": 0.5463152527809143, "learning_rate": 4.940286849135962e-05, "loss": 0.0649, "step": 4650 }, { "epoch": 1.219299967288191, "grad_norm": 0.5815234780311584, "learning_rate": 4.939836983167141e-05, "loss": 0.0652, "step": 4660 }, { "epoch": 1.2219169120052338, "grad_norm": 0.6902991533279419, "learning_rate": 4.939385449592748e-05, "loss": 0.075, "step": 4670 }, { "epoch": 1.2245338567222768, "grad_norm": 0.49586668610572815, "learning_rate": 4.938932248721402e-05, "loss": 0.068, "step": 4680 }, { "epoch": 1.2271508014393195, "grad_norm": 0.7140547633171082, "learning_rate": 4.938477380862862e-05, "loss": 0.0755, "step": 4690 }, { "epoch": 1.2297677461563625, "grad_norm": 0.799771249294281, "learning_rate": 4.938020846328026e-05, "loss": 0.0689, "step": 4700 }, { "epoch": 1.2323846908734053, "grad_norm": 0.6713014245033264, "learning_rate": 4.93756264542893e-05, "loss": 0.0694, "step": 4710 }, { "epoch": 1.235001635590448, "grad_norm": 0.8662418723106384, "learning_rate": 4.937102778478752e-05, "loss": 0.0702, "step": 4720 }, { "epoch": 1.237618580307491, "grad_norm": 0.5999091863632202, "learning_rate": 4.936641245791804e-05, "loss": 0.0694, "step": 4730 }, { "epoch": 1.2402355250245338, "grad_norm": 0.674126386642456, "learning_rate": 4.936178047683542e-05, "loss": 0.0726, "step": 4740 }, { "epoch": 1.2428524697415768, "grad_norm": 0.8433323502540588, "learning_rate": 4.935713184470555e-05, "loss": 0.0716, "step": 4750 }, { "epoch": 1.2454694144586196, "grad_norm": 1.1719890832901, "learning_rate": 4.935246656470574e-05, "loss": 0.0733, "step": 4760 }, { "epoch": 1.2480863591756624, "grad_norm": 0.7315360307693481, "learning_rate": 4.9347784640024666e-05, "loss": 0.0714, "step": 4770 }, { "epoch": 1.2507033038927053, "grad_norm": 0.7105130553245544, "learning_rate": 4.934308607386238e-05, "loss": 0.0782, "step": 4780 }, { "epoch": 1.253320248609748, "grad_norm": 0.8442756533622742, "learning_rate": 4.9338370869430294e-05, "loss": 0.0721, "step": 4790 }, { "epoch": 1.2559371933267909, "grad_norm": 0.550815224647522, "learning_rate": 4.9333639029951225e-05, "loss": 0.0648, "step": 4800 }, { "epoch": 1.2585541380438339, "grad_norm": 0.7204784750938416, "learning_rate": 4.932889055865933e-05, "loss": 0.0794, "step": 4810 }, { "epoch": 1.2611710827608766, "grad_norm": 0.738088846206665, "learning_rate": 4.932412545880014e-05, "loss": 0.0682, "step": 4820 }, { "epoch": 1.2637880274779194, "grad_norm": 0.6076630353927612, "learning_rate": 4.931934373363056e-05, "loss": 0.0715, "step": 4830 }, { "epoch": 1.2664049721949624, "grad_norm": 0.7719533443450928, "learning_rate": 4.931454538641886e-05, "loss": 0.0681, "step": 4840 }, { "epoch": 1.2690219169120052, "grad_norm": 0.8249597549438477, "learning_rate": 4.9309730420444667e-05, "loss": 0.0673, "step": 4850 }, { "epoch": 1.2716388616290482, "grad_norm": 0.6375730037689209, "learning_rate": 4.930489883899896e-05, "loss": 0.0704, "step": 4860 }, { "epoch": 1.274255806346091, "grad_norm": 0.8313435316085815, "learning_rate": 4.9300050645384065e-05, "loss": 0.0723, "step": 4870 }, { "epoch": 1.276872751063134, "grad_norm": 0.7468934655189514, "learning_rate": 4.9295185842913705e-05, "loss": 0.0692, "step": 4880 }, { "epoch": 1.2794896957801767, "grad_norm": 0.6839153170585632, "learning_rate": 4.92903044349129e-05, "loss": 0.0697, "step": 4890 }, { "epoch": 1.2821066404972195, "grad_norm": 0.5754392147064209, "learning_rate": 4.928540642471806e-05, "loss": 0.0656, "step": 4900 }, { "epoch": 1.2847235852142624, "grad_norm": 0.832221269607544, "learning_rate": 4.9280491815676925e-05, "loss": 0.0668, "step": 4910 }, { "epoch": 1.2873405299313052, "grad_norm": 0.7316128015518188, "learning_rate": 4.9275560611148587e-05, "loss": 0.0763, "step": 4920 }, { "epoch": 1.289957474648348, "grad_norm": 0.5915740132331848, "learning_rate": 4.927061281450348e-05, "loss": 0.0719, "step": 4930 }, { "epoch": 1.292574419365391, "grad_norm": 0.6216297149658203, "learning_rate": 4.926564842912337e-05, "loss": 0.0648, "step": 4940 }, { "epoch": 1.2951913640824337, "grad_norm": 0.6205251812934875, "learning_rate": 4.926066745840137e-05, "loss": 0.0736, "step": 4950 }, { "epoch": 1.2978083087994765, "grad_norm": 0.6216917037963867, "learning_rate": 4.9255669905741924e-05, "loss": 0.0678, "step": 4960 }, { "epoch": 1.3004252535165195, "grad_norm": 0.6178058981895447, "learning_rate": 4.925065577456082e-05, "loss": 0.0697, "step": 4970 }, { "epoch": 1.3030421982335623, "grad_norm": 0.7660165429115295, "learning_rate": 4.924562506828516e-05, "loss": 0.0704, "step": 4980 }, { "epoch": 1.305659142950605, "grad_norm": 0.44091275334358215, "learning_rate": 4.924057779035338e-05, "loss": 0.065, "step": 4990 }, { "epoch": 1.308276087667648, "grad_norm": 0.4674038290977478, "learning_rate": 4.9235513944215276e-05, "loss": 0.0716, "step": 5000 }, { "epoch": 1.308276087667648, "eval_loss": 0.07460379635236306, "eval_runtime": 8.5112, "eval_samples_per_second": 120.313, "eval_steps_per_second": 1.88, "step": 5000 }, { "epoch": 1.3108930323846908, "grad_norm": 0.46065524220466614, "learning_rate": 4.92304335333319e-05, "loss": 0.0669, "step": 5010 }, { "epoch": 1.3135099771017338, "grad_norm": 0.5088114738464355, "learning_rate": 4.922533656117569e-05, "loss": 0.0687, "step": 5020 }, { "epoch": 1.3161269218187766, "grad_norm": 0.573154091835022, "learning_rate": 4.922022303123037e-05, "loss": 0.0703, "step": 5030 }, { "epoch": 1.3187438665358195, "grad_norm": 0.5735311508178711, "learning_rate": 4.9215092946990994e-05, "loss": 0.0646, "step": 5040 }, { "epoch": 1.3213608112528623, "grad_norm": 0.6008470058441162, "learning_rate": 4.9209946311963926e-05, "loss": 0.0625, "step": 5050 }, { "epoch": 1.323977755969905, "grad_norm": 0.6273025274276733, "learning_rate": 4.920478312966683e-05, "loss": 0.0717, "step": 5060 }, { "epoch": 1.326594700686948, "grad_norm": 0.40354007482528687, "learning_rate": 4.919960340362871e-05, "loss": 0.0668, "step": 5070 }, { "epoch": 1.3292116454039908, "grad_norm": 0.7045165300369263, "learning_rate": 4.919440713738985e-05, "loss": 0.0682, "step": 5080 }, { "epoch": 1.3318285901210336, "grad_norm": 0.6253055334091187, "learning_rate": 4.918919433450185e-05, "loss": 0.0669, "step": 5090 }, { "epoch": 1.3344455348380766, "grad_norm": 0.6311752796173096, "learning_rate": 4.918396499852762e-05, "loss": 0.0676, "step": 5100 }, { "epoch": 1.3370624795551194, "grad_norm": 0.6613034605979919, "learning_rate": 4.9178719133041353e-05, "loss": 0.0683, "step": 5110 }, { "epoch": 1.3396794242721621, "grad_norm": 0.6443865299224854, "learning_rate": 4.9173456741628546e-05, "loss": 0.0696, "step": 5120 }, { "epoch": 1.3422963689892051, "grad_norm": 0.6435128450393677, "learning_rate": 4.9168177827885996e-05, "loss": 0.0657, "step": 5130 }, { "epoch": 1.344913313706248, "grad_norm": 0.628079891204834, "learning_rate": 4.9162882395421794e-05, "loss": 0.0733, "step": 5140 }, { "epoch": 1.3475302584232909, "grad_norm": 0.49360260367393494, "learning_rate": 4.915757044785532e-05, "loss": 0.0729, "step": 5150 }, { "epoch": 1.3501472031403337, "grad_norm": 0.49952232837677, "learning_rate": 4.9152241988817236e-05, "loss": 0.0663, "step": 5160 }, { "epoch": 1.3527641478573766, "grad_norm": 0.4128344655036926, "learning_rate": 4.91468970219495e-05, "loss": 0.0609, "step": 5170 }, { "epoch": 1.3553810925744194, "grad_norm": 0.6112388968467712, "learning_rate": 4.914153555090533e-05, "loss": 0.0612, "step": 5180 }, { "epoch": 1.3579980372914622, "grad_norm": 0.6395546197891235, "learning_rate": 4.913615757934926e-05, "loss": 0.0697, "step": 5190 }, { "epoch": 1.3606149820085052, "grad_norm": 0.6269518136978149, "learning_rate": 4.913076311095707e-05, "loss": 0.0672, "step": 5200 }, { "epoch": 1.363231926725548, "grad_norm": 0.6928361058235168, "learning_rate": 4.912535214941584e-05, "loss": 0.063, "step": 5210 }, { "epoch": 1.3658488714425907, "grad_norm": 0.541039228439331, "learning_rate": 4.91199246984239e-05, "loss": 0.0656, "step": 5220 }, { "epoch": 1.3684658161596337, "grad_norm": 0.9675135612487793, "learning_rate": 4.9114480761690865e-05, "loss": 0.0652, "step": 5230 }, { "epoch": 1.3710827608766765, "grad_norm": 0.9641793966293335, "learning_rate": 4.9109020342937614e-05, "loss": 0.0676, "step": 5240 }, { "epoch": 1.3736997055937192, "grad_norm": 0.7403941750526428, "learning_rate": 4.9103543445896296e-05, "loss": 0.0663, "step": 5250 }, { "epoch": 1.3763166503107622, "grad_norm": 0.4992590844631195, "learning_rate": 4.9098050074310316e-05, "loss": 0.0696, "step": 5260 }, { "epoch": 1.378933595027805, "grad_norm": 0.7306753993034363, "learning_rate": 4.909254023193434e-05, "loss": 0.07, "step": 5270 }, { "epoch": 1.3815505397448478, "grad_norm": 0.7848047018051147, "learning_rate": 4.9087013922534295e-05, "loss": 0.0646, "step": 5280 }, { "epoch": 1.3841674844618908, "grad_norm": 0.7645529508590698, "learning_rate": 4.908147114988736e-05, "loss": 0.0687, "step": 5290 }, { "epoch": 1.3867844291789335, "grad_norm": 0.8650171160697937, "learning_rate": 4.907591191778197e-05, "loss": 0.0702, "step": 5300 }, { "epoch": 1.3894013738959765, "grad_norm": 0.8469581604003906, "learning_rate": 4.90703362300178e-05, "loss": 0.0646, "step": 5310 }, { "epoch": 1.3920183186130193, "grad_norm": 0.7733801603317261, "learning_rate": 4.9064744090405803e-05, "loss": 0.0691, "step": 5320 }, { "epoch": 1.3946352633300623, "grad_norm": 0.8647816777229309, "learning_rate": 4.905913550276812e-05, "loss": 0.0659, "step": 5330 }, { "epoch": 1.397252208047105, "grad_norm": 0.7563399076461792, "learning_rate": 4.905351047093819e-05, "loss": 0.0667, "step": 5340 }, { "epoch": 1.3998691527641478, "grad_norm": 0.7720489501953125, "learning_rate": 4.904786899876067e-05, "loss": 0.065, "step": 5350 }, { "epoch": 1.4024860974811908, "grad_norm": 0.6422615051269531, "learning_rate": 4.904221109009144e-05, "loss": 0.068, "step": 5360 }, { "epoch": 1.4051030421982336, "grad_norm": 0.526404082775116, "learning_rate": 4.903653674879763e-05, "loss": 0.0634, "step": 5370 }, { "epoch": 1.4077199869152763, "grad_norm": 0.5538908243179321, "learning_rate": 4.9030845978757624e-05, "loss": 0.0719, "step": 5380 }, { "epoch": 1.4103369316323193, "grad_norm": 0.5565630793571472, "learning_rate": 4.902513878386097e-05, "loss": 0.0689, "step": 5390 }, { "epoch": 1.412953876349362, "grad_norm": 0.4883444011211395, "learning_rate": 4.901941516800851e-05, "loss": 0.064, "step": 5400 }, { "epoch": 1.4155708210664049, "grad_norm": 0.5586369037628174, "learning_rate": 4.9013675135112265e-05, "loss": 0.0708, "step": 5410 }, { "epoch": 1.4181877657834479, "grad_norm": 0.6188867092132568, "learning_rate": 4.90079186890955e-05, "loss": 0.073, "step": 5420 }, { "epoch": 1.4208047105004906, "grad_norm": 0.672395646572113, "learning_rate": 4.90021458338927e-05, "loss": 0.0612, "step": 5430 }, { "epoch": 1.4234216552175336, "grad_norm": 0.6133560538291931, "learning_rate": 4.899635657344954e-05, "loss": 0.062, "step": 5440 }, { "epoch": 1.4260385999345764, "grad_norm": 0.8154978156089783, "learning_rate": 4.899055091172294e-05, "loss": 0.0634, "step": 5450 }, { "epoch": 1.4286555446516194, "grad_norm": 0.6268131732940674, "learning_rate": 4.898472885268102e-05, "loss": 0.0664, "step": 5460 }, { "epoch": 1.4312724893686621, "grad_norm": 0.6232008934020996, "learning_rate": 4.8978890400303074e-05, "loss": 0.0682, "step": 5470 }, { "epoch": 1.433889434085705, "grad_norm": 0.8692842125892639, "learning_rate": 4.897303555857965e-05, "loss": 0.0687, "step": 5480 }, { "epoch": 1.436506378802748, "grad_norm": 0.7789962291717529, "learning_rate": 4.896716433151248e-05, "loss": 0.0652, "step": 5490 }, { "epoch": 1.4391233235197907, "grad_norm": 0.49732038378715515, "learning_rate": 4.896127672311448e-05, "loss": 0.0775, "step": 5500 }, { "epoch": 1.4417402682368334, "grad_norm": 0.7146192789077759, "learning_rate": 4.8955372737409786e-05, "loss": 0.0635, "step": 5510 }, { "epoch": 1.4443572129538764, "grad_norm": 0.5541284084320068, "learning_rate": 4.894945237843371e-05, "loss": 0.0717, "step": 5520 }, { "epoch": 1.4469741576709192, "grad_norm": 0.5081911087036133, "learning_rate": 4.894351565023276e-05, "loss": 0.0628, "step": 5530 }, { "epoch": 1.449591102387962, "grad_norm": 0.9104524850845337, "learning_rate": 4.893756255686465e-05, "loss": 0.0699, "step": 5540 }, { "epoch": 1.452208047105005, "grad_norm": 0.9743561148643494, "learning_rate": 4.893159310239823e-05, "loss": 0.0716, "step": 5550 }, { "epoch": 1.4548249918220477, "grad_norm": 0.7123206257820129, "learning_rate": 4.89256072909136e-05, "loss": 0.0739, "step": 5560 }, { "epoch": 1.4574419365390905, "grad_norm": 0.7350765466690063, "learning_rate": 4.891960512650199e-05, "loss": 0.0693, "step": 5570 }, { "epoch": 1.4600588812561335, "grad_norm": 0.9752265810966492, "learning_rate": 4.891358661326582e-05, "loss": 0.0746, "step": 5580 }, { "epoch": 1.4626758259731762, "grad_norm": 0.47263097763061523, "learning_rate": 4.890755175531869e-05, "loss": 0.0715, "step": 5590 }, { "epoch": 1.4652927706902192, "grad_norm": 0.9160968661308289, "learning_rate": 4.890150055678538e-05, "loss": 0.0734, "step": 5600 }, { "epoch": 1.467909715407262, "grad_norm": 0.8024541735649109, "learning_rate": 4.8895433021801804e-05, "loss": 0.0709, "step": 5610 }, { "epoch": 1.470526660124305, "grad_norm": 0.8918023109436035, "learning_rate": 4.888934915451509e-05, "loss": 0.0713, "step": 5620 }, { "epoch": 1.4731436048413478, "grad_norm": 0.5163362622261047, "learning_rate": 4.888324895908349e-05, "loss": 0.07, "step": 5630 }, { "epoch": 1.4757605495583905, "grad_norm": 0.7050215005874634, "learning_rate": 4.8877132439676427e-05, "loss": 0.0649, "step": 5640 }, { "epoch": 1.4783774942754335, "grad_norm": 0.7626054883003235, "learning_rate": 4.887099960047449e-05, "loss": 0.0709, "step": 5650 }, { "epoch": 1.4809944389924763, "grad_norm": 0.645378589630127, "learning_rate": 4.886485044566942e-05, "loss": 0.0624, "step": 5660 }, { "epoch": 1.483611383709519, "grad_norm": 0.6905105113983154, "learning_rate": 4.88586849794641e-05, "loss": 0.0727, "step": 5670 }, { "epoch": 1.486228328426562, "grad_norm": 0.7395666837692261, "learning_rate": 4.885250320607257e-05, "loss": 0.0662, "step": 5680 }, { "epoch": 1.4888452731436048, "grad_norm": 0.6373220682144165, "learning_rate": 4.884630512972001e-05, "loss": 0.0684, "step": 5690 }, { "epoch": 1.4914622178606476, "grad_norm": 0.5650008320808411, "learning_rate": 4.884009075464276e-05, "loss": 0.0575, "step": 5700 }, { "epoch": 1.4940791625776906, "grad_norm": 0.9049765467643738, "learning_rate": 4.883386008508827e-05, "loss": 0.0709, "step": 5710 }, { "epoch": 1.4966961072947333, "grad_norm": 0.6606411337852478, "learning_rate": 4.882761312531516e-05, "loss": 0.0649, "step": 5720 }, { "epoch": 1.4993130520117761, "grad_norm": 0.8762325048446655, "learning_rate": 4.882134987959315e-05, "loss": 0.0741, "step": 5730 }, { "epoch": 1.501929996728819, "grad_norm": 0.6389805674552917, "learning_rate": 4.881507035220313e-05, "loss": 0.0705, "step": 5740 }, { "epoch": 1.504546941445862, "grad_norm": 0.7237274646759033, "learning_rate": 4.880877454743708e-05, "loss": 0.0634, "step": 5750 }, { "epoch": 1.5071638861629046, "grad_norm": 0.8704226613044739, "learning_rate": 4.880246246959813e-05, "loss": 0.0666, "step": 5760 }, { "epoch": 1.5097808308799476, "grad_norm": 0.9800268411636353, "learning_rate": 4.8796134123000526e-05, "loss": 0.0665, "step": 5770 }, { "epoch": 1.5123977755969906, "grad_norm": 0.9698657989501953, "learning_rate": 4.878978951196964e-05, "loss": 0.0698, "step": 5780 }, { "epoch": 1.5150147203140334, "grad_norm": 0.4997177720069885, "learning_rate": 4.8783428640841934e-05, "loss": 0.0635, "step": 5790 }, { "epoch": 1.5176316650310762, "grad_norm": 0.4548600912094116, "learning_rate": 4.877705151396502e-05, "loss": 0.0673, "step": 5800 }, { "epoch": 1.5202486097481192, "grad_norm": 0.5970808267593384, "learning_rate": 4.877065813569761e-05, "loss": 0.0667, "step": 5810 }, { "epoch": 1.522865554465162, "grad_norm": 0.7120151519775391, "learning_rate": 4.8764248510409505e-05, "loss": 0.0652, "step": 5820 }, { "epoch": 1.5254824991822047, "grad_norm": 0.4894607365131378, "learning_rate": 4.875782264248162e-05, "loss": 0.0653, "step": 5830 }, { "epoch": 1.5280994438992477, "grad_norm": 0.6062511205673218, "learning_rate": 4.8751380536305986e-05, "loss": 0.0658, "step": 5840 }, { "epoch": 1.5307163886162904, "grad_norm": 1.0854140520095825, "learning_rate": 4.874492219628571e-05, "loss": 0.0664, "step": 5850 }, { "epoch": 1.5333333333333332, "grad_norm": 0.6886491775512695, "learning_rate": 4.8738447626835026e-05, "loss": 0.0715, "step": 5860 }, { "epoch": 1.5359502780503762, "grad_norm": 0.6680225133895874, "learning_rate": 4.873195683237922e-05, "loss": 0.0617, "step": 5870 }, { "epoch": 1.5385672227674192, "grad_norm": 0.8506457805633545, "learning_rate": 4.872544981735471e-05, "loss": 0.0669, "step": 5880 }, { "epoch": 1.5411841674844617, "grad_norm": 0.5290977358818054, "learning_rate": 4.8718926586208955e-05, "loss": 0.0603, "step": 5890 }, { "epoch": 1.5438011122015047, "grad_norm": 0.6317195892333984, "learning_rate": 4.871238714340054e-05, "loss": 0.0682, "step": 5900 }, { "epoch": 1.5464180569185477, "grad_norm": 0.8768609762191772, "learning_rate": 4.8705831493399106e-05, "loss": 0.0677, "step": 5910 }, { "epoch": 1.5490350016355905, "grad_norm": 0.8061237335205078, "learning_rate": 4.869925964068538e-05, "loss": 0.0678, "step": 5920 }, { "epoch": 1.5516519463526333, "grad_norm": 0.5746549367904663, "learning_rate": 4.869267158975116e-05, "loss": 0.0694, "step": 5930 }, { "epoch": 1.5542688910696763, "grad_norm": 0.6497983336448669, "learning_rate": 4.868606734509932e-05, "loss": 0.0682, "step": 5940 }, { "epoch": 1.556885835786719, "grad_norm": 0.751058042049408, "learning_rate": 4.8679446911243783e-05, "loss": 0.0691, "step": 5950 }, { "epoch": 1.5595027805037618, "grad_norm": 0.5331182479858398, "learning_rate": 4.867281029270958e-05, "loss": 0.067, "step": 5960 }, { "epoch": 1.5621197252208048, "grad_norm": 0.6946902871131897, "learning_rate": 4.866615749403276e-05, "loss": 0.0652, "step": 5970 }, { "epoch": 1.5647366699378475, "grad_norm": 0.581596314907074, "learning_rate": 4.865948851976044e-05, "loss": 0.0648, "step": 5980 }, { "epoch": 1.5673536146548903, "grad_norm": 0.8583618402481079, "learning_rate": 4.865280337445083e-05, "loss": 0.0655, "step": 5990 }, { "epoch": 1.5699705593719333, "grad_norm": 0.4247710704803467, "learning_rate": 4.864610206267314e-05, "loss": 0.0633, "step": 6000 }, { "epoch": 1.5699705593719333, "eval_loss": 0.07125227067872947, "eval_runtime": 8.5376, "eval_samples_per_second": 119.939, "eval_steps_per_second": 1.874, "step": 6000 }, { "epoch": 1.572587504088976, "grad_norm": 0.7412828207015991, "learning_rate": 4.863938458900765e-05, "loss": 0.0688, "step": 6010 }, { "epoch": 1.5752044488060188, "grad_norm": 0.7785113453865051, "learning_rate": 4.863265095804571e-05, "loss": 0.0679, "step": 6020 }, { "epoch": 1.5778213935230618, "grad_norm": 0.7265563607215881, "learning_rate": 4.8625901174389685e-05, "loss": 0.0704, "step": 6030 }, { "epoch": 1.5804383382401048, "grad_norm": 0.6599150896072388, "learning_rate": 4.861913524265298e-05, "loss": 0.0705, "step": 6040 }, { "epoch": 1.5830552829571474, "grad_norm": 0.4436973035335541, "learning_rate": 4.8612353167460054e-05, "loss": 0.0657, "step": 6050 }, { "epoch": 1.5856722276741904, "grad_norm": 0.6332281231880188, "learning_rate": 4.860555495344639e-05, "loss": 0.0671, "step": 6060 }, { "epoch": 1.5882891723912334, "grad_norm": 0.5939836502075195, "learning_rate": 4.8598740605258494e-05, "loss": 0.0679, "step": 6070 }, { "epoch": 1.5909061171082761, "grad_norm": 0.8490774035453796, "learning_rate": 4.8591910127553925e-05, "loss": 0.0749, "step": 6080 }, { "epoch": 1.5935230618253189, "grad_norm": 0.7159541845321655, "learning_rate": 4.858506352500124e-05, "loss": 0.0675, "step": 6090 }, { "epoch": 1.5961400065423619, "grad_norm": 0.5728774070739746, "learning_rate": 4.857820080228003e-05, "loss": 0.0693, "step": 6100 }, { "epoch": 1.5987569512594046, "grad_norm": 0.7324960231781006, "learning_rate": 4.8571321964080904e-05, "loss": 0.0771, "step": 6110 }, { "epoch": 1.6013738959764474, "grad_norm": 1.0330255031585693, "learning_rate": 4.856442701510548e-05, "loss": 0.073, "step": 6120 }, { "epoch": 1.6039908406934904, "grad_norm": 0.634896457195282, "learning_rate": 4.855751596006638e-05, "loss": 0.0693, "step": 6130 }, { "epoch": 1.6066077854105332, "grad_norm": 0.46513810753822327, "learning_rate": 4.855058880368727e-05, "loss": 0.0694, "step": 6140 }, { "epoch": 1.609224730127576, "grad_norm": 0.6653358340263367, "learning_rate": 4.854364555070277e-05, "loss": 0.0648, "step": 6150 }, { "epoch": 1.611841674844619, "grad_norm": 0.6717187762260437, "learning_rate": 4.8536686205858545e-05, "loss": 0.0647, "step": 6160 }, { "epoch": 1.614458619561662, "grad_norm": 0.521497905254364, "learning_rate": 4.852971077391123e-05, "loss": 0.0612, "step": 6170 }, { "epoch": 1.6170755642787045, "grad_norm": 0.6874605417251587, "learning_rate": 4.852271925962848e-05, "loss": 0.063, "step": 6180 }, { "epoch": 1.6196925089957475, "grad_norm": 0.5139410495758057, "learning_rate": 4.851571166778892e-05, "loss": 0.064, "step": 6190 }, { "epoch": 1.6223094537127905, "grad_norm": 0.5792190432548523, "learning_rate": 4.850868800318218e-05, "loss": 0.072, "step": 6200 }, { "epoch": 1.6249263984298332, "grad_norm": 0.6859474778175354, "learning_rate": 4.8501648270608854e-05, "loss": 0.0716, "step": 6210 }, { "epoch": 1.627543343146876, "grad_norm": 0.6160419583320618, "learning_rate": 4.8494592474880544e-05, "loss": 0.0669, "step": 6220 }, { "epoch": 1.630160287863919, "grad_norm": 0.7906918525695801, "learning_rate": 4.848752062081982e-05, "loss": 0.0634, "step": 6230 }, { "epoch": 1.6327772325809617, "grad_norm": 0.47414660453796387, "learning_rate": 4.8480432713260226e-05, "loss": 0.0709, "step": 6240 }, { "epoch": 1.6353941772980045, "grad_norm": 0.6333305239677429, "learning_rate": 4.847332875704628e-05, "loss": 0.0666, "step": 6250 }, { "epoch": 1.6380111220150475, "grad_norm": 0.6713711619377136, "learning_rate": 4.846620875703347e-05, "loss": 0.0659, "step": 6260 }, { "epoch": 1.6406280667320903, "grad_norm": 0.844963550567627, "learning_rate": 4.845907271808825e-05, "loss": 0.062, "step": 6270 }, { "epoch": 1.643245011449133, "grad_norm": 0.7952145338058472, "learning_rate": 4.8451920645088025e-05, "loss": 0.0627, "step": 6280 }, { "epoch": 1.645861956166176, "grad_norm": 0.9742829203605652, "learning_rate": 4.8444752542921186e-05, "loss": 0.0707, "step": 6290 }, { "epoch": 1.6484789008832188, "grad_norm": 0.8199586272239685, "learning_rate": 4.843756841648705e-05, "loss": 0.0658, "step": 6300 }, { "epoch": 1.6510958456002616, "grad_norm": 0.8059208989143372, "learning_rate": 4.84303682706959e-05, "loss": 0.0659, "step": 6310 }, { "epoch": 1.6537127903173046, "grad_norm": 0.7988175749778748, "learning_rate": 4.842315211046898e-05, "loss": 0.0644, "step": 6320 }, { "epoch": 1.6563297350343476, "grad_norm": 0.8034182786941528, "learning_rate": 4.8415919940738464e-05, "loss": 0.0664, "step": 6330 }, { "epoch": 1.65894667975139, "grad_norm": 0.7371190786361694, "learning_rate": 4.8408671766447456e-05, "loss": 0.0648, "step": 6340 }, { "epoch": 1.661563624468433, "grad_norm": 0.9478877782821655, "learning_rate": 4.840140759255003e-05, "loss": 0.0757, "step": 6350 }, { "epoch": 1.664180569185476, "grad_norm": 0.7560123205184937, "learning_rate": 4.839412742401118e-05, "loss": 0.0673, "step": 6360 }, { "epoch": 1.6667975139025188, "grad_norm": 1.1563645601272583, "learning_rate": 4.838683126580683e-05, "loss": 0.068, "step": 6370 }, { "epoch": 1.6694144586195616, "grad_norm": 0.8656620979309082, "learning_rate": 4.8379519122923825e-05, "loss": 0.0667, "step": 6380 }, { "epoch": 1.6720314033366046, "grad_norm": 0.7312747240066528, "learning_rate": 4.8372191000359955e-05, "loss": 0.0666, "step": 6390 }, { "epoch": 1.6746483480536474, "grad_norm": 0.8066172003746033, "learning_rate": 4.836484690312393e-05, "loss": 0.0692, "step": 6400 }, { "epoch": 1.6772652927706901, "grad_norm": 0.8848495483398438, "learning_rate": 4.8357486836235365e-05, "loss": 0.0754, "step": 6410 }, { "epoch": 1.6798822374877331, "grad_norm": 0.6636145114898682, "learning_rate": 4.8350110804724794e-05, "loss": 0.0721, "step": 6420 }, { "epoch": 1.682499182204776, "grad_norm": 0.6970325708389282, "learning_rate": 4.834271881363367e-05, "loss": 0.0644, "step": 6430 }, { "epoch": 1.6851161269218187, "grad_norm": 0.6332338452339172, "learning_rate": 4.833531086801434e-05, "loss": 0.0663, "step": 6440 }, { "epoch": 1.6877330716388617, "grad_norm": 0.703227698802948, "learning_rate": 4.832788697293007e-05, "loss": 0.0641, "step": 6450 }, { "epoch": 1.6903500163559044, "grad_norm": 0.8867107033729553, "learning_rate": 4.832044713345503e-05, "loss": 0.0703, "step": 6460 }, { "epoch": 1.6929669610729472, "grad_norm": 0.7940611243247986, "learning_rate": 4.831299135467426e-05, "loss": 0.0632, "step": 6470 }, { "epoch": 1.6955839057899902, "grad_norm": 0.7674934267997742, "learning_rate": 4.830551964168374e-05, "loss": 0.0648, "step": 6480 }, { "epoch": 1.6982008505070332, "grad_norm": 0.6497439742088318, "learning_rate": 4.829803199959029e-05, "loss": 0.0697, "step": 6490 }, { "epoch": 1.700817795224076, "grad_norm": 0.6304329633712769, "learning_rate": 4.829052843351167e-05, "loss": 0.0657, "step": 6500 }, { "epoch": 1.7034347399411187, "grad_norm": 0.621741533279419, "learning_rate": 4.828300894857647e-05, "loss": 0.0727, "step": 6510 }, { "epoch": 1.7060516846581617, "grad_norm": 0.7226573824882507, "learning_rate": 4.827547354992421e-05, "loss": 0.065, "step": 6520 }, { "epoch": 1.7086686293752045, "grad_norm": 0.9050750732421875, "learning_rate": 4.826792224270524e-05, "loss": 0.0662, "step": 6530 }, { "epoch": 1.7112855740922472, "grad_norm": 0.659066915512085, "learning_rate": 4.826035503208083e-05, "loss": 0.0584, "step": 6540 }, { "epoch": 1.7139025188092902, "grad_norm": 0.580033004283905, "learning_rate": 4.825277192322309e-05, "loss": 0.0682, "step": 6550 }, { "epoch": 1.716519463526333, "grad_norm": 0.7463915348052979, "learning_rate": 4.8245172921315e-05, "loss": 0.067, "step": 6560 }, { "epoch": 1.7191364082433758, "grad_norm": 0.6378515958786011, "learning_rate": 4.82375580315504e-05, "loss": 0.0655, "step": 6570 }, { "epoch": 1.7217533529604188, "grad_norm": 0.4390958845615387, "learning_rate": 4.8229927259134014e-05, "loss": 0.0582, "step": 6580 }, { "epoch": 1.7243702976774615, "grad_norm": 0.4259874224662781, "learning_rate": 4.8222280609281376e-05, "loss": 0.0666, "step": 6590 }, { "epoch": 1.7269872423945043, "grad_norm": 0.4031057357788086, "learning_rate": 4.821461808721892e-05, "loss": 0.0666, "step": 6600 }, { "epoch": 1.7296041871115473, "grad_norm": 0.9084675312042236, "learning_rate": 4.820693969818391e-05, "loss": 0.0707, "step": 6610 }, { "epoch": 1.7322211318285903, "grad_norm": 0.6418400406837463, "learning_rate": 4.819924544742444e-05, "loss": 0.0741, "step": 6620 }, { "epoch": 1.7348380765456328, "grad_norm": 0.7338866591453552, "learning_rate": 4.8191535340199464e-05, "loss": 0.068, "step": 6630 }, { "epoch": 1.7374550212626758, "grad_norm": 0.6052848696708679, "learning_rate": 4.8183809381778765e-05, "loss": 0.0727, "step": 6640 }, { "epoch": 1.7400719659797188, "grad_norm": 0.7197098731994629, "learning_rate": 4.8176067577442964e-05, "loss": 0.0707, "step": 6650 }, { "epoch": 1.7426889106967616, "grad_norm": 0.7238598465919495, "learning_rate": 4.816830993248351e-05, "loss": 0.0613, "step": 6660 }, { "epoch": 1.7453058554138043, "grad_norm": 0.6141265630722046, "learning_rate": 4.8160536452202673e-05, "loss": 0.0694, "step": 6670 }, { "epoch": 1.7479228001308473, "grad_norm": 0.6801651120185852, "learning_rate": 4.815274714191357e-05, "loss": 0.0627, "step": 6680 }, { "epoch": 1.75053974484789, "grad_norm": 0.8193888664245605, "learning_rate": 4.814494200694012e-05, "loss": 0.0653, "step": 6690 }, { "epoch": 1.7531566895649329, "grad_norm": 0.6166638731956482, "learning_rate": 4.813712105261704e-05, "loss": 0.0601, "step": 6700 }, { "epoch": 1.7557736342819759, "grad_norm": 0.7112689018249512, "learning_rate": 4.81292842842899e-05, "loss": 0.0708, "step": 6710 }, { "epoch": 1.7583905789990186, "grad_norm": 0.5664924383163452, "learning_rate": 4.812143170731504e-05, "loss": 0.0718, "step": 6720 }, { "epoch": 1.7610075237160614, "grad_norm": 0.5493945479393005, "learning_rate": 4.811356332705963e-05, "loss": 0.0711, "step": 6730 }, { "epoch": 1.7636244684331044, "grad_norm": 0.8429237604141235, "learning_rate": 4.810567914890164e-05, "loss": 0.063, "step": 6740 }, { "epoch": 1.7662414131501472, "grad_norm": 0.6945361495018005, "learning_rate": 4.809777917822982e-05, "loss": 0.0738, "step": 6750 }, { "epoch": 1.76885835786719, "grad_norm": 0.5371482968330383, "learning_rate": 4.808986342044374e-05, "loss": 0.0673, "step": 6760 }, { "epoch": 1.771475302584233, "grad_norm": 0.7007372975349426, "learning_rate": 4.8081931880953726e-05, "loss": 0.0667, "step": 6770 }, { "epoch": 1.774092247301276, "grad_norm": 0.5731061100959778, "learning_rate": 4.807398456518092e-05, "loss": 0.0664, "step": 6780 }, { "epoch": 1.7767091920183185, "grad_norm": 0.845836341381073, "learning_rate": 4.806602147855725e-05, "loss": 0.0655, "step": 6790 }, { "epoch": 1.7793261367353614, "grad_norm": 0.6107457280158997, "learning_rate": 4.805804262652539e-05, "loss": 0.0666, "step": 6800 }, { "epoch": 1.7819430814524044, "grad_norm": 1.2480820417404175, "learning_rate": 4.805004801453882e-05, "loss": 0.0662, "step": 6810 }, { "epoch": 1.7845600261694472, "grad_norm": 0.5964035391807556, "learning_rate": 4.8042037648061784e-05, "loss": 0.0733, "step": 6820 }, { "epoch": 1.78717697088649, "grad_norm": 0.8358713388442993, "learning_rate": 4.803401153256929e-05, "loss": 0.0672, "step": 6830 }, { "epoch": 1.789793915603533, "grad_norm": 0.7865737676620483, "learning_rate": 4.802596967354711e-05, "loss": 0.0649, "step": 6840 }, { "epoch": 1.7924108603205757, "grad_norm": 0.6207346320152283, "learning_rate": 4.801791207649177e-05, "loss": 0.072, "step": 6850 }, { "epoch": 1.7950278050376185, "grad_norm": 0.5982876420021057, "learning_rate": 4.800983874691058e-05, "loss": 0.0646, "step": 6860 }, { "epoch": 1.7976447497546615, "grad_norm": 0.6681666374206543, "learning_rate": 4.800174969032158e-05, "loss": 0.0691, "step": 6870 }, { "epoch": 1.8002616944717043, "grad_norm": 0.6014638543128967, "learning_rate": 4.799364491225356e-05, "loss": 0.0693, "step": 6880 }, { "epoch": 1.802878639188747, "grad_norm": 0.5622276067733765, "learning_rate": 4.7985524418246054e-05, "loss": 0.065, "step": 6890 }, { "epoch": 1.80549558390579, "grad_norm": 0.567729115486145, "learning_rate": 4.797738821384935e-05, "loss": 0.0679, "step": 6900 }, { "epoch": 1.808112528622833, "grad_norm": 0.6296717524528503, "learning_rate": 4.796923630462446e-05, "loss": 0.0728, "step": 6910 }, { "epoch": 1.8107294733398756, "grad_norm": 0.8331077098846436, "learning_rate": 4.796106869614315e-05, "loss": 0.0687, "step": 6920 }, { "epoch": 1.8133464180569185, "grad_norm": 0.7066493034362793, "learning_rate": 4.79528853939879e-05, "loss": 0.0654, "step": 6930 }, { "epoch": 1.8159633627739615, "grad_norm": 0.8010251522064209, "learning_rate": 4.794468640375191e-05, "loss": 0.0625, "step": 6940 }, { "epoch": 1.8185803074910043, "grad_norm": 0.633176863193512, "learning_rate": 4.793647173103912e-05, "loss": 0.0665, "step": 6950 }, { "epoch": 1.821197252208047, "grad_norm": 0.9934688806533813, "learning_rate": 4.792824138146418e-05, "loss": 0.0671, "step": 6960 }, { "epoch": 1.82381419692509, "grad_norm": 0.4914017617702484, "learning_rate": 4.791999536065246e-05, "loss": 0.0662, "step": 6970 }, { "epoch": 1.8264311416421328, "grad_norm": 0.6178949475288391, "learning_rate": 4.791173367424002e-05, "loss": 0.0692, "step": 6980 }, { "epoch": 1.8290480863591756, "grad_norm": 0.805231511592865, "learning_rate": 4.790345632787367e-05, "loss": 0.0623, "step": 6990 }, { "epoch": 1.8316650310762186, "grad_norm": 0.5471294522285461, "learning_rate": 4.789516332721089e-05, "loss": 0.0594, "step": 7000 }, { "epoch": 1.8316650310762186, "eval_loss": 0.07114872934573006, "eval_runtime": 8.5303, "eval_samples_per_second": 120.043, "eval_steps_per_second": 1.876, "step": 7000 }, { "epoch": 1.8342819757932614, "grad_norm": 0.7086074948310852, "learning_rate": 4.7886854677919856e-05, "loss": 0.0649, "step": 7010 }, { "epoch": 1.8368989205103041, "grad_norm": 0.5640758275985718, "learning_rate": 4.7878530385679466e-05, "loss": 0.0599, "step": 7020 }, { "epoch": 1.8395158652273471, "grad_norm": 0.598824679851532, "learning_rate": 4.7870190456179284e-05, "loss": 0.0636, "step": 7030 }, { "epoch": 1.8421328099443899, "grad_norm": 0.5776214003562927, "learning_rate": 4.786183489511958e-05, "loss": 0.0677, "step": 7040 }, { "epoch": 1.8447497546614327, "grad_norm": 0.6187028288841248, "learning_rate": 4.78534637082113e-05, "loss": 0.0607, "step": 7050 }, { "epoch": 1.8473666993784756, "grad_norm": 0.5059308409690857, "learning_rate": 4.784507690117607e-05, "loss": 0.0614, "step": 7060 }, { "epoch": 1.8499836440955186, "grad_norm": 0.6319045424461365, "learning_rate": 4.783667447974619e-05, "loss": 0.069, "step": 7070 }, { "epoch": 1.8526005888125612, "grad_norm": 0.7948163747787476, "learning_rate": 4.782825644966464e-05, "loss": 0.0598, "step": 7080 }, { "epoch": 1.8552175335296042, "grad_norm": 0.5553324818611145, "learning_rate": 4.781982281668508e-05, "loss": 0.0652, "step": 7090 }, { "epoch": 1.8578344782466472, "grad_norm": 0.49766290187835693, "learning_rate": 4.781137358657179e-05, "loss": 0.0638, "step": 7100 }, { "epoch": 1.86045142296369, "grad_norm": 0.4596664309501648, "learning_rate": 4.780290876509975e-05, "loss": 0.0672, "step": 7110 }, { "epoch": 1.8630683676807327, "grad_norm": 0.5042189359664917, "learning_rate": 4.779442835805459e-05, "loss": 0.065, "step": 7120 }, { "epoch": 1.8656853123977757, "grad_norm": 0.6542985439300537, "learning_rate": 4.7785932371232586e-05, "loss": 0.07, "step": 7130 }, { "epoch": 1.8683022571148185, "grad_norm": 0.5698999762535095, "learning_rate": 4.7777420810440666e-05, "loss": 0.062, "step": 7140 }, { "epoch": 1.8709192018318612, "grad_norm": 0.48582544922828674, "learning_rate": 4.7768893681496394e-05, "loss": 0.0571, "step": 7150 }, { "epoch": 1.8735361465489042, "grad_norm": 0.7398406863212585, "learning_rate": 4.7760350990227995e-05, "loss": 0.065, "step": 7160 }, { "epoch": 1.876153091265947, "grad_norm": 0.5045884251594543, "learning_rate": 4.7751792742474317e-05, "loss": 0.0668, "step": 7170 }, { "epoch": 1.8787700359829898, "grad_norm": 0.6144871711730957, "learning_rate": 4.774321894408483e-05, "loss": 0.0683, "step": 7180 }, { "epoch": 1.8813869807000327, "grad_norm": 0.7012941837310791, "learning_rate": 4.7734629600919645e-05, "loss": 0.0678, "step": 7190 }, { "epoch": 1.8840039254170755, "grad_norm": 0.5341391563415527, "learning_rate": 4.772602471884951e-05, "loss": 0.0619, "step": 7200 }, { "epoch": 1.8866208701341183, "grad_norm": 0.761932909488678, "learning_rate": 4.7717404303755775e-05, "loss": 0.0647, "step": 7210 }, { "epoch": 1.8892378148511613, "grad_norm": 0.7804669737815857, "learning_rate": 4.7708768361530405e-05, "loss": 0.0579, "step": 7220 }, { "epoch": 1.8918547595682043, "grad_norm": 0.505577564239502, "learning_rate": 4.770011689807599e-05, "loss": 0.0692, "step": 7230 }, { "epoch": 1.894471704285247, "grad_norm": 0.6581014394760132, "learning_rate": 4.769144991930573e-05, "loss": 0.0695, "step": 7240 }, { "epoch": 1.8970886490022898, "grad_norm": 0.9491140842437744, "learning_rate": 4.7682767431143416e-05, "loss": 0.063, "step": 7250 }, { "epoch": 1.8997055937193328, "grad_norm": 0.6348735690116882, "learning_rate": 4.7674069439523445e-05, "loss": 0.06, "step": 7260 }, { "epoch": 1.9023225384363756, "grad_norm": 0.48946839570999146, "learning_rate": 4.766535595039082e-05, "loss": 0.0673, "step": 7270 }, { "epoch": 1.9049394831534183, "grad_norm": 1.8328133821487427, "learning_rate": 4.7656626969701124e-05, "loss": 0.0674, "step": 7280 }, { "epoch": 1.9075564278704613, "grad_norm": 0.7238545417785645, "learning_rate": 4.7647882503420526e-05, "loss": 0.0686, "step": 7290 }, { "epoch": 1.910173372587504, "grad_norm": 0.5402998328208923, "learning_rate": 4.76391225575258e-05, "loss": 0.0605, "step": 7300 }, { "epoch": 1.9127903173045468, "grad_norm": 0.9406218528747559, "learning_rate": 4.7630347138004285e-05, "loss": 0.0645, "step": 7310 }, { "epoch": 1.9154072620215898, "grad_norm": 0.729573130607605, "learning_rate": 4.762155625085388e-05, "loss": 0.0749, "step": 7320 }, { "epoch": 1.9180242067386326, "grad_norm": 0.5499677062034607, "learning_rate": 4.7612749902083095e-05, "loss": 0.0639, "step": 7330 }, { "epoch": 1.9206411514556754, "grad_norm": 0.7047156691551208, "learning_rate": 4.760392809771098e-05, "loss": 0.059, "step": 7340 }, { "epoch": 1.9232580961727184, "grad_norm": 0.4866229295730591, "learning_rate": 4.759509084376714e-05, "loss": 0.0595, "step": 7350 }, { "epoch": 1.9258750408897614, "grad_norm": 1.0535590648651123, "learning_rate": 4.7586238146291785e-05, "loss": 0.0725, "step": 7360 }, { "epoch": 1.928491985606804, "grad_norm": 1.2395167350769043, "learning_rate": 4.757737001133562e-05, "loss": 0.0693, "step": 7370 }, { "epoch": 1.931108930323847, "grad_norm": 0.5386394262313843, "learning_rate": 4.7568486444959945e-05, "loss": 0.0679, "step": 7380 }, { "epoch": 1.9337258750408899, "grad_norm": 0.6242868900299072, "learning_rate": 4.75595874532366e-05, "loss": 0.0708, "step": 7390 }, { "epoch": 1.9363428197579327, "grad_norm": 0.6938721537590027, "learning_rate": 4.755067304224795e-05, "loss": 0.0597, "step": 7400 }, { "epoch": 1.9389597644749754, "grad_norm": 0.4725117087364197, "learning_rate": 4.754174321808691e-05, "loss": 0.0585, "step": 7410 }, { "epoch": 1.9415767091920184, "grad_norm": 0.5444639325141907, "learning_rate": 4.753279798685695e-05, "loss": 0.0686, "step": 7420 }, { "epoch": 1.9441936539090612, "grad_norm": 0.48581692576408386, "learning_rate": 4.752383735467202e-05, "loss": 0.0645, "step": 7430 }, { "epoch": 1.946810598626104, "grad_norm": 0.643066942691803, "learning_rate": 4.751486132765666e-05, "loss": 0.062, "step": 7440 }, { "epoch": 1.949427543343147, "grad_norm": 0.34584930539131165, "learning_rate": 4.750586991194588e-05, "loss": 0.0612, "step": 7450 }, { "epoch": 1.9520444880601897, "grad_norm": 0.7048997282981873, "learning_rate": 4.749686311368523e-05, "loss": 0.0678, "step": 7460 }, { "epoch": 1.9546614327772325, "grad_norm": 0.7045301198959351, "learning_rate": 4.748784093903078e-05, "loss": 0.0656, "step": 7470 }, { "epoch": 1.9572783774942755, "grad_norm": 0.6391457319259644, "learning_rate": 4.7478803394149094e-05, "loss": 0.0628, "step": 7480 }, { "epoch": 1.9598953222113182, "grad_norm": 0.699043869972229, "learning_rate": 4.746975048521725e-05, "loss": 0.0575, "step": 7490 }, { "epoch": 1.962512266928361, "grad_norm": 0.7100141048431396, "learning_rate": 4.746068221842282e-05, "loss": 0.0649, "step": 7500 }, { "epoch": 1.965129211645404, "grad_norm": 0.6790652871131897, "learning_rate": 4.74515985999639e-05, "loss": 0.0617, "step": 7510 }, { "epoch": 1.967746156362447, "grad_norm": 0.7839459180831909, "learning_rate": 4.744249963604903e-05, "loss": 0.0668, "step": 7520 }, { "epoch": 1.9703631010794895, "grad_norm": 0.7605068683624268, "learning_rate": 4.743338533289728e-05, "loss": 0.0641, "step": 7530 }, { "epoch": 1.9729800457965325, "grad_norm": 0.6718177795410156, "learning_rate": 4.7424255696738195e-05, "loss": 0.0615, "step": 7540 }, { "epoch": 1.9755969905135755, "grad_norm": 0.47757506370544434, "learning_rate": 4.741511073381179e-05, "loss": 0.0606, "step": 7550 }, { "epoch": 1.9782139352306183, "grad_norm": 0.7829891443252563, "learning_rate": 4.740595045036855e-05, "loss": 0.073, "step": 7560 }, { "epoch": 1.980830879947661, "grad_norm": 0.4041369557380676, "learning_rate": 4.739677485266946e-05, "loss": 0.0623, "step": 7570 }, { "epoch": 1.983447824664704, "grad_norm": 0.46687090396881104, "learning_rate": 4.7387583946985946e-05, "loss": 0.0581, "step": 7580 }, { "epoch": 1.9860647693817468, "grad_norm": 0.5236268639564514, "learning_rate": 4.7378377739599914e-05, "loss": 0.0573, "step": 7590 }, { "epoch": 1.9886817140987896, "grad_norm": 0.46081119775772095, "learning_rate": 4.73691562368037e-05, "loss": 0.0662, "step": 7600 }, { "epoch": 1.9912986588158326, "grad_norm": 0.764456033706665, "learning_rate": 4.735991944490014e-05, "loss": 0.0593, "step": 7610 }, { "epoch": 1.9939156035328753, "grad_norm": 0.5772601962089539, "learning_rate": 4.735066737020247e-05, "loss": 0.0623, "step": 7620 }, { "epoch": 1.996532548249918, "grad_norm": 0.5460513234138489, "learning_rate": 4.734140001903441e-05, "loss": 0.0633, "step": 7630 }, { "epoch": 1.999149492966961, "grad_norm": 0.8035596609115601, "learning_rate": 4.73321173977301e-05, "loss": 0.0666, "step": 7640 }, { "epoch": 2.0015701668302257, "grad_norm": 0.638861358165741, "learning_rate": 4.732281951263413e-05, "loss": 0.0611, "step": 7650 }, { "epoch": 2.0041871115472687, "grad_norm": 0.455998033285141, "learning_rate": 4.7313506370101515e-05, "loss": 0.0681, "step": 7660 }, { "epoch": 2.0068040562643112, "grad_norm": 0.7048136591911316, "learning_rate": 4.73041779764977e-05, "loss": 0.0731, "step": 7670 }, { "epoch": 2.0094210009813542, "grad_norm": 0.5252156257629395, "learning_rate": 4.729483433819856e-05, "loss": 0.0605, "step": 7680 }, { "epoch": 2.0120379456983972, "grad_norm": 0.6412255167961121, "learning_rate": 4.728547546159037e-05, "loss": 0.0623, "step": 7690 }, { "epoch": 2.0146548904154398, "grad_norm": 0.8844165205955505, "learning_rate": 4.7276101353069843e-05, "loss": 0.0688, "step": 7700 }, { "epoch": 2.0172718351324828, "grad_norm": 0.4701058864593506, "learning_rate": 4.72667120190441e-05, "loss": 0.0675, "step": 7710 }, { "epoch": 2.0198887798495258, "grad_norm": 0.5482307076454163, "learning_rate": 4.7257307465930644e-05, "loss": 0.0628, "step": 7720 }, { "epoch": 2.0225057245665687, "grad_norm": 0.7286236882209778, "learning_rate": 4.724788770015741e-05, "loss": 0.0613, "step": 7730 }, { "epoch": 2.0251226692836113, "grad_norm": 0.49437442421913147, "learning_rate": 4.723845272816272e-05, "loss": 0.0611, "step": 7740 }, { "epoch": 2.0277396140006543, "grad_norm": 0.4415714740753174, "learning_rate": 4.722900255639529e-05, "loss": 0.0639, "step": 7750 }, { "epoch": 2.0303565587176973, "grad_norm": 0.4924696683883667, "learning_rate": 4.721953719131422e-05, "loss": 0.0647, "step": 7760 }, { "epoch": 2.03297350343474, "grad_norm": 0.4595443606376648, "learning_rate": 4.721005663938899e-05, "loss": 0.0686, "step": 7770 }, { "epoch": 2.035590448151783, "grad_norm": 0.6738218069076538, "learning_rate": 4.7200560907099476e-05, "loss": 0.0614, "step": 7780 }, { "epoch": 2.038207392868826, "grad_norm": 0.7287262082099915, "learning_rate": 4.719105000093593e-05, "loss": 0.06, "step": 7790 }, { "epoch": 2.0408243375858683, "grad_norm": 0.43699556589126587, "learning_rate": 4.718152392739895e-05, "loss": 0.0668, "step": 7800 }, { "epoch": 2.0434412823029113, "grad_norm": 0.8281365633010864, "learning_rate": 4.717198269299953e-05, "loss": 0.0621, "step": 7810 }, { "epoch": 2.0460582270199543, "grad_norm": 0.626327633857727, "learning_rate": 4.7162426304259e-05, "loss": 0.056, "step": 7820 }, { "epoch": 2.048675171736997, "grad_norm": 0.49688369035720825, "learning_rate": 4.715285476770908e-05, "loss": 0.0634, "step": 7830 }, { "epoch": 2.05129211645404, "grad_norm": 0.4797987937927246, "learning_rate": 4.714326808989181e-05, "loss": 0.062, "step": 7840 }, { "epoch": 2.053909061171083, "grad_norm": 0.4971781075000763, "learning_rate": 4.71336662773596e-05, "loss": 0.0617, "step": 7850 }, { "epoch": 2.0565260058881254, "grad_norm": 0.5491329431533813, "learning_rate": 4.71240493366752e-05, "loss": 0.0654, "step": 7860 }, { "epoch": 2.0591429506051684, "grad_norm": 0.5640793442726135, "learning_rate": 4.711441727441169e-05, "loss": 0.0598, "step": 7870 }, { "epoch": 2.0617598953222114, "grad_norm": 0.7192339301109314, "learning_rate": 4.71047700971525e-05, "loss": 0.0633, "step": 7880 }, { "epoch": 2.0643768400392544, "grad_norm": 0.5545005202293396, "learning_rate": 4.709510781149139e-05, "loss": 0.0686, "step": 7890 }, { "epoch": 2.066993784756297, "grad_norm": 0.7110175490379333, "learning_rate": 4.708543042403243e-05, "loss": 0.0642, "step": 7900 }, { "epoch": 2.06961072947334, "grad_norm": 0.5228858590126038, "learning_rate": 4.707573794139003e-05, "loss": 0.0693, "step": 7910 }, { "epoch": 2.072227674190383, "grad_norm": 0.4876290261745453, "learning_rate": 4.706603037018891e-05, "loss": 0.0619, "step": 7920 }, { "epoch": 2.0748446189074254, "grad_norm": 0.5120744705200195, "learning_rate": 4.7056307717064094e-05, "loss": 0.0605, "step": 7930 }, { "epoch": 2.0774615636244684, "grad_norm": 0.6547344326972961, "learning_rate": 4.704656998866094e-05, "loss": 0.0674, "step": 7940 }, { "epoch": 2.0800785083415114, "grad_norm": 0.7206751108169556, "learning_rate": 4.703681719163509e-05, "loss": 0.0669, "step": 7950 }, { "epoch": 2.082695453058554, "grad_norm": 0.6250445246696472, "learning_rate": 4.702704933265248e-05, "loss": 0.0661, "step": 7960 }, { "epoch": 2.085312397775597, "grad_norm": 0.39483657479286194, "learning_rate": 4.701726641838935e-05, "loss": 0.0721, "step": 7970 }, { "epoch": 2.08792934249264, "grad_norm": 0.8020210266113281, "learning_rate": 4.700746845553223e-05, "loss": 0.0721, "step": 7980 }, { "epoch": 2.0905462872096825, "grad_norm": 0.7316377758979797, "learning_rate": 4.699765545077795e-05, "loss": 0.0663, "step": 7990 }, { "epoch": 2.0931632319267255, "grad_norm": 0.6087160706520081, "learning_rate": 4.698782741083359e-05, "loss": 0.0655, "step": 8000 }, { "epoch": 2.0931632319267255, "eval_loss": 0.06413635517794444, "eval_runtime": 8.6044, "eval_samples_per_second": 119.01, "eval_steps_per_second": 1.86, "step": 8000 }, { "epoch": 2.0957801766437685, "grad_norm": 0.7263542413711548, "learning_rate": 4.6977984342416524e-05, "loss": 0.06, "step": 8010 }, { "epoch": 2.0983971213608115, "grad_norm": 0.6284407377243042, "learning_rate": 4.696812625225441e-05, "loss": 0.0647, "step": 8020 }, { "epoch": 2.101014066077854, "grad_norm": 0.594032347202301, "learning_rate": 4.695825314708514e-05, "loss": 0.0657, "step": 8030 }, { "epoch": 2.103631010794897, "grad_norm": 0.6179496645927429, "learning_rate": 4.69483650336569e-05, "loss": 0.0645, "step": 8040 }, { "epoch": 2.10624795551194, "grad_norm": 0.8740331530570984, "learning_rate": 4.693846191872812e-05, "loss": 0.0669, "step": 8050 }, { "epoch": 2.1088649002289825, "grad_norm": 0.4465997517108917, "learning_rate": 4.692854380906748e-05, "loss": 0.0605, "step": 8060 }, { "epoch": 2.1114818449460255, "grad_norm": 0.46578449010849, "learning_rate": 4.6918610711453936e-05, "loss": 0.0636, "step": 8070 }, { "epoch": 2.1140987896630685, "grad_norm": 0.6705744862556458, "learning_rate": 4.690866263267664e-05, "loss": 0.0654, "step": 8080 }, { "epoch": 2.116715734380111, "grad_norm": 0.5965505242347717, "learning_rate": 4.689869957953502e-05, "loss": 0.0643, "step": 8090 }, { "epoch": 2.119332679097154, "grad_norm": 0.5712394714355469, "learning_rate": 4.688872155883873e-05, "loss": 0.0656, "step": 8100 }, { "epoch": 2.121949623814197, "grad_norm": 0.9391905665397644, "learning_rate": 4.687872857740766e-05, "loss": 0.0591, "step": 8110 }, { "epoch": 2.1245665685312396, "grad_norm": 0.769233763217926, "learning_rate": 4.686872064207191e-05, "loss": 0.0639, "step": 8120 }, { "epoch": 2.1271835132482826, "grad_norm": 0.7062404751777649, "learning_rate": 4.6858697759671796e-05, "loss": 0.0663, "step": 8130 }, { "epoch": 2.1298004579653256, "grad_norm": 0.59275883436203, "learning_rate": 4.68486599370579e-05, "loss": 0.0619, "step": 8140 }, { "epoch": 2.1324174026823686, "grad_norm": 0.5705215930938721, "learning_rate": 4.683860718109094e-05, "loss": 0.0648, "step": 8150 }, { "epoch": 2.135034347399411, "grad_norm": 0.6727226972579956, "learning_rate": 4.6828539498641913e-05, "loss": 0.0676, "step": 8160 }, { "epoch": 2.137651292116454, "grad_norm": 0.5288172364234924, "learning_rate": 4.6818456896591956e-05, "loss": 0.0647, "step": 8170 }, { "epoch": 2.140268236833497, "grad_norm": 0.7117987871170044, "learning_rate": 4.6808359381832456e-05, "loss": 0.0589, "step": 8180 }, { "epoch": 2.1428851815505396, "grad_norm": 0.41700848937034607, "learning_rate": 4.679824696126495e-05, "loss": 0.062, "step": 8190 }, { "epoch": 2.1455021262675826, "grad_norm": 0.7077834606170654, "learning_rate": 4.67881196418012e-05, "loss": 0.0647, "step": 8200 }, { "epoch": 2.1481190709846256, "grad_norm": 0.4548968970775604, "learning_rate": 4.677797743036312e-05, "loss": 0.0592, "step": 8210 }, { "epoch": 2.150736015701668, "grad_norm": 0.6429287791252136, "learning_rate": 4.6767820333882815e-05, "loss": 0.0576, "step": 8220 }, { "epoch": 2.153352960418711, "grad_norm": 0.6636003255844116, "learning_rate": 4.675764835930258e-05, "loss": 0.0579, "step": 8230 }, { "epoch": 2.155969905135754, "grad_norm": 0.5616292953491211, "learning_rate": 4.6747461513574845e-05, "loss": 0.0662, "step": 8240 }, { "epoch": 2.1585868498527967, "grad_norm": 0.4538356065750122, "learning_rate": 4.6737259803662236e-05, "loss": 0.0701, "step": 8250 }, { "epoch": 2.1612037945698397, "grad_norm": 0.6878339648246765, "learning_rate": 4.672704323653753e-05, "loss": 0.0592, "step": 8260 }, { "epoch": 2.1638207392868827, "grad_norm": 0.586982786655426, "learning_rate": 4.671681181918363e-05, "loss": 0.0652, "step": 8270 }, { "epoch": 2.1664376840039252, "grad_norm": 0.5167317390441895, "learning_rate": 4.670656555859364e-05, "loss": 0.064, "step": 8280 }, { "epoch": 2.169054628720968, "grad_norm": 0.42640289664268494, "learning_rate": 4.6696304461770765e-05, "loss": 0.0645, "step": 8290 }, { "epoch": 2.171671573438011, "grad_norm": 0.41746556758880615, "learning_rate": 4.668602853572838e-05, "loss": 0.0609, "step": 8300 }, { "epoch": 2.1742885181550538, "grad_norm": 0.7875781059265137, "learning_rate": 4.667573778748997e-05, "loss": 0.0604, "step": 8310 }, { "epoch": 2.1769054628720967, "grad_norm": 0.9152212738990784, "learning_rate": 4.6665432224089176e-05, "loss": 0.063, "step": 8320 }, { "epoch": 2.1795224075891397, "grad_norm": 0.7127177715301514, "learning_rate": 4.6655111852569754e-05, "loss": 0.0668, "step": 8330 }, { "epoch": 2.1821393523061827, "grad_norm": 0.5886343717575073, "learning_rate": 4.664477667998557e-05, "loss": 0.0617, "step": 8340 }, { "epoch": 2.1847562970232253, "grad_norm": 0.4000236988067627, "learning_rate": 4.6634426713400625e-05, "loss": 0.0554, "step": 8350 }, { "epoch": 2.1873732417402683, "grad_norm": 0.6061784625053406, "learning_rate": 4.662406195988903e-05, "loss": 0.0614, "step": 8360 }, { "epoch": 2.1899901864573112, "grad_norm": 0.5097105503082275, "learning_rate": 4.6613682426534975e-05, "loss": 0.0573, "step": 8370 }, { "epoch": 2.192607131174354, "grad_norm": 0.6182939410209656, "learning_rate": 4.66032881204328e-05, "loss": 0.0626, "step": 8380 }, { "epoch": 2.195224075891397, "grad_norm": 0.5252171158790588, "learning_rate": 4.6592879048686886e-05, "loss": 0.0637, "step": 8390 }, { "epoch": 2.1978410206084398, "grad_norm": 0.6885454058647156, "learning_rate": 4.6582455218411755e-05, "loss": 0.064, "step": 8400 }, { "epoch": 2.2004579653254823, "grad_norm": 0.6499890685081482, "learning_rate": 4.6572016636732e-05, "loss": 0.064, "step": 8410 }, { "epoch": 2.2030749100425253, "grad_norm": 0.3553354740142822, "learning_rate": 4.656156331078229e-05, "loss": 0.062, "step": 8420 }, { "epoch": 2.2056918547595683, "grad_norm": 0.711338460445404, "learning_rate": 4.6551095247707354e-05, "loss": 0.0612, "step": 8430 }, { "epoch": 2.208308799476611, "grad_norm": 0.7364850044250488, "learning_rate": 4.6540612454662044e-05, "loss": 0.0631, "step": 8440 }, { "epoch": 2.210925744193654, "grad_norm": 0.5707940459251404, "learning_rate": 4.653011493881123e-05, "loss": 0.0714, "step": 8450 }, { "epoch": 2.213542688910697, "grad_norm": 0.8141592144966125, "learning_rate": 4.651960270732987e-05, "loss": 0.0649, "step": 8460 }, { "epoch": 2.21615963362774, "grad_norm": 0.6246472597122192, "learning_rate": 4.650907576740299e-05, "loss": 0.0592, "step": 8470 }, { "epoch": 2.2187765783447824, "grad_norm": 0.5534509420394897, "learning_rate": 4.649853412622563e-05, "loss": 0.0614, "step": 8480 }, { "epoch": 2.2213935230618254, "grad_norm": 1.4337730407714844, "learning_rate": 4.6487977791002914e-05, "loss": 0.0613, "step": 8490 }, { "epoch": 2.2240104677788683, "grad_norm": 0.5923467874526978, "learning_rate": 4.647740676895001e-05, "loss": 0.0657, "step": 8500 }, { "epoch": 2.226627412495911, "grad_norm": 0.7094632983207703, "learning_rate": 4.646682106729208e-05, "loss": 0.067, "step": 8510 }, { "epoch": 2.229244357212954, "grad_norm": 0.5530036091804504, "learning_rate": 4.645622069326439e-05, "loss": 0.0584, "step": 8520 }, { "epoch": 2.231861301929997, "grad_norm": 0.5729954838752747, "learning_rate": 4.6445605654112156e-05, "loss": 0.064, "step": 8530 }, { "epoch": 2.2344782466470394, "grad_norm": 0.736095130443573, "learning_rate": 4.6434975957090686e-05, "loss": 0.0708, "step": 8540 }, { "epoch": 2.2370951913640824, "grad_norm": 0.5560267567634583, "learning_rate": 4.642433160946528e-05, "loss": 0.0572, "step": 8550 }, { "epoch": 2.2397121360811254, "grad_norm": 0.7600628733634949, "learning_rate": 4.641367261851122e-05, "loss": 0.0676, "step": 8560 }, { "epoch": 2.242329080798168, "grad_norm": 0.8181315660476685, "learning_rate": 4.6402998991513855e-05, "loss": 0.0745, "step": 8570 }, { "epoch": 2.244946025515211, "grad_norm": 0.6807851791381836, "learning_rate": 4.6392310735768495e-05, "loss": 0.0576, "step": 8580 }, { "epoch": 2.247562970232254, "grad_norm": 0.6910809278488159, "learning_rate": 4.638160785858047e-05, "loss": 0.0641, "step": 8590 }, { "epoch": 2.250179914949297, "grad_norm": 0.7639349102973938, "learning_rate": 4.637089036726508e-05, "loss": 0.0634, "step": 8600 }, { "epoch": 2.2527968596663395, "grad_norm": 0.6723827123641968, "learning_rate": 4.636015826914765e-05, "loss": 0.0658, "step": 8610 }, { "epoch": 2.2554138043833825, "grad_norm": 0.5631839036941528, "learning_rate": 4.634941157156345e-05, "loss": 0.0585, "step": 8620 }, { "epoch": 2.258030749100425, "grad_norm": 0.5604969263076782, "learning_rate": 4.6338650281857756e-05, "loss": 0.0636, "step": 8630 }, { "epoch": 2.260647693817468, "grad_norm": 0.6197567582130432, "learning_rate": 4.6327874407385805e-05, "loss": 0.0615, "step": 8640 }, { "epoch": 2.263264638534511, "grad_norm": 0.5681231617927551, "learning_rate": 4.631708395551281e-05, "loss": 0.0673, "step": 8650 }, { "epoch": 2.265881583251554, "grad_norm": 0.5107356905937195, "learning_rate": 4.630627893361393e-05, "loss": 0.0592, "step": 8660 }, { "epoch": 2.2684985279685965, "grad_norm": 0.7415857315063477, "learning_rate": 4.629545934907432e-05, "loss": 0.066, "step": 8670 }, { "epoch": 2.2711154726856395, "grad_norm": 0.527061402797699, "learning_rate": 4.6284625209289037e-05, "loss": 0.0675, "step": 8680 }, { "epoch": 2.2737324174026825, "grad_norm": 0.626235842704773, "learning_rate": 4.627377652166313e-05, "loss": 0.0561, "step": 8690 }, { "epoch": 2.276349362119725, "grad_norm": 0.5250177979469299, "learning_rate": 4.6262913293611567e-05, "loss": 0.0543, "step": 8700 }, { "epoch": 2.278966306836768, "grad_norm": 0.6569236516952515, "learning_rate": 4.6252035532559266e-05, "loss": 0.065, "step": 8710 }, { "epoch": 2.281583251553811, "grad_norm": 0.8803039193153381, "learning_rate": 4.6241143245941076e-05, "loss": 0.065, "step": 8720 }, { "epoch": 2.284200196270854, "grad_norm": 0.7301013469696045, "learning_rate": 4.623023644120177e-05, "loss": 0.0571, "step": 8730 }, { "epoch": 2.2868171409878966, "grad_norm": 0.44893473386764526, "learning_rate": 4.621931512579604e-05, "loss": 0.0664, "step": 8740 }, { "epoch": 2.2894340857049396, "grad_norm": 0.6927379965782166, "learning_rate": 4.620837930718852e-05, "loss": 0.0591, "step": 8750 }, { "epoch": 2.292051030421982, "grad_norm": 0.4672520160675049, "learning_rate": 4.619742899285371e-05, "loss": 0.0572, "step": 8760 }, { "epoch": 2.294667975139025, "grad_norm": 0.9525076150894165, "learning_rate": 4.6186464190276076e-05, "loss": 0.0697, "step": 8770 }, { "epoch": 2.297284919856068, "grad_norm": 0.5259708166122437, "learning_rate": 4.617548490694994e-05, "loss": 0.0638, "step": 8780 }, { "epoch": 2.299901864573111, "grad_norm": 0.53282231092453, "learning_rate": 4.616449115037954e-05, "loss": 0.0666, "step": 8790 }, { "epoch": 2.3025188092901536, "grad_norm": 0.6173849105834961, "learning_rate": 4.6153482928079006e-05, "loss": 0.0585, "step": 8800 }, { "epoch": 2.3051357540071966, "grad_norm": 0.6748325228691101, "learning_rate": 4.614246024757237e-05, "loss": 0.0545, "step": 8810 }, { "epoch": 2.3077526987242396, "grad_norm": 0.8505156636238098, "learning_rate": 4.61314231163935e-05, "loss": 0.065, "step": 8820 }, { "epoch": 2.310369643441282, "grad_norm": 0.6660551428794861, "learning_rate": 4.612037154208619e-05, "loss": 0.0623, "step": 8830 }, { "epoch": 2.312986588158325, "grad_norm": 0.43942147493362427, "learning_rate": 4.610930553220409e-05, "loss": 0.0636, "step": 8840 }, { "epoch": 2.315603532875368, "grad_norm": 0.7198824286460876, "learning_rate": 4.609822509431071e-05, "loss": 0.0681, "step": 8850 }, { "epoch": 2.3182204775924107, "grad_norm": 1.167868971824646, "learning_rate": 4.608713023597941e-05, "loss": 0.0617, "step": 8860 }, { "epoch": 2.3208374223094537, "grad_norm": 0.40410086512565613, "learning_rate": 4.607602096479345e-05, "loss": 0.0652, "step": 8870 }, { "epoch": 2.3234543670264967, "grad_norm": 0.8580119013786316, "learning_rate": 4.606489728834589e-05, "loss": 0.0601, "step": 8880 }, { "epoch": 2.326071311743539, "grad_norm": 0.5383431315422058, "learning_rate": 4.6053759214239654e-05, "loss": 0.0653, "step": 8890 }, { "epoch": 2.328688256460582, "grad_norm": 0.8677040934562683, "learning_rate": 4.604260675008753e-05, "loss": 0.0604, "step": 8900 }, { "epoch": 2.331305201177625, "grad_norm": 0.5791143178939819, "learning_rate": 4.603143990351211e-05, "loss": 0.0634, "step": 8910 }, { "epoch": 2.333922145894668, "grad_norm": 1.2247254848480225, "learning_rate": 4.602025868214583e-05, "loss": 0.0683, "step": 8920 }, { "epoch": 2.3365390906117107, "grad_norm": 0.6355758905410767, "learning_rate": 4.600906309363095e-05, "loss": 0.0747, "step": 8930 }, { "epoch": 2.3391560353287537, "grad_norm": 0.5947376489639282, "learning_rate": 4.599785314561955e-05, "loss": 0.0615, "step": 8940 }, { "epoch": 2.3417729800457967, "grad_norm": 0.5728808045387268, "learning_rate": 4.598662884577352e-05, "loss": 0.0636, "step": 8950 }, { "epoch": 2.3443899247628393, "grad_norm": 0.4856645464897156, "learning_rate": 4.597539020176457e-05, "loss": 0.0632, "step": 8960 }, { "epoch": 2.3470068694798822, "grad_norm": 1.0914742946624756, "learning_rate": 4.5964137221274195e-05, "loss": 0.0693, "step": 8970 }, { "epoch": 2.3496238141969252, "grad_norm": 0.5342974066734314, "learning_rate": 4.595286991199372e-05, "loss": 0.0624, "step": 8980 }, { "epoch": 2.3522407589139678, "grad_norm": 0.7614314556121826, "learning_rate": 4.5941588281624226e-05, "loss": 0.0677, "step": 8990 }, { "epoch": 2.3548577036310108, "grad_norm": 0.4510788023471832, "learning_rate": 4.593029233787661e-05, "loss": 0.0646, "step": 9000 }, { "epoch": 2.3548577036310108, "eval_loss": 0.06636923542647824, "eval_runtime": 8.6164, "eval_samples_per_second": 118.843, "eval_steps_per_second": 1.857, "step": 9000 }, { "epoch": 2.3574746483480538, "grad_norm": 0.5777572989463806, "learning_rate": 4.5918982088471544e-05, "loss": 0.0708, "step": 9010 }, { "epoch": 2.3600915930650963, "grad_norm": 0.6823755502700806, "learning_rate": 4.5907657541139484e-05, "loss": 0.0645, "step": 9020 }, { "epoch": 2.3627085377821393, "grad_norm": 0.5664687156677246, "learning_rate": 4.5896318703620626e-05, "loss": 0.0686, "step": 9030 }, { "epoch": 2.3653254824991823, "grad_norm": 0.5852131247520447, "learning_rate": 4.588496558366498e-05, "loss": 0.0567, "step": 9040 }, { "epoch": 2.3679424272162253, "grad_norm": 0.5248322486877441, "learning_rate": 4.58735981890323e-05, "loss": 0.058, "step": 9050 }, { "epoch": 2.370559371933268, "grad_norm": 0.46559926867485046, "learning_rate": 4.586221652749207e-05, "loss": 0.0648, "step": 9060 }, { "epoch": 2.373176316650311, "grad_norm": 0.6136702299118042, "learning_rate": 4.585082060682357e-05, "loss": 0.0611, "step": 9070 }, { "epoch": 2.375793261367354, "grad_norm": 0.729214072227478, "learning_rate": 4.583941043481579e-05, "loss": 0.0652, "step": 9080 }, { "epoch": 2.3784102060843963, "grad_norm": 0.5812839269638062, "learning_rate": 4.5827986019267496e-05, "loss": 0.0608, "step": 9090 }, { "epoch": 2.3810271508014393, "grad_norm": 0.6233426928520203, "learning_rate": 4.581654736798714e-05, "loss": 0.0654, "step": 9100 }, { "epoch": 2.3836440955184823, "grad_norm": 0.7243742942810059, "learning_rate": 4.5805094488792956e-05, "loss": 0.0563, "step": 9110 }, { "epoch": 2.386261040235525, "grad_norm": 0.7165740728378296, "learning_rate": 4.579362738951286e-05, "loss": 0.0666, "step": 9120 }, { "epoch": 2.388877984952568, "grad_norm": 0.6373894810676575, "learning_rate": 4.5782146077984523e-05, "loss": 0.0639, "step": 9130 }, { "epoch": 2.391494929669611, "grad_norm": 0.4645223319530487, "learning_rate": 4.577065056205531e-05, "loss": 0.0592, "step": 9140 }, { "epoch": 2.3941118743866534, "grad_norm": 0.6769869327545166, "learning_rate": 4.5759140849582276e-05, "loss": 0.0659, "step": 9150 }, { "epoch": 2.3967288191036964, "grad_norm": 0.6146237850189209, "learning_rate": 4.574761694843222e-05, "loss": 0.0603, "step": 9160 }, { "epoch": 2.3993457638207394, "grad_norm": 0.7659674882888794, "learning_rate": 4.5736078866481634e-05, "loss": 0.0601, "step": 9170 }, { "epoch": 2.4019627085377824, "grad_norm": 0.8440955281257629, "learning_rate": 4.572452661161667e-05, "loss": 0.059, "step": 9180 }, { "epoch": 2.404579653254825, "grad_norm": 0.6863587498664856, "learning_rate": 4.571296019173318e-05, "loss": 0.0648, "step": 9190 }, { "epoch": 2.407196597971868, "grad_norm": 0.41940245032310486, "learning_rate": 4.5701379614736715e-05, "loss": 0.0717, "step": 9200 }, { "epoch": 2.4098135426889105, "grad_norm": 0.9120506048202515, "learning_rate": 4.568978488854248e-05, "loss": 0.0664, "step": 9210 }, { "epoch": 2.4124304874059534, "grad_norm": 0.38293400406837463, "learning_rate": 4.567817602107537e-05, "loss": 0.0611, "step": 9220 }, { "epoch": 2.4150474321229964, "grad_norm": 0.8800689578056335, "learning_rate": 4.566655302026993e-05, "loss": 0.0677, "step": 9230 }, { "epoch": 2.4176643768400394, "grad_norm": 0.5821717381477356, "learning_rate": 4.5654915894070384e-05, "loss": 0.0567, "step": 9240 }, { "epoch": 2.420281321557082, "grad_norm": 0.6287429928779602, "learning_rate": 4.564326465043058e-05, "loss": 0.0649, "step": 9250 }, { "epoch": 2.422898266274125, "grad_norm": 0.3861464560031891, "learning_rate": 4.563159929731404e-05, "loss": 0.0621, "step": 9260 }, { "epoch": 2.425515210991168, "grad_norm": 0.9000930190086365, "learning_rate": 4.5619919842693935e-05, "loss": 0.0663, "step": 9270 }, { "epoch": 2.4281321557082105, "grad_norm": 0.7210332155227661, "learning_rate": 4.5608226294553044e-05, "loss": 0.0635, "step": 9280 }, { "epoch": 2.4307491004252535, "grad_norm": 1.0638445615768433, "learning_rate": 4.559651866088381e-05, "loss": 0.0595, "step": 9290 }, { "epoch": 2.4333660451422965, "grad_norm": 0.6221959590911865, "learning_rate": 4.558479694968828e-05, "loss": 0.0697, "step": 9300 }, { "epoch": 2.435982989859339, "grad_norm": 0.46246451139450073, "learning_rate": 4.557306116897814e-05, "loss": 0.0624, "step": 9310 }, { "epoch": 2.438599934576382, "grad_norm": 0.43070271611213684, "learning_rate": 4.556131132677468e-05, "loss": 0.0614, "step": 9320 }, { "epoch": 2.441216879293425, "grad_norm": 0.44614386558532715, "learning_rate": 4.554954743110881e-05, "loss": 0.0582, "step": 9330 }, { "epoch": 2.4438338240104676, "grad_norm": 0.6269716024398804, "learning_rate": 4.553776949002104e-05, "loss": 0.0694, "step": 9340 }, { "epoch": 2.4464507687275105, "grad_norm": 0.7049176692962646, "learning_rate": 4.552597751156149e-05, "loss": 0.0636, "step": 9350 }, { "epoch": 2.4490677134445535, "grad_norm": 0.6448350548744202, "learning_rate": 4.551417150378986e-05, "loss": 0.0635, "step": 9360 }, { "epoch": 2.4516846581615965, "grad_norm": 0.5977907180786133, "learning_rate": 4.550235147477544e-05, "loss": 0.0566, "step": 9370 }, { "epoch": 2.454301602878639, "grad_norm": 0.5693519115447998, "learning_rate": 4.5490517432597115e-05, "loss": 0.0568, "step": 9380 }, { "epoch": 2.456918547595682, "grad_norm": 0.6188995838165283, "learning_rate": 4.547866938534333e-05, "loss": 0.06, "step": 9390 }, { "epoch": 2.459535492312725, "grad_norm": 0.5704085230827332, "learning_rate": 4.546680734111213e-05, "loss": 0.0623, "step": 9400 }, { "epoch": 2.4621524370297676, "grad_norm": 0.5484632849693298, "learning_rate": 4.5454931308011106e-05, "loss": 0.0634, "step": 9410 }, { "epoch": 2.4647693817468106, "grad_norm": 0.5368353724479675, "learning_rate": 4.544304129415741e-05, "loss": 0.0601, "step": 9420 }, { "epoch": 2.4673863264638536, "grad_norm": 0.5148277878761292, "learning_rate": 4.543113730767775e-05, "loss": 0.0569, "step": 9430 }, { "epoch": 2.470003271180896, "grad_norm": 0.5295160412788391, "learning_rate": 4.5419219356708396e-05, "loss": 0.0606, "step": 9440 }, { "epoch": 2.472620215897939, "grad_norm": 0.5751885175704956, "learning_rate": 4.540728744939515e-05, "loss": 0.0599, "step": 9450 }, { "epoch": 2.475237160614982, "grad_norm": 0.5783774256706238, "learning_rate": 4.539534159389337e-05, "loss": 0.0637, "step": 9460 }, { "epoch": 2.4778541053320247, "grad_norm": 0.582295298576355, "learning_rate": 4.538338179836793e-05, "loss": 0.0646, "step": 9470 }, { "epoch": 2.4804710500490676, "grad_norm": 0.7552310824394226, "learning_rate": 4.5371408070993225e-05, "loss": 0.0597, "step": 9480 }, { "epoch": 2.4830879947661106, "grad_norm": 0.5856291651725769, "learning_rate": 4.535942041995319e-05, "loss": 0.0605, "step": 9490 }, { "epoch": 2.4857049394831536, "grad_norm": 0.7458827495574951, "learning_rate": 4.5347418853441295e-05, "loss": 0.0526, "step": 9500 }, { "epoch": 2.488321884200196, "grad_norm": 0.40024372935295105, "learning_rate": 4.533540337966046e-05, "loss": 0.0575, "step": 9510 }, { "epoch": 2.490938828917239, "grad_norm": 0.5775712132453918, "learning_rate": 4.532337400682317e-05, "loss": 0.0626, "step": 9520 }, { "epoch": 2.493555773634282, "grad_norm": 0.5049796104431152, "learning_rate": 4.531133074315139e-05, "loss": 0.0545, "step": 9530 }, { "epoch": 2.4961727183513247, "grad_norm": 0.46351227164268494, "learning_rate": 4.529927359687657e-05, "loss": 0.0591, "step": 9540 }, { "epoch": 2.4987896630683677, "grad_norm": 0.37615618109703064, "learning_rate": 4.528720257623966e-05, "loss": 0.056, "step": 9550 }, { "epoch": 2.5014066077854107, "grad_norm": 0.37017613649368286, "learning_rate": 4.5275117689491076e-05, "loss": 0.0597, "step": 9560 }, { "epoch": 2.5040235525024532, "grad_norm": 0.3615386486053467, "learning_rate": 4.5263018944890744e-05, "loss": 0.0575, "step": 9570 }, { "epoch": 2.506640497219496, "grad_norm": 0.48490145802497864, "learning_rate": 4.525090635070803e-05, "loss": 0.0646, "step": 9580 }, { "epoch": 2.509257441936539, "grad_norm": 0.5488083958625793, "learning_rate": 4.523877991522178e-05, "loss": 0.0597, "step": 9590 }, { "epoch": 2.5118743866535818, "grad_norm": 1.2636692523956299, "learning_rate": 4.522663964672029e-05, "loss": 0.0613, "step": 9600 }, { "epoch": 2.5144913313706247, "grad_norm": 0.6536663174629211, "learning_rate": 4.521448555350134e-05, "loss": 0.0558, "step": 9610 }, { "epoch": 2.5171082760876677, "grad_norm": 0.36221152544021606, "learning_rate": 4.5202317643872114e-05, "loss": 0.0604, "step": 9620 }, { "epoch": 2.5197252208047107, "grad_norm": 0.465022474527359, "learning_rate": 4.519013592614928e-05, "loss": 0.0659, "step": 9630 }, { "epoch": 2.5223421655217533, "grad_norm": 0.6671952605247498, "learning_rate": 4.517794040865892e-05, "loss": 0.0611, "step": 9640 }, { "epoch": 2.5249591102387963, "grad_norm": 0.35949820280075073, "learning_rate": 4.516573109973656e-05, "loss": 0.0654, "step": 9650 }, { "epoch": 2.527576054955839, "grad_norm": 0.5386552214622498, "learning_rate": 4.5153508007727145e-05, "loss": 0.0571, "step": 9660 }, { "epoch": 2.530192999672882, "grad_norm": 0.6097580790519714, "learning_rate": 4.5141271140985044e-05, "loss": 0.0578, "step": 9670 }, { "epoch": 2.532809944389925, "grad_norm": 0.48718369007110596, "learning_rate": 4.512902050787404e-05, "loss": 0.058, "step": 9680 }, { "epoch": 2.535426889106968, "grad_norm": 0.49066928029060364, "learning_rate": 4.5116756116767315e-05, "loss": 0.059, "step": 9690 }, { "epoch": 2.5380438338240103, "grad_norm": 0.5355244874954224, "learning_rate": 4.510447797604749e-05, "loss": 0.0644, "step": 9700 }, { "epoch": 2.5406607785410533, "grad_norm": 0.3920019268989563, "learning_rate": 4.509218609410652e-05, "loss": 0.0604, "step": 9710 }, { "epoch": 2.5432777232580963, "grad_norm": 0.5329289436340332, "learning_rate": 4.507988047934583e-05, "loss": 0.0576, "step": 9720 }, { "epoch": 2.545894667975139, "grad_norm": 0.5136492252349854, "learning_rate": 4.5067561140176176e-05, "loss": 0.0588, "step": 9730 }, { "epoch": 2.548511612692182, "grad_norm": 0.6208662390708923, "learning_rate": 4.50552280850177e-05, "loss": 0.0664, "step": 9740 }, { "epoch": 2.551128557409225, "grad_norm": 0.695162832736969, "learning_rate": 4.5042881322299936e-05, "loss": 0.0621, "step": 9750 }, { "epoch": 2.553745502126268, "grad_norm": 0.9550657868385315, "learning_rate": 4.5030520860461784e-05, "loss": 0.0622, "step": 9760 }, { "epoch": 2.5563624468433104, "grad_norm": 0.6386085152626038, "learning_rate": 4.50181467079515e-05, "loss": 0.0595, "step": 9770 }, { "epoch": 2.5589793915603534, "grad_norm": 0.4597232937812805, "learning_rate": 4.50057588732267e-05, "loss": 0.0606, "step": 9780 }, { "epoch": 2.561596336277396, "grad_norm": 0.6518101692199707, "learning_rate": 4.499335736475436e-05, "loss": 0.065, "step": 9790 }, { "epoch": 2.564213280994439, "grad_norm": 0.5159201622009277, "learning_rate": 4.498094219101078e-05, "loss": 0.0539, "step": 9800 }, { "epoch": 2.566830225711482, "grad_norm": 0.4551909565925598, "learning_rate": 4.496851336048162e-05, "loss": 0.0544, "step": 9810 }, { "epoch": 2.569447170428525, "grad_norm": 0.6590986847877502, "learning_rate": 4.495607088166188e-05, "loss": 0.0581, "step": 9820 }, { "epoch": 2.5720641151455674, "grad_norm": 0.616459310054779, "learning_rate": 4.494361476305586e-05, "loss": 0.056, "step": 9830 }, { "epoch": 2.5746810598626104, "grad_norm": 0.4650562107563019, "learning_rate": 4.493114501317721e-05, "loss": 0.0632, "step": 9840 }, { "epoch": 2.5772980045796534, "grad_norm": 0.44526204466819763, "learning_rate": 4.4918661640548874e-05, "loss": 0.0559, "step": 9850 }, { "epoch": 2.579914949296696, "grad_norm": 0.5463991165161133, "learning_rate": 4.4906164653703134e-05, "loss": 0.0629, "step": 9860 }, { "epoch": 2.582531894013739, "grad_norm": 0.8113076686859131, "learning_rate": 4.4893654061181563e-05, "loss": 0.0574, "step": 9870 }, { "epoch": 2.585148838730782, "grad_norm": 0.7694985866546631, "learning_rate": 4.488112987153502e-05, "loss": 0.0662, "step": 9880 }, { "epoch": 2.587765783447825, "grad_norm": 0.490842342376709, "learning_rate": 4.486859209332368e-05, "loss": 0.0676, "step": 9890 }, { "epoch": 2.5903827281648675, "grad_norm": 0.6306349039077759, "learning_rate": 4.4856040735116986e-05, "loss": 0.0585, "step": 9900 }, { "epoch": 2.5929996728819105, "grad_norm": 0.8197053670883179, "learning_rate": 4.4843475805493696e-05, "loss": 0.0598, "step": 9910 }, { "epoch": 2.595616617598953, "grad_norm": 0.6163144707679749, "learning_rate": 4.48308973130418e-05, "loss": 0.0636, "step": 9920 }, { "epoch": 2.598233562315996, "grad_norm": 0.48077651858329773, "learning_rate": 4.481830526635858e-05, "loss": 0.0681, "step": 9930 }, { "epoch": 2.600850507033039, "grad_norm": 0.580900251865387, "learning_rate": 4.4805699674050585e-05, "loss": 0.0636, "step": 9940 }, { "epoch": 2.603467451750082, "grad_norm": 0.7182990312576294, "learning_rate": 4.4793080544733626e-05, "loss": 0.0622, "step": 9950 }, { "epoch": 2.6060843964671245, "grad_norm": 0.6701587438583374, "learning_rate": 4.478044788703275e-05, "loss": 0.056, "step": 9960 }, { "epoch": 2.6087013411841675, "grad_norm": 0.5928230881690979, "learning_rate": 4.476780170958226e-05, "loss": 0.0538, "step": 9970 }, { "epoch": 2.61131828590121, "grad_norm": 0.6294201016426086, "learning_rate": 4.47551420210257e-05, "loss": 0.0613, "step": 9980 }, { "epoch": 2.613935230618253, "grad_norm": 0.5394532680511475, "learning_rate": 4.474246883001585e-05, "loss": 0.0596, "step": 9990 }, { "epoch": 2.616552175335296, "grad_norm": 0.7629094123840332, "learning_rate": 4.4729782145214716e-05, "loss": 0.0631, "step": 10000 }, { "epoch": 2.616552175335296, "eval_loss": 0.06285145155637001, "eval_runtime": 8.6089, "eval_samples_per_second": 118.947, "eval_steps_per_second": 1.859, "step": 10000 }, { "epoch": 2.619169120052339, "grad_norm": 0.4276679456233978, "learning_rate": 4.471708197529352e-05, "loss": 0.0603, "step": 10010 }, { "epoch": 2.6217860647693816, "grad_norm": 0.862521231174469, "learning_rate": 4.470436832893272e-05, "loss": 0.0584, "step": 10020 }, { "epoch": 2.6244030094864246, "grad_norm": 0.8724349141120911, "learning_rate": 4.469164121482197e-05, "loss": 0.0618, "step": 10030 }, { "epoch": 2.6270199542034676, "grad_norm": 0.4335499107837677, "learning_rate": 4.467890064166013e-05, "loss": 0.0577, "step": 10040 }, { "epoch": 2.62963689892051, "grad_norm": 0.7104797959327698, "learning_rate": 4.466614661815526e-05, "loss": 0.0599, "step": 10050 }, { "epoch": 2.632253843637553, "grad_norm": 0.6808757781982422, "learning_rate": 4.4653379153024624e-05, "loss": 0.0546, "step": 10060 }, { "epoch": 2.634870788354596, "grad_norm": 0.3997107744216919, "learning_rate": 4.464059825499465e-05, "loss": 0.0579, "step": 10070 }, { "epoch": 2.637487733071639, "grad_norm": 0.6047708988189697, "learning_rate": 4.462780393280097e-05, "loss": 0.0648, "step": 10080 }, { "epoch": 2.6401046777886816, "grad_norm": 0.5319182276725769, "learning_rate": 4.461499619518838e-05, "loss": 0.0609, "step": 10090 }, { "epoch": 2.6427216225057246, "grad_norm": 0.6906712651252747, "learning_rate": 4.460217505091086e-05, "loss": 0.0565, "step": 10100 }, { "epoch": 2.645338567222767, "grad_norm": 0.6555960178375244, "learning_rate": 4.458934050873151e-05, "loss": 0.0641, "step": 10110 }, { "epoch": 2.64795551193981, "grad_norm": 0.41678696870803833, "learning_rate": 4.457649257742265e-05, "loss": 0.0603, "step": 10120 }, { "epoch": 2.650572456656853, "grad_norm": 0.843837320804596, "learning_rate": 4.456363126576571e-05, "loss": 0.0594, "step": 10130 }, { "epoch": 2.653189401373896, "grad_norm": 0.5070556402206421, "learning_rate": 4.4550756582551273e-05, "loss": 0.0552, "step": 10140 }, { "epoch": 2.6558063460909387, "grad_norm": 0.6374284029006958, "learning_rate": 4.453786853657907e-05, "loss": 0.0566, "step": 10150 }, { "epoch": 2.6584232908079817, "grad_norm": 0.6199434399604797, "learning_rate": 4.452496713665794e-05, "loss": 0.0529, "step": 10160 }, { "epoch": 2.6610402355250247, "grad_norm": 0.8124728202819824, "learning_rate": 4.451205239160588e-05, "loss": 0.0639, "step": 10170 }, { "epoch": 2.663657180242067, "grad_norm": 0.5821810960769653, "learning_rate": 4.449912431025001e-05, "loss": 0.0637, "step": 10180 }, { "epoch": 2.66627412495911, "grad_norm": 0.4941767454147339, "learning_rate": 4.448618290142654e-05, "loss": 0.0539, "step": 10190 }, { "epoch": 2.668891069676153, "grad_norm": 0.7200390100479126, "learning_rate": 4.4473228173980794e-05, "loss": 0.0573, "step": 10200 }, { "epoch": 2.671508014393196, "grad_norm": 0.4795537292957306, "learning_rate": 4.446026013676722e-05, "loss": 0.0549, "step": 10210 }, { "epoch": 2.6741249591102387, "grad_norm": 0.5811921954154968, "learning_rate": 4.444727879864933e-05, "loss": 0.0588, "step": 10220 }, { "epoch": 2.6767419038272817, "grad_norm": 0.48299723863601685, "learning_rate": 4.4434284168499775e-05, "loss": 0.0538, "step": 10230 }, { "epoch": 2.6793588485443243, "grad_norm": 0.4856926500797272, "learning_rate": 4.442127625520023e-05, "loss": 0.0551, "step": 10240 }, { "epoch": 2.6819757932613673, "grad_norm": 0.5260676741600037, "learning_rate": 4.44082550676415e-05, "loss": 0.0543, "step": 10250 }, { "epoch": 2.6845927379784102, "grad_norm": 0.623572826385498, "learning_rate": 4.439522061472344e-05, "loss": 0.0596, "step": 10260 }, { "epoch": 2.6872096826954532, "grad_norm": 0.5810160636901855, "learning_rate": 4.438217290535498e-05, "loss": 0.06, "step": 10270 }, { "epoch": 2.689826627412496, "grad_norm": 0.6521799564361572, "learning_rate": 4.43691119484541e-05, "loss": 0.06, "step": 10280 }, { "epoch": 2.6924435721295388, "grad_norm": 0.5883389115333557, "learning_rate": 4.435603775294784e-05, "loss": 0.0642, "step": 10290 }, { "epoch": 2.6950605168465818, "grad_norm": 0.5264142751693726, "learning_rate": 4.434295032777229e-05, "loss": 0.0598, "step": 10300 }, { "epoch": 2.6976774615636243, "grad_norm": 0.728809118270874, "learning_rate": 4.432984968187259e-05, "loss": 0.0642, "step": 10310 }, { "epoch": 2.7002944062806673, "grad_norm": 0.4668984115123749, "learning_rate": 4.431673582420291e-05, "loss": 0.0506, "step": 10320 }, { "epoch": 2.7029113509977103, "grad_norm": 0.6090394854545593, "learning_rate": 4.4303608763726426e-05, "loss": 0.059, "step": 10330 }, { "epoch": 2.7055282957147533, "grad_norm": 0.399859756231308, "learning_rate": 4.4290468509415384e-05, "loss": 0.0584, "step": 10340 }, { "epoch": 2.708145240431796, "grad_norm": 0.4588572680950165, "learning_rate": 4.4277315070251e-05, "loss": 0.0568, "step": 10350 }, { "epoch": 2.710762185148839, "grad_norm": 0.5472003221511841, "learning_rate": 4.426414845522355e-05, "loss": 0.0596, "step": 10360 }, { "epoch": 2.7133791298658814, "grad_norm": 0.6493082642555237, "learning_rate": 4.425096867333228e-05, "loss": 0.0611, "step": 10370 }, { "epoch": 2.7159960745829244, "grad_norm": 0.7279231548309326, "learning_rate": 4.423777573358545e-05, "loss": 0.0505, "step": 10380 }, { "epoch": 2.7186130192999673, "grad_norm": 0.665616512298584, "learning_rate": 4.42245696450003e-05, "loss": 0.0652, "step": 10390 }, { "epoch": 2.7212299640170103, "grad_norm": 0.5311397314071655, "learning_rate": 4.4211350416603084e-05, "loss": 0.0617, "step": 10400 }, { "epoch": 2.723846908734053, "grad_norm": 0.6540460586547852, "learning_rate": 4.4198118057429005e-05, "loss": 0.0576, "step": 10410 }, { "epoch": 2.726463853451096, "grad_norm": 0.6157358288764954, "learning_rate": 4.4184872576522263e-05, "loss": 0.0589, "step": 10420 }, { "epoch": 2.7290807981681384, "grad_norm": 0.44291484355926514, "learning_rate": 4.417161398293602e-05, "loss": 0.0584, "step": 10430 }, { "epoch": 2.7316977428851814, "grad_norm": 0.6388154625892639, "learning_rate": 4.415834228573239e-05, "loss": 0.053, "step": 10440 }, { "epoch": 2.7343146876022244, "grad_norm": 0.4673249125480652, "learning_rate": 4.414505749398247e-05, "loss": 0.0585, "step": 10450 }, { "epoch": 2.7369316323192674, "grad_norm": 0.6346825957298279, "learning_rate": 4.4131759616766266e-05, "loss": 0.0576, "step": 10460 }, { "epoch": 2.7395485770363104, "grad_norm": 0.41277366876602173, "learning_rate": 4.4118448663172776e-05, "loss": 0.0588, "step": 10470 }, { "epoch": 2.742165521753353, "grad_norm": 0.7207258343696594, "learning_rate": 4.41051246422999e-05, "loss": 0.0572, "step": 10480 }, { "epoch": 2.744782466470396, "grad_norm": 0.6065157651901245, "learning_rate": 4.409178756325448e-05, "loss": 0.0641, "step": 10490 }, { "epoch": 2.7473994111874385, "grad_norm": 0.6917155981063843, "learning_rate": 4.407843743515229e-05, "loss": 0.0648, "step": 10500 }, { "epoch": 2.7500163559044815, "grad_norm": 0.5036665201187134, "learning_rate": 4.4065074267118e-05, "loss": 0.054, "step": 10510 }, { "epoch": 2.7526333006215244, "grad_norm": 0.6457655429840088, "learning_rate": 4.405169806828523e-05, "loss": 0.0515, "step": 10520 }, { "epoch": 2.7552502453385674, "grad_norm": 0.4316859245300293, "learning_rate": 4.403830884779647e-05, "loss": 0.0609, "step": 10530 }, { "epoch": 2.75786719005561, "grad_norm": 0.519899308681488, "learning_rate": 4.402490661480314e-05, "loss": 0.061, "step": 10540 }, { "epoch": 2.760484134772653, "grad_norm": 0.815555989742279, "learning_rate": 4.401149137846553e-05, "loss": 0.06, "step": 10550 }, { "epoch": 2.7631010794896955, "grad_norm": 0.5091633796691895, "learning_rate": 4.399806314795284e-05, "loss": 0.0583, "step": 10560 }, { "epoch": 2.7657180242067385, "grad_norm": 0.4408508241176605, "learning_rate": 4.398462193244312e-05, "loss": 0.0581, "step": 10570 }, { "epoch": 2.7683349689237815, "grad_norm": 0.5117101669311523, "learning_rate": 4.397116774112333e-05, "loss": 0.0588, "step": 10580 }, { "epoch": 2.7709519136408245, "grad_norm": 0.5849635601043701, "learning_rate": 4.3957700583189266e-05, "loss": 0.0553, "step": 10590 }, { "epoch": 2.773568858357867, "grad_norm": 0.6975322961807251, "learning_rate": 4.394422046784562e-05, "loss": 0.0537, "step": 10600 }, { "epoch": 2.77618580307491, "grad_norm": 0.3537542223930359, "learning_rate": 4.393072740430592e-05, "loss": 0.0565, "step": 10610 }, { "epoch": 2.778802747791953, "grad_norm": 0.5385840535163879, "learning_rate": 4.3917221401792536e-05, "loss": 0.0598, "step": 10620 }, { "epoch": 2.7814196925089956, "grad_norm": 0.4339829981327057, "learning_rate": 4.390370246953671e-05, "loss": 0.0601, "step": 10630 }, { "epoch": 2.7840366372260386, "grad_norm": 0.37706896662712097, "learning_rate": 4.389017061677849e-05, "loss": 0.0656, "step": 10640 }, { "epoch": 2.7866535819430815, "grad_norm": 0.6400270462036133, "learning_rate": 4.3876625852766785e-05, "loss": 0.062, "step": 10650 }, { "epoch": 2.7892705266601245, "grad_norm": 0.3444439172744751, "learning_rate": 4.38630681867593e-05, "loss": 0.0529, "step": 10660 }, { "epoch": 2.791887471377167, "grad_norm": 0.5224131345748901, "learning_rate": 4.384949762802258e-05, "loss": 0.056, "step": 10670 }, { "epoch": 2.79450441609421, "grad_norm": 0.5811564922332764, "learning_rate": 4.3835914185831985e-05, "loss": 0.0634, "step": 10680 }, { "epoch": 2.7971213608112526, "grad_norm": 0.5171063542366028, "learning_rate": 4.382231786947164e-05, "loss": 0.0592, "step": 10690 }, { "epoch": 2.7997383055282956, "grad_norm": 0.5354034900665283, "learning_rate": 4.380870868823451e-05, "loss": 0.0662, "step": 10700 }, { "epoch": 2.8023552502453386, "grad_norm": 0.3287551999092102, "learning_rate": 4.3795086651422355e-05, "loss": 0.0565, "step": 10710 }, { "epoch": 2.8049721949623816, "grad_norm": 0.6036696434020996, "learning_rate": 4.378145176834571e-05, "loss": 0.061, "step": 10720 }, { "epoch": 2.807589139679424, "grad_norm": 0.5706713199615479, "learning_rate": 4.376780404832387e-05, "loss": 0.0627, "step": 10730 }, { "epoch": 2.810206084396467, "grad_norm": 0.6217731833457947, "learning_rate": 4.375414350068493e-05, "loss": 0.0586, "step": 10740 }, { "epoch": 2.81282302911351, "grad_norm": 0.4816734790802002, "learning_rate": 4.374047013476575e-05, "loss": 0.0606, "step": 10750 }, { "epoch": 2.8154399738305527, "grad_norm": 0.4744681417942047, "learning_rate": 4.3726783959911956e-05, "loss": 0.0568, "step": 10760 }, { "epoch": 2.8180569185475957, "grad_norm": 0.49870118498802185, "learning_rate": 4.371308498547789e-05, "loss": 0.0543, "step": 10770 }, { "epoch": 2.8206738632646386, "grad_norm": 0.5821275115013123, "learning_rate": 4.3699373220826704e-05, "loss": 0.0542, "step": 10780 }, { "epoch": 2.8232908079816816, "grad_norm": 0.6296197772026062, "learning_rate": 4.368564867533024e-05, "loss": 0.0544, "step": 10790 }, { "epoch": 2.825907752698724, "grad_norm": 0.5080370903015137, "learning_rate": 4.3671911358369104e-05, "loss": 0.0633, "step": 10800 }, { "epoch": 2.828524697415767, "grad_norm": 0.46273985505104065, "learning_rate": 4.365816127933262e-05, "loss": 0.0594, "step": 10810 }, { "epoch": 2.8311416421328097, "grad_norm": 0.5995994806289673, "learning_rate": 4.3644398447618836e-05, "loss": 0.0541, "step": 10820 }, { "epoch": 2.8337585868498527, "grad_norm": 0.4902913272380829, "learning_rate": 4.363062287263453e-05, "loss": 0.055, "step": 10830 }, { "epoch": 2.8363755315668957, "grad_norm": 0.5302367210388184, "learning_rate": 4.361683456379515e-05, "loss": 0.0585, "step": 10840 }, { "epoch": 2.8389924762839387, "grad_norm": 0.49508044123649597, "learning_rate": 4.3603033530524896e-05, "loss": 0.0583, "step": 10850 }, { "epoch": 2.8416094210009812, "grad_norm": 0.4201105535030365, "learning_rate": 4.358921978225665e-05, "loss": 0.0611, "step": 10860 }, { "epoch": 2.8442263657180242, "grad_norm": 0.5711117386817932, "learning_rate": 4.357539332843196e-05, "loss": 0.0549, "step": 10870 }, { "epoch": 2.846843310435067, "grad_norm": 0.6540241837501526, "learning_rate": 4.356155417850109e-05, "loss": 0.0569, "step": 10880 }, { "epoch": 2.8494602551521098, "grad_norm": 0.5406968593597412, "learning_rate": 4.354770234192296e-05, "loss": 0.0527, "step": 10890 }, { "epoch": 2.8520771998691528, "grad_norm": 0.4803619384765625, "learning_rate": 4.353383782816517e-05, "loss": 0.056, "step": 10900 }, { "epoch": 2.8546941445861957, "grad_norm": 0.530622124671936, "learning_rate": 4.3519960646704e-05, "loss": 0.0558, "step": 10910 }, { "epoch": 2.8573110893032387, "grad_norm": 0.40140339732170105, "learning_rate": 4.350607080702435e-05, "loss": 0.0552, "step": 10920 }, { "epoch": 2.8599280340202813, "grad_norm": 0.45789000391960144, "learning_rate": 4.349216831861981e-05, "loss": 0.0607, "step": 10930 }, { "epoch": 2.8625449787373243, "grad_norm": 0.6772827506065369, "learning_rate": 4.347825319099259e-05, "loss": 0.0627, "step": 10940 }, { "epoch": 2.865161923454367, "grad_norm": 0.5811320543289185, "learning_rate": 4.3464325433653566e-05, "loss": 0.0573, "step": 10950 }, { "epoch": 2.86777886817141, "grad_norm": 0.5410882830619812, "learning_rate": 4.34503850561222e-05, "loss": 0.0563, "step": 10960 }, { "epoch": 2.870395812888453, "grad_norm": 0.7019317746162415, "learning_rate": 4.343643206792664e-05, "loss": 0.0677, "step": 10970 }, { "epoch": 2.873012757605496, "grad_norm": 0.5208338499069214, "learning_rate": 4.3422466478603593e-05, "loss": 0.0569, "step": 10980 }, { "epoch": 2.8756297023225383, "grad_norm": 0.49888139963150024, "learning_rate": 4.340848829769843e-05, "loss": 0.0571, "step": 10990 }, { "epoch": 2.8782466470395813, "grad_norm": 0.5914691090583801, "learning_rate": 4.3394497534765094e-05, "loss": 0.0604, "step": 11000 }, { "epoch": 2.8782466470395813, "eval_loss": 0.06439468686764342, "eval_runtime": 8.5706, "eval_samples_per_second": 119.478, "eval_steps_per_second": 1.867, "step": 11000 }, { "epoch": 2.880863591756624, "grad_norm": 0.5112311840057373, "learning_rate": 4.338049419936614e-05, "loss": 0.0585, "step": 11010 }, { "epoch": 2.883480536473667, "grad_norm": 0.47918373346328735, "learning_rate": 4.3366478301072723e-05, "loss": 0.0566, "step": 11020 }, { "epoch": 2.88609748119071, "grad_norm": 0.5077507495880127, "learning_rate": 4.335244984946457e-05, "loss": 0.0613, "step": 11030 }, { "epoch": 2.888714425907753, "grad_norm": 0.5855154991149902, "learning_rate": 4.333840885413e-05, "loss": 0.0545, "step": 11040 }, { "epoch": 2.8913313706247954, "grad_norm": 0.5740836262702942, "learning_rate": 4.33243553246659e-05, "loss": 0.0597, "step": 11050 }, { "epoch": 2.8939483153418384, "grad_norm": 0.6112914681434631, "learning_rate": 4.331028927067772e-05, "loss": 0.0578, "step": 11060 }, { "epoch": 2.8965652600588814, "grad_norm": 0.48574313521385193, "learning_rate": 4.329621070177948e-05, "loss": 0.0555, "step": 11070 }, { "epoch": 2.899182204775924, "grad_norm": 0.8778485059738159, "learning_rate": 4.328211962759375e-05, "loss": 0.0598, "step": 11080 }, { "epoch": 2.901799149492967, "grad_norm": 0.6654403805732727, "learning_rate": 4.326801605775165e-05, "loss": 0.0551, "step": 11090 }, { "epoch": 2.90441609421001, "grad_norm": 0.4592023491859436, "learning_rate": 4.325390000189283e-05, "loss": 0.0535, "step": 11100 }, { "epoch": 2.907033038927053, "grad_norm": 0.7717892527580261, "learning_rate": 4.323977146966548e-05, "loss": 0.0584, "step": 11110 }, { "epoch": 2.9096499836440954, "grad_norm": 0.7081303596496582, "learning_rate": 4.322563047072632e-05, "loss": 0.0559, "step": 11120 }, { "epoch": 2.9122669283611384, "grad_norm": 0.6562286019325256, "learning_rate": 4.3211477014740584e-05, "loss": 0.0654, "step": 11130 }, { "epoch": 2.914883873078181, "grad_norm": 0.5132052898406982, "learning_rate": 4.3197311111382045e-05, "loss": 0.0564, "step": 11140 }, { "epoch": 2.917500817795224, "grad_norm": 0.522114634513855, "learning_rate": 4.3183132770332946e-05, "loss": 0.0488, "step": 11150 }, { "epoch": 2.920117762512267, "grad_norm": 0.3157167434692383, "learning_rate": 4.3168942001284055e-05, "loss": 0.0593, "step": 11160 }, { "epoch": 2.92273470722931, "grad_norm": 0.49267107248306274, "learning_rate": 4.315473881393463e-05, "loss": 0.0635, "step": 11170 }, { "epoch": 2.9253516519463525, "grad_norm": 0.5157380104064941, "learning_rate": 4.3140523217992414e-05, "loss": 0.0551, "step": 11180 }, { "epoch": 2.9279685966633955, "grad_norm": 0.3554172217845917, "learning_rate": 4.312629522317363e-05, "loss": 0.066, "step": 11190 }, { "epoch": 2.9305855413804385, "grad_norm": 0.4762328267097473, "learning_rate": 4.3112054839202986e-05, "loss": 0.0577, "step": 11200 }, { "epoch": 2.933202486097481, "grad_norm": 0.7763202786445618, "learning_rate": 4.3097802075813655e-05, "loss": 0.0635, "step": 11210 }, { "epoch": 2.935819430814524, "grad_norm": 0.6447554230690002, "learning_rate": 4.308353694274724e-05, "loss": 0.0549, "step": 11220 }, { "epoch": 2.938436375531567, "grad_norm": 0.4910326600074768, "learning_rate": 4.3069259449753853e-05, "loss": 0.0599, "step": 11230 }, { "epoch": 2.94105332024861, "grad_norm": 0.8897386193275452, "learning_rate": 4.305496960659201e-05, "loss": 0.0593, "step": 11240 }, { "epoch": 2.9436702649656525, "grad_norm": 0.8142090439796448, "learning_rate": 4.304066742302869e-05, "loss": 0.0615, "step": 11250 }, { "epoch": 2.9462872096826955, "grad_norm": 0.49429208040237427, "learning_rate": 4.3026352908839295e-05, "loss": 0.0567, "step": 11260 }, { "epoch": 2.948904154399738, "grad_norm": 0.39475226402282715, "learning_rate": 4.301202607380768e-05, "loss": 0.0585, "step": 11270 }, { "epoch": 2.951521099116781, "grad_norm": 0.7416360378265381, "learning_rate": 4.2997686927726075e-05, "loss": 0.0539, "step": 11280 }, { "epoch": 2.954138043833824, "grad_norm": 0.6084833145141602, "learning_rate": 4.298333548039516e-05, "loss": 0.0535, "step": 11290 }, { "epoch": 2.956754988550867, "grad_norm": 0.484015554189682, "learning_rate": 4.296897174162403e-05, "loss": 0.0526, "step": 11300 }, { "epoch": 2.9593719332679096, "grad_norm": 0.5022487640380859, "learning_rate": 4.295459572123014e-05, "loss": 0.0573, "step": 11310 }, { "epoch": 2.9619888779849526, "grad_norm": 0.638661801815033, "learning_rate": 4.294020742903938e-05, "loss": 0.0573, "step": 11320 }, { "epoch": 2.9646058227019956, "grad_norm": 0.3423210084438324, "learning_rate": 4.292580687488601e-05, "loss": 0.0507, "step": 11330 }, { "epoch": 2.967222767419038, "grad_norm": 0.6734946370124817, "learning_rate": 4.2911394068612665e-05, "loss": 0.0567, "step": 11340 }, { "epoch": 2.969839712136081, "grad_norm": 0.5600636005401611, "learning_rate": 4.289696902007038e-05, "loss": 0.0618, "step": 11350 }, { "epoch": 2.972456656853124, "grad_norm": 0.6957277059555054, "learning_rate": 4.288253173911852e-05, "loss": 0.0641, "step": 11360 }, { "epoch": 2.975073601570167, "grad_norm": 0.4874526262283325, "learning_rate": 4.286808223562484e-05, "loss": 0.0617, "step": 11370 }, { "epoch": 2.9776905462872096, "grad_norm": 0.6502581834793091, "learning_rate": 4.285362051946543e-05, "loss": 0.0573, "step": 11380 }, { "epoch": 2.9803074910042526, "grad_norm": 0.5507333278656006, "learning_rate": 4.283914660052476e-05, "loss": 0.0575, "step": 11390 }, { "epoch": 2.982924435721295, "grad_norm": 0.6331663131713867, "learning_rate": 4.282466048869559e-05, "loss": 0.0629, "step": 11400 }, { "epoch": 2.985541380438338, "grad_norm": 0.4715561866760254, "learning_rate": 4.2810162193879053e-05, "loss": 0.059, "step": 11410 }, { "epoch": 2.988158325155381, "grad_norm": 0.664364755153656, "learning_rate": 4.279565172598461e-05, "loss": 0.0556, "step": 11420 }, { "epoch": 2.990775269872424, "grad_norm": 0.7428573966026306, "learning_rate": 4.278112909493e-05, "loss": 0.0562, "step": 11430 }, { "epoch": 2.9933922145894667, "grad_norm": 0.630793035030365, "learning_rate": 4.2766594310641326e-05, "loss": 0.0554, "step": 11440 }, { "epoch": 2.9960091593065097, "grad_norm": 0.6804364323616028, "learning_rate": 4.2752047383052966e-05, "loss": 0.056, "step": 11450 }, { "epoch": 2.9986261040235522, "grad_norm": 0.5913823843002319, "learning_rate": 4.273748832210761e-05, "loss": 0.0594, "step": 11460 }, { "epoch": 3.0010467778868173, "grad_norm": 0.6175287961959839, "learning_rate": 4.2722917137756245e-05, "loss": 0.0534, "step": 11470 }, { "epoch": 3.00366372260386, "grad_norm": 0.5993890762329102, "learning_rate": 4.270833383995814e-05, "loss": 0.0568, "step": 11480 }, { "epoch": 3.006280667320903, "grad_norm": 0.5144372582435608, "learning_rate": 4.269373843868083e-05, "loss": 0.0542, "step": 11490 }, { "epoch": 3.008897612037946, "grad_norm": 0.5794318914413452, "learning_rate": 4.267913094390013e-05, "loss": 0.0562, "step": 11500 }, { "epoch": 3.0115145567549884, "grad_norm": 0.7506561875343323, "learning_rate": 4.266451136560014e-05, "loss": 0.0594, "step": 11510 }, { "epoch": 3.0141315014720313, "grad_norm": 0.5357182621955872, "learning_rate": 4.26498797137732e-05, "loss": 0.0607, "step": 11520 }, { "epoch": 3.0167484461890743, "grad_norm": 0.669437050819397, "learning_rate": 4.26352359984199e-05, "loss": 0.0658, "step": 11530 }, { "epoch": 3.019365390906117, "grad_norm": 0.8581656813621521, "learning_rate": 4.262058022954909e-05, "loss": 0.0651, "step": 11540 }, { "epoch": 3.02198233562316, "grad_norm": 0.49288684129714966, "learning_rate": 4.2605912417177846e-05, "loss": 0.0598, "step": 11550 }, { "epoch": 3.024599280340203, "grad_norm": 0.5953378081321716, "learning_rate": 4.2591232571331476e-05, "loss": 0.0667, "step": 11560 }, { "epoch": 3.027216225057246, "grad_norm": 0.6922196745872498, "learning_rate": 4.2576540702043516e-05, "loss": 0.0542, "step": 11570 }, { "epoch": 3.0298331697742884, "grad_norm": 0.6142755150794983, "learning_rate": 4.256183681935573e-05, "loss": 0.0606, "step": 11580 }, { "epoch": 3.0324501144913314, "grad_norm": 0.6295666694641113, "learning_rate": 4.254712093331807e-05, "loss": 0.0582, "step": 11590 }, { "epoch": 3.0350670592083744, "grad_norm": 0.6455519199371338, "learning_rate": 4.2532393053988715e-05, "loss": 0.0546, "step": 11600 }, { "epoch": 3.037684003925417, "grad_norm": 0.5940701365470886, "learning_rate": 4.2517653191434026e-05, "loss": 0.0621, "step": 11610 }, { "epoch": 3.04030094864246, "grad_norm": 0.5957797765731812, "learning_rate": 4.250290135572856e-05, "loss": 0.0633, "step": 11620 }, { "epoch": 3.042917893359503, "grad_norm": 0.794637143611908, "learning_rate": 4.248813755695507e-05, "loss": 0.0607, "step": 11630 }, { "epoch": 3.0455348380765455, "grad_norm": 0.8148157596588135, "learning_rate": 4.2473361805204453e-05, "loss": 0.0671, "step": 11640 }, { "epoch": 3.0481517827935884, "grad_norm": 0.6912362575531006, "learning_rate": 4.245857411057581e-05, "loss": 0.0609, "step": 11650 }, { "epoch": 3.0507687275106314, "grad_norm": 0.5286144018173218, "learning_rate": 4.244377448317638e-05, "loss": 0.058, "step": 11660 }, { "epoch": 3.053385672227674, "grad_norm": 0.5043061375617981, "learning_rate": 4.242896293312159e-05, "loss": 0.0573, "step": 11670 }, { "epoch": 3.056002616944717, "grad_norm": 0.49938568472862244, "learning_rate": 4.2414139470534965e-05, "loss": 0.0585, "step": 11680 }, { "epoch": 3.05861956166176, "grad_norm": 0.5215581655502319, "learning_rate": 4.239930410554823e-05, "loss": 0.0549, "step": 11690 }, { "epoch": 3.061236506378803, "grad_norm": 0.8619711995124817, "learning_rate": 4.238445684830119e-05, "loss": 0.0628, "step": 11700 }, { "epoch": 3.0638534510958455, "grad_norm": 0.4214211106300354, "learning_rate": 4.236959770894183e-05, "loss": 0.0566, "step": 11710 }, { "epoch": 3.0664703958128885, "grad_norm": 0.7512931823730469, "learning_rate": 4.235472669762622e-05, "loss": 0.0522, "step": 11720 }, { "epoch": 3.0690873405299315, "grad_norm": 0.6468842029571533, "learning_rate": 4.233984382451856e-05, "loss": 0.0599, "step": 11730 }, { "epoch": 3.071704285246974, "grad_norm": 0.719258725643158, "learning_rate": 4.232494909979115e-05, "loss": 0.0584, "step": 11740 }, { "epoch": 3.074321229964017, "grad_norm": 0.9053614139556885, "learning_rate": 4.2310042533624395e-05, "loss": 0.0646, "step": 11750 }, { "epoch": 3.07693817468106, "grad_norm": 0.8233187794685364, "learning_rate": 4.2295124136206794e-05, "loss": 0.0572, "step": 11760 }, { "epoch": 3.0795551193981026, "grad_norm": 0.6152809262275696, "learning_rate": 4.2280193917734926e-05, "loss": 0.058, "step": 11770 }, { "epoch": 3.0821720641151455, "grad_norm": 0.5993651747703552, "learning_rate": 4.226525188841346e-05, "loss": 0.0512, "step": 11780 }, { "epoch": 3.0847890088321885, "grad_norm": 0.4626370370388031, "learning_rate": 4.225029805845513e-05, "loss": 0.0548, "step": 11790 }, { "epoch": 3.087405953549231, "grad_norm": 0.6150082945823669, "learning_rate": 4.223533243808073e-05, "loss": 0.064, "step": 11800 }, { "epoch": 3.090022898266274, "grad_norm": 0.3250552713871002, "learning_rate": 4.222035503751913e-05, "loss": 0.0595, "step": 11810 }, { "epoch": 3.092639842983317, "grad_norm": 0.6475739479064941, "learning_rate": 4.220536586700724e-05, "loss": 0.0565, "step": 11820 }, { "epoch": 3.09525678770036, "grad_norm": 0.5960320234298706, "learning_rate": 4.219036493679003e-05, "loss": 0.0668, "step": 11830 }, { "epoch": 3.0978737324174026, "grad_norm": 0.8178985118865967, "learning_rate": 4.217535225712047e-05, "loss": 0.0617, "step": 11840 }, { "epoch": 3.1004906771344456, "grad_norm": 0.4452805519104004, "learning_rate": 4.2160327838259594e-05, "loss": 0.0649, "step": 11850 }, { "epoch": 3.1031076218514886, "grad_norm": 0.5689908266067505, "learning_rate": 4.214529169047646e-05, "loss": 0.0508, "step": 11860 }, { "epoch": 3.105724566568531, "grad_norm": 0.4972374737262726, "learning_rate": 4.213024382404812e-05, "loss": 0.0585, "step": 11870 }, { "epoch": 3.108341511285574, "grad_norm": 0.3974771201610565, "learning_rate": 4.211518424925966e-05, "loss": 0.0584, "step": 11880 }, { "epoch": 3.110958456002617, "grad_norm": 0.6321877241134644, "learning_rate": 4.210011297640415e-05, "loss": 0.0571, "step": 11890 }, { "epoch": 3.1135754007196597, "grad_norm": 0.7349003553390503, "learning_rate": 4.208503001578266e-05, "loss": 0.0628, "step": 11900 }, { "epoch": 3.1161923454367026, "grad_norm": 0.6462802886962891, "learning_rate": 4.206993537770426e-05, "loss": 0.0591, "step": 11910 }, { "epoch": 3.1188092901537456, "grad_norm": 0.6877278685569763, "learning_rate": 4.2054829072486e-05, "loss": 0.0595, "step": 11920 }, { "epoch": 3.121426234870788, "grad_norm": 0.6100813746452332, "learning_rate": 4.2039711110452866e-05, "loss": 0.0656, "step": 11930 }, { "epoch": 3.124043179587831, "grad_norm": 0.4432187080383301, "learning_rate": 4.202458150193788e-05, "loss": 0.065, "step": 11940 }, { "epoch": 3.126660124304874, "grad_norm": 0.42920616269111633, "learning_rate": 4.2009440257281956e-05, "loss": 0.062, "step": 11950 }, { "epoch": 3.1292770690219167, "grad_norm": 0.5105937123298645, "learning_rate": 4.1994287386834014e-05, "loss": 0.0533, "step": 11960 }, { "epoch": 3.1318940137389597, "grad_norm": 0.42014339566230774, "learning_rate": 4.197912290095089e-05, "loss": 0.0605, "step": 11970 }, { "epoch": 3.1345109584560027, "grad_norm": 0.45812082290649414, "learning_rate": 4.1963946809997366e-05, "loss": 0.0551, "step": 11980 }, { "epoch": 3.1371279031730452, "grad_norm": 0.423980712890625, "learning_rate": 4.194875912434615e-05, "loss": 0.054, "step": 11990 }, { "epoch": 3.1397448478900882, "grad_norm": 0.47686734795570374, "learning_rate": 4.1933559854377904e-05, "loss": 0.0623, "step": 12000 }, { "epoch": 3.1397448478900882, "eval_loss": 0.06390022339072847, "eval_runtime": 8.8627, "eval_samples_per_second": 115.541, "eval_steps_per_second": 1.805, "step": 12000 }, { "epoch": 3.142361792607131, "grad_norm": 0.8884722590446472, "learning_rate": 4.191834901048116e-05, "loss": 0.0578, "step": 12010 }, { "epoch": 3.144978737324174, "grad_norm": 0.4640854299068451, "learning_rate": 4.19031266030524e-05, "loss": 0.0563, "step": 12020 }, { "epoch": 3.1475956820412168, "grad_norm": 0.5811957120895386, "learning_rate": 4.1887892642496e-05, "loss": 0.0567, "step": 12030 }, { "epoch": 3.1502126267582597, "grad_norm": 0.6208037734031677, "learning_rate": 4.1872647139224215e-05, "loss": 0.0544, "step": 12040 }, { "epoch": 3.1528295714753027, "grad_norm": 0.6058433055877686, "learning_rate": 4.185739010365721e-05, "loss": 0.0579, "step": 12050 }, { "epoch": 3.1554465161923453, "grad_norm": 0.7506754994392395, "learning_rate": 4.1842121546223034e-05, "loss": 0.0568, "step": 12060 }, { "epoch": 3.1580634609093883, "grad_norm": 0.5643253326416016, "learning_rate": 4.1826841477357584e-05, "loss": 0.0547, "step": 12070 }, { "epoch": 3.1606804056264313, "grad_norm": 0.5418350696563721, "learning_rate": 4.1811549907504654e-05, "loss": 0.0557, "step": 12080 }, { "epoch": 3.163297350343474, "grad_norm": 0.8077098727226257, "learning_rate": 4.1796246847115886e-05, "loss": 0.0558, "step": 12090 }, { "epoch": 3.165914295060517, "grad_norm": 0.42577677965164185, "learning_rate": 4.1780932306650775e-05, "loss": 0.0592, "step": 12100 }, { "epoch": 3.16853123977756, "grad_norm": 0.7265817523002625, "learning_rate": 4.176560629657667e-05, "loss": 0.0602, "step": 12110 }, { "epoch": 3.1711481844946023, "grad_norm": 0.7095639705657959, "learning_rate": 4.175026882736876e-05, "loss": 0.0566, "step": 12120 }, { "epoch": 3.1737651292116453, "grad_norm": 0.4474140703678131, "learning_rate": 4.173491990951003e-05, "loss": 0.0559, "step": 12130 }, { "epoch": 3.1763820739286883, "grad_norm": 0.4180569648742676, "learning_rate": 4.1719559553491356e-05, "loss": 0.0519, "step": 12140 }, { "epoch": 3.1789990186457313, "grad_norm": 0.5319985151290894, "learning_rate": 4.170418776981139e-05, "loss": 0.0552, "step": 12150 }, { "epoch": 3.181615963362774, "grad_norm": 0.4087623953819275, "learning_rate": 4.168880456897658e-05, "loss": 0.057, "step": 12160 }, { "epoch": 3.184232908079817, "grad_norm": 0.7610423564910889, "learning_rate": 4.167340996150122e-05, "loss": 0.0563, "step": 12170 }, { "epoch": 3.18684985279686, "grad_norm": 0.40939995646476746, "learning_rate": 4.165800395790737e-05, "loss": 0.0565, "step": 12180 }, { "epoch": 3.1894667975139024, "grad_norm": 0.6325905919075012, "learning_rate": 4.164258656872489e-05, "loss": 0.0521, "step": 12190 }, { "epoch": 3.1920837422309454, "grad_norm": 0.5581347942352295, "learning_rate": 4.162715780449143e-05, "loss": 0.0606, "step": 12200 }, { "epoch": 3.1947006869479884, "grad_norm": 0.5394080281257629, "learning_rate": 4.161171767575239e-05, "loss": 0.0567, "step": 12210 }, { "epoch": 3.197317631665031, "grad_norm": 0.47330865263938904, "learning_rate": 4.1596266193060954e-05, "loss": 0.0548, "step": 12220 }, { "epoch": 3.199934576382074, "grad_norm": 0.5119001269340515, "learning_rate": 4.158080336697807e-05, "loss": 0.0584, "step": 12230 }, { "epoch": 3.202551521099117, "grad_norm": 0.4136107265949249, "learning_rate": 4.1565329208072437e-05, "loss": 0.0589, "step": 12240 }, { "epoch": 3.2051684658161594, "grad_norm": 0.8875516057014465, "learning_rate": 4.154984372692048e-05, "loss": 0.0514, "step": 12250 }, { "epoch": 3.2077854105332024, "grad_norm": 0.559062659740448, "learning_rate": 4.153434693410641e-05, "loss": 0.0556, "step": 12260 }, { "epoch": 3.2104023552502454, "grad_norm": 0.6495611667633057, "learning_rate": 4.15188388402221e-05, "loss": 0.0607, "step": 12270 }, { "epoch": 3.2130192999672884, "grad_norm": 0.6440935730934143, "learning_rate": 4.1503319455867215e-05, "loss": 0.0556, "step": 12280 }, { "epoch": 3.215636244684331, "grad_norm": 0.4232413172721863, "learning_rate": 4.148778879164911e-05, "loss": 0.0505, "step": 12290 }, { "epoch": 3.218253189401374, "grad_norm": 0.7083742022514343, "learning_rate": 4.147224685818282e-05, "loss": 0.0592, "step": 12300 }, { "epoch": 3.220870134118417, "grad_norm": 0.591503381729126, "learning_rate": 4.145669366609113e-05, "loss": 0.0577, "step": 12310 }, { "epoch": 3.2234870788354595, "grad_norm": 0.5156564116477966, "learning_rate": 4.14411292260045e-05, "loss": 0.057, "step": 12320 }, { "epoch": 3.2261040235525025, "grad_norm": 0.5001283884048462, "learning_rate": 4.142555354856107e-05, "loss": 0.0503, "step": 12330 }, { "epoch": 3.2287209682695455, "grad_norm": 0.5152615308761597, "learning_rate": 4.1409966644406686e-05, "loss": 0.0617, "step": 12340 }, { "epoch": 3.231337912986588, "grad_norm": 0.7098157405853271, "learning_rate": 4.139436852419482e-05, "loss": 0.0605, "step": 12350 }, { "epoch": 3.233954857703631, "grad_norm": 0.7927548289299011, "learning_rate": 4.137875919858667e-05, "loss": 0.0512, "step": 12360 }, { "epoch": 3.236571802420674, "grad_norm": 0.5227700471878052, "learning_rate": 4.136313867825104e-05, "loss": 0.0524, "step": 12370 }, { "epoch": 3.2391887471377165, "grad_norm": 0.8455647826194763, "learning_rate": 4.134750697386442e-05, "loss": 0.0515, "step": 12380 }, { "epoch": 3.2418056918547595, "grad_norm": 0.597399890422821, "learning_rate": 4.133186409611094e-05, "loss": 0.0503, "step": 12390 }, { "epoch": 3.2444226365718025, "grad_norm": 0.7307800650596619, "learning_rate": 4.131621005568235e-05, "loss": 0.0522, "step": 12400 }, { "epoch": 3.247039581288845, "grad_norm": 0.49984949827194214, "learning_rate": 4.130054486327803e-05, "loss": 0.0514, "step": 12410 }, { "epoch": 3.249656526005888, "grad_norm": 0.5808200836181641, "learning_rate": 4.1284868529605e-05, "loss": 0.0512, "step": 12420 }, { "epoch": 3.252273470722931, "grad_norm": 0.739686131477356, "learning_rate": 4.12691810653779e-05, "loss": 0.0586, "step": 12430 }, { "epoch": 3.2548904154399736, "grad_norm": 0.5442174077033997, "learning_rate": 4.125348248131895e-05, "loss": 0.0519, "step": 12440 }, { "epoch": 3.2575073601570166, "grad_norm": 0.6988082528114319, "learning_rate": 4.123777278815798e-05, "loss": 0.0534, "step": 12450 }, { "epoch": 3.2601243048740596, "grad_norm": 0.7346671223640442, "learning_rate": 4.1222051996632415e-05, "loss": 0.0493, "step": 12460 }, { "epoch": 3.2627412495911026, "grad_norm": 0.6555301547050476, "learning_rate": 4.1206320117487285e-05, "loss": 0.0589, "step": 12470 }, { "epoch": 3.265358194308145, "grad_norm": 0.6752069592475891, "learning_rate": 4.119057716147517e-05, "loss": 0.0582, "step": 12480 }, { "epoch": 3.267975139025188, "grad_norm": 0.5889864563941956, "learning_rate": 4.117482313935623e-05, "loss": 0.0545, "step": 12490 }, { "epoch": 3.270592083742231, "grad_norm": 0.506980836391449, "learning_rate": 4.1159058061898195e-05, "loss": 0.0536, "step": 12500 }, { "epoch": 3.2732090284592736, "grad_norm": 0.49957287311553955, "learning_rate": 4.114328193987634e-05, "loss": 0.0541, "step": 12510 }, { "epoch": 3.2758259731763166, "grad_norm": 0.39040929079055786, "learning_rate": 4.112749478407351e-05, "loss": 0.0503, "step": 12520 }, { "epoch": 3.2784429178933596, "grad_norm": 0.5606443285942078, "learning_rate": 4.111169660528007e-05, "loss": 0.0527, "step": 12530 }, { "epoch": 3.281059862610402, "grad_norm": 0.3037455677986145, "learning_rate": 4.109588741429392e-05, "loss": 0.054, "step": 12540 }, { "epoch": 3.283676807327445, "grad_norm": 0.5493094325065613, "learning_rate": 4.1080067221920494e-05, "loss": 0.0492, "step": 12550 }, { "epoch": 3.286293752044488, "grad_norm": 0.9509132504463196, "learning_rate": 4.106423603897275e-05, "loss": 0.0608, "step": 12560 }, { "epoch": 3.2889106967615307, "grad_norm": 0.6977026462554932, "learning_rate": 4.1048393876271155e-05, "loss": 0.0492, "step": 12570 }, { "epoch": 3.2915276414785737, "grad_norm": 0.5020390748977661, "learning_rate": 4.1032540744643666e-05, "loss": 0.0599, "step": 12580 }, { "epoch": 3.2941445861956167, "grad_norm": 0.8102987408638, "learning_rate": 4.101667665492576e-05, "loss": 0.0535, "step": 12590 }, { "epoch": 3.2967615309126597, "grad_norm": 0.4310251474380493, "learning_rate": 4.100080161796038e-05, "loss": 0.0499, "step": 12600 }, { "epoch": 3.299378475629702, "grad_norm": 0.5380797982215881, "learning_rate": 4.098491564459799e-05, "loss": 0.047, "step": 12610 }, { "epoch": 3.301995420346745, "grad_norm": 0.4607020318508148, "learning_rate": 4.0969018745696476e-05, "loss": 0.0536, "step": 12620 }, { "epoch": 3.304612365063788, "grad_norm": 0.6233174204826355, "learning_rate": 4.095311093212122e-05, "loss": 0.0569, "step": 12630 }, { "epoch": 3.3072293097808307, "grad_norm": 0.5668174028396606, "learning_rate": 4.093719221474508e-05, "loss": 0.0555, "step": 12640 }, { "epoch": 3.3098462544978737, "grad_norm": 0.32410597801208496, "learning_rate": 4.092126260444834e-05, "loss": 0.05, "step": 12650 }, { "epoch": 3.3124631992149167, "grad_norm": 0.5236939787864685, "learning_rate": 4.090532211211874e-05, "loss": 0.0583, "step": 12660 }, { "epoch": 3.3150801439319593, "grad_norm": 0.6198887825012207, "learning_rate": 4.0889370748651446e-05, "loss": 0.0521, "step": 12670 }, { "epoch": 3.3176970886490023, "grad_norm": 0.5270038843154907, "learning_rate": 4.087340852494908e-05, "loss": 0.0537, "step": 12680 }, { "epoch": 3.3203140333660452, "grad_norm": 0.42099273204803467, "learning_rate": 4.0857435451921664e-05, "loss": 0.0558, "step": 12690 }, { "epoch": 3.322930978083088, "grad_norm": 0.36083897948265076, "learning_rate": 4.084145154048664e-05, "loss": 0.0515, "step": 12700 }, { "epoch": 3.325547922800131, "grad_norm": 0.445707231760025, "learning_rate": 4.082545680156887e-05, "loss": 0.0537, "step": 12710 }, { "epoch": 3.3281648675171738, "grad_norm": 0.5661662220954895, "learning_rate": 4.0809451246100594e-05, "loss": 0.0615, "step": 12720 }, { "epoch": 3.3307818122342168, "grad_norm": 0.8061146140098572, "learning_rate": 4.0793434885021475e-05, "loss": 0.0618, "step": 12730 }, { "epoch": 3.3333987569512593, "grad_norm": 0.666907012462616, "learning_rate": 4.077740772927853e-05, "loss": 0.0509, "step": 12740 }, { "epoch": 3.3360157016683023, "grad_norm": 0.46689221262931824, "learning_rate": 4.0761369789826166e-05, "loss": 0.0581, "step": 12750 }, { "epoch": 3.3386326463853453, "grad_norm": 0.6941690444946289, "learning_rate": 4.074532107762619e-05, "loss": 0.0637, "step": 12760 }, { "epoch": 3.341249591102388, "grad_norm": 0.5134106874465942, "learning_rate": 4.07292616036477e-05, "loss": 0.0579, "step": 12770 }, { "epoch": 3.343866535819431, "grad_norm": 0.43664735555648804, "learning_rate": 4.071319137886724e-05, "loss": 0.054, "step": 12780 }, { "epoch": 3.346483480536474, "grad_norm": 0.5271178483963013, "learning_rate": 4.0697110414268644e-05, "loss": 0.0622, "step": 12790 }, { "epoch": 3.3491004252535164, "grad_norm": 0.536749541759491, "learning_rate": 4.068101872084309e-05, "loss": 0.0568, "step": 12800 }, { "epoch": 3.3517173699705594, "grad_norm": 0.4899667799472809, "learning_rate": 4.0664916309589093e-05, "loss": 0.0501, "step": 12810 }, { "epoch": 3.3543343146876023, "grad_norm": 0.4232407212257385, "learning_rate": 4.064880319151252e-05, "loss": 0.0592, "step": 12820 }, { "epoch": 3.356951259404645, "grad_norm": 0.47795727849006653, "learning_rate": 4.063267937762652e-05, "loss": 0.049, "step": 12830 }, { "epoch": 3.359568204121688, "grad_norm": 0.5101355314254761, "learning_rate": 4.061654487895158e-05, "loss": 0.0506, "step": 12840 }, { "epoch": 3.362185148838731, "grad_norm": 0.4006783068180084, "learning_rate": 4.060039970651547e-05, "loss": 0.0562, "step": 12850 }, { "epoch": 3.364802093555774, "grad_norm": 0.5601989030838013, "learning_rate": 4.0584243871353257e-05, "loss": 0.0571, "step": 12860 }, { "epoch": 3.3674190382728164, "grad_norm": 0.478302001953125, "learning_rate": 4.0568077384507306e-05, "loss": 0.0587, "step": 12870 }, { "epoch": 3.3700359829898594, "grad_norm": 0.5545663833618164, "learning_rate": 4.055190025702727e-05, "loss": 0.0489, "step": 12880 }, { "epoch": 3.372652927706902, "grad_norm": 0.5649152398109436, "learning_rate": 4.0535712499970045e-05, "loss": 0.0519, "step": 12890 }, { "epoch": 3.375269872423945, "grad_norm": 0.4853672981262207, "learning_rate": 4.051951412439983e-05, "loss": 0.0618, "step": 12900 }, { "epoch": 3.377886817140988, "grad_norm": 0.30773958563804626, "learning_rate": 4.050330514138805e-05, "loss": 0.0473, "step": 12910 }, { "epoch": 3.380503761858031, "grad_norm": 0.44944265484809875, "learning_rate": 4.048708556201338e-05, "loss": 0.0561, "step": 12920 }, { "epoch": 3.3831207065750735, "grad_norm": 0.5097119212150574, "learning_rate": 4.047085539736177e-05, "loss": 0.0609, "step": 12930 }, { "epoch": 3.3857376512921165, "grad_norm": 0.4472697675228119, "learning_rate": 4.0454614658526384e-05, "loss": 0.0542, "step": 12940 }, { "epoch": 3.3883545960091594, "grad_norm": 0.45250511169433594, "learning_rate": 4.04383633566076e-05, "loss": 0.0567, "step": 12950 }, { "epoch": 3.390971540726202, "grad_norm": 0.5949851870536804, "learning_rate": 4.042210150271304e-05, "loss": 0.055, "step": 12960 }, { "epoch": 3.393588485443245, "grad_norm": 0.5202013254165649, "learning_rate": 4.0405829107957525e-05, "loss": 0.0502, "step": 12970 }, { "epoch": 3.396205430160288, "grad_norm": 0.3496212065219879, "learning_rate": 4.038954618346308e-05, "loss": 0.0538, "step": 12980 }, { "epoch": 3.398822374877331, "grad_norm": 0.47482752799987793, "learning_rate": 4.0373252740358936e-05, "loss": 0.0571, "step": 12990 }, { "epoch": 3.4014393195943735, "grad_norm": 0.6373538970947266, "learning_rate": 4.035694878978151e-05, "loss": 0.0558, "step": 13000 }, { "epoch": 3.4014393195943735, "eval_loss": 0.06179543161421813, "eval_runtime": 8.9033, "eval_samples_per_second": 115.014, "eval_steps_per_second": 1.797, "step": 13000 }, { "epoch": 3.4040562643114165, "grad_norm": 0.40769073367118835, "learning_rate": 4.034063434287438e-05, "loss": 0.0486, "step": 13010 }, { "epoch": 3.406673209028459, "grad_norm": 0.43151283264160156, "learning_rate": 4.032430941078834e-05, "loss": 0.0584, "step": 13020 }, { "epoch": 3.409290153745502, "grad_norm": 0.601318359375, "learning_rate": 4.030797400468132e-05, "loss": 0.0467, "step": 13030 }, { "epoch": 3.411907098462545, "grad_norm": 0.5108250379562378, "learning_rate": 4.0291628135718404e-05, "loss": 0.0528, "step": 13040 }, { "epoch": 3.414524043179588, "grad_norm": 0.5723781585693359, "learning_rate": 4.027527181507186e-05, "loss": 0.055, "step": 13050 }, { "epoch": 3.4171409878966306, "grad_norm": 0.5672504305839539, "learning_rate": 4.0258905053921056e-05, "loss": 0.0553, "step": 13060 }, { "epoch": 3.4197579326136736, "grad_norm": 0.4763956367969513, "learning_rate": 4.024252786345253e-05, "loss": 0.0492, "step": 13070 }, { "epoch": 3.4223748773307165, "grad_norm": 0.5045111775398254, "learning_rate": 4.022614025485994e-05, "loss": 0.0606, "step": 13080 }, { "epoch": 3.424991822047759, "grad_norm": 0.529390811920166, "learning_rate": 4.020974223934407e-05, "loss": 0.0565, "step": 13090 }, { "epoch": 3.427608766764802, "grad_norm": 0.7006625533103943, "learning_rate": 4.019333382811279e-05, "loss": 0.0521, "step": 13100 }, { "epoch": 3.430225711481845, "grad_norm": 0.35650935769081116, "learning_rate": 4.01769150323811e-05, "loss": 0.0599, "step": 13110 }, { "epoch": 3.4328426561988876, "grad_norm": 0.7201257348060608, "learning_rate": 4.01604858633711e-05, "loss": 0.0532, "step": 13120 }, { "epoch": 3.4354596009159306, "grad_norm": 0.5479598641395569, "learning_rate": 4.014404633231198e-05, "loss": 0.0567, "step": 13130 }, { "epoch": 3.4380765456329736, "grad_norm": 0.4715985655784607, "learning_rate": 4.012759645043997e-05, "loss": 0.0504, "step": 13140 }, { "epoch": 3.440693490350016, "grad_norm": 0.4740786552429199, "learning_rate": 4.011113622899844e-05, "loss": 0.0545, "step": 13150 }, { "epoch": 3.443310435067059, "grad_norm": 0.3725115656852722, "learning_rate": 4.0094665679237786e-05, "loss": 0.0513, "step": 13160 }, { "epoch": 3.445927379784102, "grad_norm": 0.5373284220695496, "learning_rate": 4.007818481241548e-05, "loss": 0.0552, "step": 13170 }, { "epoch": 3.448544324501145, "grad_norm": 0.8126721382141113, "learning_rate": 4.006169363979603e-05, "loss": 0.059, "step": 13180 }, { "epoch": 3.4511612692181877, "grad_norm": 0.5969352126121521, "learning_rate": 4.004519217265099e-05, "loss": 0.0569, "step": 13190 }, { "epoch": 3.4537782139352307, "grad_norm": 0.5900636911392212, "learning_rate": 4.002868042225898e-05, "loss": 0.051, "step": 13200 }, { "epoch": 3.4563951586522736, "grad_norm": 0.663048505783081, "learning_rate": 4.001215839990561e-05, "loss": 0.0556, "step": 13210 }, { "epoch": 3.459012103369316, "grad_norm": 1.137934923171997, "learning_rate": 3.999562611688353e-05, "loss": 0.0507, "step": 13220 }, { "epoch": 3.461629048086359, "grad_norm": 0.4088614881038666, "learning_rate": 3.99790835844924e-05, "loss": 0.0567, "step": 13230 }, { "epoch": 3.464245992803402, "grad_norm": 0.42933085560798645, "learning_rate": 3.996253081403888e-05, "loss": 0.0488, "step": 13240 }, { "epoch": 3.4668629375204447, "grad_norm": 0.5075600743293762, "learning_rate": 3.994596781683664e-05, "loss": 0.0534, "step": 13250 }, { "epoch": 3.4694798822374877, "grad_norm": 0.6735544800758362, "learning_rate": 3.992939460420633e-05, "loss": 0.0558, "step": 13260 }, { "epoch": 3.4720968269545307, "grad_norm": 0.6506981253623962, "learning_rate": 3.991281118747558e-05, "loss": 0.0522, "step": 13270 }, { "epoch": 3.4747137716715732, "grad_norm": 0.3499435484409332, "learning_rate": 3.989621757797901e-05, "loss": 0.0532, "step": 13280 }, { "epoch": 3.4773307163886162, "grad_norm": 0.5630961060523987, "learning_rate": 3.987961378705818e-05, "loss": 0.0556, "step": 13290 }, { "epoch": 3.4799476611056592, "grad_norm": 0.7153360247612, "learning_rate": 3.986299982606164e-05, "loss": 0.0507, "step": 13300 }, { "epoch": 3.482564605822702, "grad_norm": 0.5627347230911255, "learning_rate": 3.9846375706344864e-05, "loss": 0.0551, "step": 13310 }, { "epoch": 3.4851815505397448, "grad_norm": 0.5859262347221375, "learning_rate": 3.98297414392703e-05, "loss": 0.0557, "step": 13320 }, { "epoch": 3.4877984952567878, "grad_norm": 0.6154468655586243, "learning_rate": 3.981309703620728e-05, "loss": 0.0603, "step": 13330 }, { "epoch": 3.4904154399738307, "grad_norm": 0.4030719995498657, "learning_rate": 3.979644250853212e-05, "loss": 0.0455, "step": 13340 }, { "epoch": 3.4930323846908733, "grad_norm": 0.42505738139152527, "learning_rate": 3.9779777867628023e-05, "loss": 0.0554, "step": 13350 }, { "epoch": 3.4956493294079163, "grad_norm": 0.6388312578201294, "learning_rate": 3.976310312488513e-05, "loss": 0.0597, "step": 13360 }, { "epoch": 3.4982662741249593, "grad_norm": 0.5594176650047302, "learning_rate": 3.9746418291700446e-05, "loss": 0.0511, "step": 13370 }, { "epoch": 3.500883218842002, "grad_norm": 0.4737611413002014, "learning_rate": 3.9729723379477926e-05, "loss": 0.0524, "step": 13380 }, { "epoch": 3.503500163559045, "grad_norm": 0.6079171895980835, "learning_rate": 3.9713018399628356e-05, "loss": 0.0522, "step": 13390 }, { "epoch": 3.506117108276088, "grad_norm": 0.4987918436527252, "learning_rate": 3.969630336356945e-05, "loss": 0.0555, "step": 13400 }, { "epoch": 3.5087340529931303, "grad_norm": 0.4886915683746338, "learning_rate": 3.967957828272577e-05, "loss": 0.0552, "step": 13410 }, { "epoch": 3.5113509977101733, "grad_norm": 0.48263683915138245, "learning_rate": 3.966284316852876e-05, "loss": 0.0615, "step": 13420 }, { "epoch": 3.5139679424272163, "grad_norm": 0.6995543837547302, "learning_rate": 3.9646098032416704e-05, "loss": 0.0564, "step": 13430 }, { "epoch": 3.5165848871442593, "grad_norm": 0.7016103267669678, "learning_rate": 3.962934288583474e-05, "loss": 0.0597, "step": 13440 }, { "epoch": 3.519201831861302, "grad_norm": 0.4052731692790985, "learning_rate": 3.961257774023487e-05, "loss": 0.0504, "step": 13450 }, { "epoch": 3.521818776578345, "grad_norm": 0.42247992753982544, "learning_rate": 3.9595802607075896e-05, "loss": 0.0539, "step": 13460 }, { "epoch": 3.5244357212953874, "grad_norm": 0.626800537109375, "learning_rate": 3.957901749782347e-05, "loss": 0.051, "step": 13470 }, { "epoch": 3.5270526660124304, "grad_norm": 0.6743230819702148, "learning_rate": 3.9562222423950065e-05, "loss": 0.0625, "step": 13480 }, { "epoch": 3.5296696107294734, "grad_norm": 0.8379527926445007, "learning_rate": 3.9545417396934936e-05, "loss": 0.0522, "step": 13490 }, { "epoch": 3.5322865554465164, "grad_norm": 0.5695173144340515, "learning_rate": 3.952860242826418e-05, "loss": 0.0549, "step": 13500 }, { "epoch": 3.534903500163559, "grad_norm": 0.5186528563499451, "learning_rate": 3.951177752943066e-05, "loss": 0.0581, "step": 13510 }, { "epoch": 3.537520444880602, "grad_norm": 0.5912035703659058, "learning_rate": 3.9494942711934026e-05, "loss": 0.0478, "step": 13520 }, { "epoch": 3.540137389597645, "grad_norm": 0.33838945627212524, "learning_rate": 3.9478097987280735e-05, "loss": 0.0512, "step": 13530 }, { "epoch": 3.5427543343146874, "grad_norm": 0.3433888554573059, "learning_rate": 3.946124336698399e-05, "loss": 0.0522, "step": 13540 }, { "epoch": 3.5453712790317304, "grad_norm": 0.45829489827156067, "learning_rate": 3.944437886256377e-05, "loss": 0.0579, "step": 13550 }, { "epoch": 3.5479882237487734, "grad_norm": 0.7708325982093811, "learning_rate": 3.9427504485546796e-05, "loss": 0.0546, "step": 13560 }, { "epoch": 3.5506051684658164, "grad_norm": 0.5576505064964294, "learning_rate": 3.9410620247466544e-05, "loss": 0.0572, "step": 13570 }, { "epoch": 3.553222113182859, "grad_norm": 0.5772538185119629, "learning_rate": 3.9393726159863245e-05, "loss": 0.0572, "step": 13580 }, { "epoch": 3.555839057899902, "grad_norm": 0.6166955828666687, "learning_rate": 3.937682223428383e-05, "loss": 0.0548, "step": 13590 }, { "epoch": 3.5584560026169445, "grad_norm": 0.535210907459259, "learning_rate": 3.935990848228199e-05, "loss": 0.0594, "step": 13600 }, { "epoch": 3.5610729473339875, "grad_norm": 0.6279690861701965, "learning_rate": 3.9342984915418114e-05, "loss": 0.0516, "step": 13610 }, { "epoch": 3.5636898920510305, "grad_norm": 0.5341526865959167, "learning_rate": 3.932605154525929e-05, "loss": 0.0489, "step": 13620 }, { "epoch": 3.5663068367680735, "grad_norm": 0.5120111703872681, "learning_rate": 3.930910838337932e-05, "loss": 0.0529, "step": 13630 }, { "epoch": 3.568923781485116, "grad_norm": 0.4864059090614319, "learning_rate": 3.9292155441358694e-05, "loss": 0.0521, "step": 13640 }, { "epoch": 3.571540726202159, "grad_norm": 0.5746162533760071, "learning_rate": 3.927519273078459e-05, "loss": 0.0583, "step": 13650 }, { "epoch": 3.5741576709192016, "grad_norm": 0.7206608653068542, "learning_rate": 3.9258220263250865e-05, "loss": 0.0519, "step": 13660 }, { "epoch": 3.5767746156362445, "grad_norm": 0.6371074318885803, "learning_rate": 3.9241238050358044e-05, "loss": 0.0539, "step": 13670 }, { "epoch": 3.5793915603532875, "grad_norm": 0.5628343820571899, "learning_rate": 3.922424610371329e-05, "loss": 0.0626, "step": 13680 }, { "epoch": 3.5820085050703305, "grad_norm": 0.4270327389240265, "learning_rate": 3.920724443493046e-05, "loss": 0.0588, "step": 13690 }, { "epoch": 3.5846254497873735, "grad_norm": 0.4563710689544678, "learning_rate": 3.919023305563002e-05, "loss": 0.0561, "step": 13700 }, { "epoch": 3.587242394504416, "grad_norm": 0.9741207361221313, "learning_rate": 3.9173211977439094e-05, "loss": 0.0555, "step": 13710 }, { "epoch": 3.589859339221459, "grad_norm": 0.7190433144569397, "learning_rate": 3.9156181211991426e-05, "loss": 0.0527, "step": 13720 }, { "epoch": 3.5924762839385016, "grad_norm": 0.697902262210846, "learning_rate": 3.9139140770927385e-05, "loss": 0.0495, "step": 13730 }, { "epoch": 3.5950932286555446, "grad_norm": 0.5454297661781311, "learning_rate": 3.912209066589395e-05, "loss": 0.0533, "step": 13740 }, { "epoch": 3.5977101733725876, "grad_norm": 0.5934323668479919, "learning_rate": 3.910503090854472e-05, "loss": 0.0597, "step": 13750 }, { "epoch": 3.6003271180896306, "grad_norm": 0.44584307074546814, "learning_rate": 3.908796151053985e-05, "loss": 0.0568, "step": 13760 }, { "epoch": 3.602944062806673, "grad_norm": 0.5419604778289795, "learning_rate": 3.9070882483546135e-05, "loss": 0.051, "step": 13770 }, { "epoch": 3.605561007523716, "grad_norm": 0.3984220623970032, "learning_rate": 3.905379383923693e-05, "loss": 0.0512, "step": 13780 }, { "epoch": 3.6081779522407587, "grad_norm": 0.4912005364894867, "learning_rate": 3.9036695589292136e-05, "loss": 0.0518, "step": 13790 }, { "epoch": 3.6107948969578016, "grad_norm": 0.39835718274116516, "learning_rate": 3.9019587745398276e-05, "loss": 0.0499, "step": 13800 }, { "epoch": 3.6134118416748446, "grad_norm": 0.6596834659576416, "learning_rate": 3.9002470319248394e-05, "loss": 0.0543, "step": 13810 }, { "epoch": 3.6160287863918876, "grad_norm": 0.7677016854286194, "learning_rate": 3.898534332254208e-05, "loss": 0.0503, "step": 13820 }, { "epoch": 3.61864573110893, "grad_norm": 0.4970107674598694, "learning_rate": 3.896820676698548e-05, "loss": 0.0598, "step": 13830 }, { "epoch": 3.621262675825973, "grad_norm": 0.709835946559906, "learning_rate": 3.8951060664291265e-05, "loss": 0.0587, "step": 13840 }, { "epoch": 3.623879620543016, "grad_norm": 0.610241711139679, "learning_rate": 3.893390502617864e-05, "loss": 0.0533, "step": 13850 }, { "epoch": 3.6264965652600587, "grad_norm": 0.7120442986488342, "learning_rate": 3.891673986437331e-05, "loss": 0.0496, "step": 13860 }, { "epoch": 3.6291135099771017, "grad_norm": 0.6501908302307129, "learning_rate": 3.889956519060752e-05, "loss": 0.0548, "step": 13870 }, { "epoch": 3.6317304546941447, "grad_norm": 0.4381459653377533, "learning_rate": 3.8882381016619986e-05, "loss": 0.0565, "step": 13880 }, { "epoch": 3.6343473994111877, "grad_norm": 0.49421507120132446, "learning_rate": 3.886518735415593e-05, "loss": 0.0547, "step": 13890 }, { "epoch": 3.63696434412823, "grad_norm": 0.5255186557769775, "learning_rate": 3.884798421496705e-05, "loss": 0.0537, "step": 13900 }, { "epoch": 3.639581288845273, "grad_norm": 0.8173955082893372, "learning_rate": 3.883077161081155e-05, "loss": 0.0552, "step": 13910 }, { "epoch": 3.6421982335623158, "grad_norm": 0.5856022834777832, "learning_rate": 3.8813549553454056e-05, "loss": 0.0532, "step": 13920 }, { "epoch": 3.6448151782793587, "grad_norm": 0.4360755980014801, "learning_rate": 3.8796318054665706e-05, "loss": 0.0504, "step": 13930 }, { "epoch": 3.6474321229964017, "grad_norm": 0.35014355182647705, "learning_rate": 3.877907712622406e-05, "loss": 0.0464, "step": 13940 }, { "epoch": 3.6500490677134447, "grad_norm": 0.5001941919326782, "learning_rate": 3.876182677991312e-05, "loss": 0.0575, "step": 13950 }, { "epoch": 3.6526660124304873, "grad_norm": 0.29256579279899597, "learning_rate": 3.874456702752334e-05, "loss": 0.0547, "step": 13960 }, { "epoch": 3.6552829571475303, "grad_norm": 0.4693641662597656, "learning_rate": 3.872729788085161e-05, "loss": 0.0558, "step": 13970 }, { "epoch": 3.6578999018645733, "grad_norm": 0.40088728070259094, "learning_rate": 3.871001935170121e-05, "loss": 0.0521, "step": 13980 }, { "epoch": 3.660516846581616, "grad_norm": 0.4256943464279175, "learning_rate": 3.869273145188187e-05, "loss": 0.0523, "step": 13990 }, { "epoch": 3.663133791298659, "grad_norm": 0.445076584815979, "learning_rate": 3.8675434193209684e-05, "loss": 0.0507, "step": 14000 }, { "epoch": 3.663133791298659, "eval_loss": 0.05693136046282009, "eval_runtime": 9.1176, "eval_samples_per_second": 112.31, "eval_steps_per_second": 1.755, "step": 14000 }, { "epoch": 3.665750736015702, "grad_norm": 0.37255793809890747, "learning_rate": 3.8658127587507184e-05, "loss": 0.0531, "step": 14010 }, { "epoch": 3.6683676807327448, "grad_norm": 0.5347610116004944, "learning_rate": 3.8640811646603276e-05, "loss": 0.0559, "step": 14020 }, { "epoch": 3.6709846254497873, "grad_norm": 0.2988486886024475, "learning_rate": 3.8623486382333226e-05, "loss": 0.0518, "step": 14030 }, { "epoch": 3.6736015701668303, "grad_norm": 0.5495784282684326, "learning_rate": 3.860615180653869e-05, "loss": 0.0499, "step": 14040 }, { "epoch": 3.676218514883873, "grad_norm": 0.5568211674690247, "learning_rate": 3.85888079310677e-05, "loss": 0.0556, "step": 14050 }, { "epoch": 3.678835459600916, "grad_norm": 0.672922670841217, "learning_rate": 3.857145476777463e-05, "loss": 0.0541, "step": 14060 }, { "epoch": 3.681452404317959, "grad_norm": 0.6420120596885681, "learning_rate": 3.8554092328520186e-05, "loss": 0.0474, "step": 14070 }, { "epoch": 3.684069349035002, "grad_norm": 0.508482813835144, "learning_rate": 3.853672062517144e-05, "loss": 0.0455, "step": 14080 }, { "epoch": 3.6866862937520444, "grad_norm": 0.36084631085395813, "learning_rate": 3.8519339669601794e-05, "loss": 0.0492, "step": 14090 }, { "epoch": 3.6893032384690874, "grad_norm": 0.45002564787864685, "learning_rate": 3.850194947369097e-05, "loss": 0.0512, "step": 14100 }, { "epoch": 3.6919201831861304, "grad_norm": 0.36685696244239807, "learning_rate": 3.8484550049324996e-05, "loss": 0.0531, "step": 14110 }, { "epoch": 3.694537127903173, "grad_norm": 0.48534664511680603, "learning_rate": 3.8467141408396206e-05, "loss": 0.0583, "step": 14120 }, { "epoch": 3.697154072620216, "grad_norm": 0.8050063848495483, "learning_rate": 3.844972356280326e-05, "loss": 0.0559, "step": 14130 }, { "epoch": 3.699771017337259, "grad_norm": 0.6816076040267944, "learning_rate": 3.843229652445107e-05, "loss": 0.0509, "step": 14140 }, { "epoch": 3.702387962054302, "grad_norm": 0.5003925561904907, "learning_rate": 3.8414860305250875e-05, "loss": 0.0472, "step": 14150 }, { "epoch": 3.7050049067713444, "grad_norm": 0.5868620872497559, "learning_rate": 3.839741491712016e-05, "loss": 0.0518, "step": 14160 }, { "epoch": 3.7076218514883874, "grad_norm": 0.6365765929222107, "learning_rate": 3.837996037198267e-05, "loss": 0.0515, "step": 14170 }, { "epoch": 3.71023879620543, "grad_norm": 0.40195780992507935, "learning_rate": 3.836249668176844e-05, "loss": 0.0498, "step": 14180 }, { "epoch": 3.712855740922473, "grad_norm": 0.4129416346549988, "learning_rate": 3.834502385841372e-05, "loss": 0.0529, "step": 14190 }, { "epoch": 3.715472685639516, "grad_norm": 0.41690176725387573, "learning_rate": 3.832754191386103e-05, "loss": 0.0527, "step": 14200 }, { "epoch": 3.718089630356559, "grad_norm": 0.6315004825592041, "learning_rate": 3.831005086005912e-05, "loss": 0.0545, "step": 14210 }, { "epoch": 3.7207065750736015, "grad_norm": 0.49347928166389465, "learning_rate": 3.829255070896294e-05, "loss": 0.0504, "step": 14220 }, { "epoch": 3.7233235197906445, "grad_norm": 0.625518798828125, "learning_rate": 3.82750414725337e-05, "loss": 0.0556, "step": 14230 }, { "epoch": 3.725940464507687, "grad_norm": 0.6610779166221619, "learning_rate": 3.8257523162738794e-05, "loss": 0.0531, "step": 14240 }, { "epoch": 3.72855740922473, "grad_norm": 0.5094367265701294, "learning_rate": 3.823999579155182e-05, "loss": 0.0568, "step": 14250 }, { "epoch": 3.731174353941773, "grad_norm": 0.7669066786766052, "learning_rate": 3.822245937095256e-05, "loss": 0.0522, "step": 14260 }, { "epoch": 3.733791298658816, "grad_norm": 0.7019723057746887, "learning_rate": 3.8204913912927e-05, "loss": 0.0549, "step": 14270 }, { "epoch": 3.7364082433758585, "grad_norm": 0.5560470223426819, "learning_rate": 3.8187359429467294e-05, "loss": 0.0525, "step": 14280 }, { "epoch": 3.7390251880929015, "grad_norm": 0.6931577324867249, "learning_rate": 3.816979593257177e-05, "loss": 0.0481, "step": 14290 }, { "epoch": 3.7416421328099445, "grad_norm": 0.6408355236053467, "learning_rate": 3.815222343424492e-05, "loss": 0.0502, "step": 14300 }, { "epoch": 3.744259077526987, "grad_norm": 0.6335650086402893, "learning_rate": 3.8134641946497354e-05, "loss": 0.0547, "step": 14310 }, { "epoch": 3.74687602224403, "grad_norm": 0.6570053100585938, "learning_rate": 3.811705148134587e-05, "loss": 0.054, "step": 14320 }, { "epoch": 3.749492966961073, "grad_norm": 0.4380761981010437, "learning_rate": 3.80994520508134e-05, "loss": 0.0495, "step": 14330 }, { "epoch": 3.752109911678116, "grad_norm": 0.4016902446746826, "learning_rate": 3.8081843666928965e-05, "loss": 0.0581, "step": 14340 }, { "epoch": 3.7547268563951586, "grad_norm": 0.5980265736579895, "learning_rate": 3.8064226341727736e-05, "loss": 0.0507, "step": 14350 }, { "epoch": 3.7573438011122016, "grad_norm": 0.44559043645858765, "learning_rate": 3.8046600087250996e-05, "loss": 0.0506, "step": 14360 }, { "epoch": 3.759960745829244, "grad_norm": 0.45184126496315, "learning_rate": 3.802896491554611e-05, "loss": 0.0508, "step": 14370 }, { "epoch": 3.762577690546287, "grad_norm": 0.7190099954605103, "learning_rate": 3.801132083866657e-05, "loss": 0.0505, "step": 14380 }, { "epoch": 3.76519463526333, "grad_norm": 0.5304298996925354, "learning_rate": 3.799366786867192e-05, "loss": 0.0589, "step": 14390 }, { "epoch": 3.767811579980373, "grad_norm": 0.4039107859134674, "learning_rate": 3.7976006017627806e-05, "loss": 0.0546, "step": 14400 }, { "epoch": 3.7704285246974156, "grad_norm": 0.972225546836853, "learning_rate": 3.7958335297605935e-05, "loss": 0.0505, "step": 14410 }, { "epoch": 3.7730454694144586, "grad_norm": 0.6542357206344604, "learning_rate": 3.7940655720684076e-05, "loss": 0.0467, "step": 14420 }, { "epoch": 3.7756624141315016, "grad_norm": 0.7045339941978455, "learning_rate": 3.792296729894606e-05, "loss": 0.051, "step": 14430 }, { "epoch": 3.778279358848544, "grad_norm": 0.348215252161026, "learning_rate": 3.790527004448175e-05, "loss": 0.0567, "step": 14440 }, { "epoch": 3.780896303565587, "grad_norm": 0.7830608487129211, "learning_rate": 3.788756396938705e-05, "loss": 0.0511, "step": 14450 }, { "epoch": 3.78351324828263, "grad_norm": 0.4746474623680115, "learning_rate": 3.786984908576391e-05, "loss": 0.0481, "step": 14460 }, { "epoch": 3.786130192999673, "grad_norm": 0.5564001798629761, "learning_rate": 3.785212540572026e-05, "loss": 0.0547, "step": 14470 }, { "epoch": 3.7887471377167157, "grad_norm": 0.4673481583595276, "learning_rate": 3.78343929413701e-05, "loss": 0.0441, "step": 14480 }, { "epoch": 3.7913640824337587, "grad_norm": 0.43412885069847107, "learning_rate": 3.7816651704833374e-05, "loss": 0.0521, "step": 14490 }, { "epoch": 3.793981027150801, "grad_norm": 0.4434505105018616, "learning_rate": 3.779890170823606e-05, "loss": 0.0576, "step": 14500 }, { "epoch": 3.796597971867844, "grad_norm": 0.6474137306213379, "learning_rate": 3.778114296371013e-05, "loss": 0.0571, "step": 14510 }, { "epoch": 3.799214916584887, "grad_norm": 0.66424560546875, "learning_rate": 3.776337548339348e-05, "loss": 0.0469, "step": 14520 }, { "epoch": 3.80183186130193, "grad_norm": 0.4851696193218231, "learning_rate": 3.774559927943006e-05, "loss": 0.0532, "step": 14530 }, { "epoch": 3.8044488060189727, "grad_norm": 0.3956303894519806, "learning_rate": 3.7727814363969705e-05, "loss": 0.0463, "step": 14540 }, { "epoch": 3.8070657507360157, "grad_norm": 0.5008952021598816, "learning_rate": 3.771002074916824e-05, "loss": 0.06, "step": 14550 }, { "epoch": 3.8096826954530587, "grad_norm": 0.4984845221042633, "learning_rate": 3.769221844718746e-05, "loss": 0.0543, "step": 14560 }, { "epoch": 3.8122996401701013, "grad_norm": 0.5751848220825195, "learning_rate": 3.767440747019505e-05, "loss": 0.0501, "step": 14570 }, { "epoch": 3.8149165848871442, "grad_norm": 0.42314207553863525, "learning_rate": 3.7656587830364646e-05, "loss": 0.0484, "step": 14580 }, { "epoch": 3.8175335296041872, "grad_norm": 0.5318662524223328, "learning_rate": 3.763875953987579e-05, "loss": 0.0504, "step": 14590 }, { "epoch": 3.8201504743212302, "grad_norm": 0.43563997745513916, "learning_rate": 3.7620922610913966e-05, "loss": 0.0517, "step": 14600 }, { "epoch": 3.8227674190382728, "grad_norm": 0.5153934359550476, "learning_rate": 3.760307705567056e-05, "loss": 0.0511, "step": 14610 }, { "epoch": 3.8253843637553158, "grad_norm": 0.49038711190223694, "learning_rate": 3.758522288634282e-05, "loss": 0.0518, "step": 14620 }, { "epoch": 3.8280013084723583, "grad_norm": 0.6636834740638733, "learning_rate": 3.756736011513391e-05, "loss": 0.0497, "step": 14630 }, { "epoch": 3.8306182531894013, "grad_norm": 0.8136001825332642, "learning_rate": 3.754948875425286e-05, "loss": 0.0536, "step": 14640 }, { "epoch": 3.8332351979064443, "grad_norm": 0.4622998535633087, "learning_rate": 3.753160881591459e-05, "loss": 0.0484, "step": 14650 }, { "epoch": 3.8358521426234873, "grad_norm": 0.4776436388492584, "learning_rate": 3.751372031233985e-05, "loss": 0.0497, "step": 14660 }, { "epoch": 3.83846908734053, "grad_norm": 0.3973548114299774, "learning_rate": 3.749582325575528e-05, "loss": 0.0537, "step": 14670 }, { "epoch": 3.841086032057573, "grad_norm": 0.4959774613380432, "learning_rate": 3.7477917658393345e-05, "loss": 0.0514, "step": 14680 }, { "epoch": 3.8437029767746154, "grad_norm": 0.28917068243026733, "learning_rate": 3.746000353249234e-05, "loss": 0.0452, "step": 14690 }, { "epoch": 3.8463199214916584, "grad_norm": 0.44032183289527893, "learning_rate": 3.744208089029642e-05, "loss": 0.0564, "step": 14700 }, { "epoch": 3.8489368662087013, "grad_norm": 0.5193437933921814, "learning_rate": 3.7424149744055534e-05, "loss": 0.0511, "step": 14710 }, { "epoch": 3.8515538109257443, "grad_norm": 0.46186214685440063, "learning_rate": 3.740621010602545e-05, "loss": 0.054, "step": 14720 }, { "epoch": 3.854170755642787, "grad_norm": 0.5270763635635376, "learning_rate": 3.7388261988467747e-05, "loss": 0.0504, "step": 14730 }, { "epoch": 3.85678770035983, "grad_norm": 0.4619028866291046, "learning_rate": 3.7370305403649774e-05, "loss": 0.0518, "step": 14740 }, { "epoch": 3.859404645076873, "grad_norm": 0.4640951156616211, "learning_rate": 3.7352340363844704e-05, "loss": 0.0498, "step": 14750 }, { "epoch": 3.8620215897939154, "grad_norm": 0.3832848370075226, "learning_rate": 3.7334366881331486e-05, "loss": 0.052, "step": 14760 }, { "epoch": 3.8646385345109584, "grad_norm": 0.5284420847892761, "learning_rate": 3.73163849683948e-05, "loss": 0.0509, "step": 14770 }, { "epoch": 3.8672554792280014, "grad_norm": 0.5752606987953186, "learning_rate": 3.729839463732513e-05, "loss": 0.0513, "step": 14780 }, { "epoch": 3.8698724239450444, "grad_norm": 0.5546534657478333, "learning_rate": 3.7280395900418685e-05, "loss": 0.0489, "step": 14790 }, { "epoch": 3.872489368662087, "grad_norm": 0.5715247392654419, "learning_rate": 3.726238876997744e-05, "loss": 0.0533, "step": 14800 }, { "epoch": 3.87510631337913, "grad_norm": 0.3901219666004181, "learning_rate": 3.724437325830911e-05, "loss": 0.0513, "step": 14810 }, { "epoch": 3.8777232580961725, "grad_norm": 0.4849676489830017, "learning_rate": 3.722634937772711e-05, "loss": 0.0457, "step": 14820 }, { "epoch": 3.8803402028132155, "grad_norm": 0.3245883285999298, "learning_rate": 3.72083171405506e-05, "loss": 0.0547, "step": 14830 }, { "epoch": 3.8829571475302584, "grad_norm": 0.5932340621948242, "learning_rate": 3.719027655910443e-05, "loss": 0.0508, "step": 14840 }, { "epoch": 3.8855740922473014, "grad_norm": 0.36464598774909973, "learning_rate": 3.7172227645719186e-05, "loss": 0.0499, "step": 14850 }, { "epoch": 3.888191036964344, "grad_norm": 0.4754194915294647, "learning_rate": 3.7154170412731124e-05, "loss": 0.0532, "step": 14860 }, { "epoch": 3.890807981681387, "grad_norm": 0.4943714439868927, "learning_rate": 3.713610487248219e-05, "loss": 0.0511, "step": 14870 }, { "epoch": 3.89342492639843, "grad_norm": 0.7925359606742859, "learning_rate": 3.7118031037320025e-05, "loss": 0.0566, "step": 14880 }, { "epoch": 3.8960418711154725, "grad_norm": 0.43726474046707153, "learning_rate": 3.709994891959789e-05, "loss": 0.0457, "step": 14890 }, { "epoch": 3.8986588158325155, "grad_norm": 0.4101634621620178, "learning_rate": 3.708185853167478e-05, "loss": 0.0555, "step": 14900 }, { "epoch": 3.9012757605495585, "grad_norm": 0.7545664310455322, "learning_rate": 3.706375988591528e-05, "loss": 0.0596, "step": 14910 }, { "epoch": 3.9038927052666015, "grad_norm": 0.5561701059341431, "learning_rate": 3.704565299468966e-05, "loss": 0.0468, "step": 14920 }, { "epoch": 3.906509649983644, "grad_norm": 0.5525078177452087, "learning_rate": 3.70275378703738e-05, "loss": 0.0542, "step": 14930 }, { "epoch": 3.909126594700687, "grad_norm": 0.45929309725761414, "learning_rate": 3.700941452534922e-05, "loss": 0.0586, "step": 14940 }, { "epoch": 3.9117435394177296, "grad_norm": 0.6954630017280579, "learning_rate": 3.699128297200305e-05, "loss": 0.0459, "step": 14950 }, { "epoch": 3.9143604841347726, "grad_norm": 0.4128349721431732, "learning_rate": 3.697314322272804e-05, "loss": 0.0478, "step": 14960 }, { "epoch": 3.9169774288518155, "grad_norm": 0.576116681098938, "learning_rate": 3.695499528992253e-05, "loss": 0.0528, "step": 14970 }, { "epoch": 3.9195943735688585, "grad_norm": 0.7343944907188416, "learning_rate": 3.693683918599049e-05, "loss": 0.0554, "step": 14980 }, { "epoch": 3.922211318285901, "grad_norm": 0.4868308901786804, "learning_rate": 3.6918674923341405e-05, "loss": 0.0498, "step": 14990 }, { "epoch": 3.924828263002944, "grad_norm": 0.6821767091751099, "learning_rate": 3.69005025143904e-05, "loss": 0.0512, "step": 15000 }, { "epoch": 3.924828263002944, "eval_loss": 0.05896398778761369, "eval_runtime": 8.8799, "eval_samples_per_second": 115.317, "eval_steps_per_second": 1.802, "step": 15000 }, { "epoch": 3.927445207719987, "grad_norm": 0.43927785754203796, "learning_rate": 3.688232197155814e-05, "loss": 0.0546, "step": 15010 }, { "epoch": 3.9300621524370296, "grad_norm": 0.48438242077827454, "learning_rate": 3.686413330727086e-05, "loss": 0.0515, "step": 15020 }, { "epoch": 3.9326790971540726, "grad_norm": 0.43656083941459656, "learning_rate": 3.684593653396034e-05, "loss": 0.0491, "step": 15030 }, { "epoch": 3.9352960418711156, "grad_norm": 0.5122218728065491, "learning_rate": 3.6827731664063895e-05, "loss": 0.0475, "step": 15040 }, { "epoch": 3.9379129865881586, "grad_norm": 0.6488099694252014, "learning_rate": 3.680951871002438e-05, "loss": 0.052, "step": 15050 }, { "epoch": 3.940529931305201, "grad_norm": 0.7116113305091858, "learning_rate": 3.6791297684290196e-05, "loss": 0.0554, "step": 15060 }, { "epoch": 3.943146876022244, "grad_norm": 0.5660650134086609, "learning_rate": 3.677306859931522e-05, "loss": 0.0529, "step": 15070 }, { "epoch": 3.9457638207392867, "grad_norm": 0.5480781197547913, "learning_rate": 3.675483146755888e-05, "loss": 0.0591, "step": 15080 }, { "epoch": 3.9483807654563297, "grad_norm": 0.6030336618423462, "learning_rate": 3.673658630148606e-05, "loss": 0.0571, "step": 15090 }, { "epoch": 3.9509977101733726, "grad_norm": 0.6756324768066406, "learning_rate": 3.671833311356718e-05, "loss": 0.0587, "step": 15100 }, { "epoch": 3.9536146548904156, "grad_norm": 0.565447986125946, "learning_rate": 3.670007191627812e-05, "loss": 0.049, "step": 15110 }, { "epoch": 3.956231599607458, "grad_norm": 0.8143734335899353, "learning_rate": 3.668180272210022e-05, "loss": 0.0555, "step": 15120 }, { "epoch": 3.958848544324501, "grad_norm": 0.5056126117706299, "learning_rate": 3.666352554352032e-05, "loss": 0.0488, "step": 15130 }, { "epoch": 3.961465489041544, "grad_norm": 0.6878169178962708, "learning_rate": 3.664524039303069e-05, "loss": 0.0514, "step": 15140 }, { "epoch": 3.9640824337585867, "grad_norm": 0.5267339944839478, "learning_rate": 3.662694728312905e-05, "loss": 0.0513, "step": 15150 }, { "epoch": 3.9666993784756297, "grad_norm": 0.36863964796066284, "learning_rate": 3.660864622631859e-05, "loss": 0.0463, "step": 15160 }, { "epoch": 3.9693163231926727, "grad_norm": 0.4979141652584076, "learning_rate": 3.659033723510789e-05, "loss": 0.052, "step": 15170 }, { "epoch": 3.9719332679097157, "grad_norm": 0.34969276189804077, "learning_rate": 3.657202032201099e-05, "loss": 0.0461, "step": 15180 }, { "epoch": 3.9745502126267582, "grad_norm": 0.31592005491256714, "learning_rate": 3.6553695499547305e-05, "loss": 0.0529, "step": 15190 }, { "epoch": 3.977167157343801, "grad_norm": 0.5293835997581482, "learning_rate": 3.6535362780241694e-05, "loss": 0.0541, "step": 15200 }, { "epoch": 3.9797841020608438, "grad_norm": 0.6030070781707764, "learning_rate": 3.65170221766244e-05, "loss": 0.0514, "step": 15210 }, { "epoch": 3.9824010467778868, "grad_norm": 0.53548264503479, "learning_rate": 3.649867370123104e-05, "loss": 0.0524, "step": 15220 }, { "epoch": 3.9850179914949297, "grad_norm": 0.585054337978363, "learning_rate": 3.648031736660264e-05, "loss": 0.0482, "step": 15230 }, { "epoch": 3.9876349362119727, "grad_norm": 0.6449889540672302, "learning_rate": 3.6461953185285566e-05, "loss": 0.0509, "step": 15240 }, { "epoch": 3.9902518809290153, "grad_norm": 0.5677632689476013, "learning_rate": 3.644358116983157e-05, "loss": 0.0516, "step": 15250 }, { "epoch": 3.9928688256460583, "grad_norm": 0.49215826392173767, "learning_rate": 3.6425201332797755e-05, "loss": 0.0505, "step": 15260 }, { "epoch": 3.995485770363101, "grad_norm": 0.6127063632011414, "learning_rate": 3.640681368674656e-05, "loss": 0.0531, "step": 15270 }, { "epoch": 3.998102715080144, "grad_norm": 0.6169252395629883, "learning_rate": 3.638841824424577e-05, "loss": 0.0519, "step": 15280 }, { "epoch": 4.000523388943408, "grad_norm": 0.491829514503479, "learning_rate": 3.63700150178685e-05, "loss": 0.0493, "step": 15290 }, { "epoch": 4.003140333660451, "grad_norm": 0.5655731558799744, "learning_rate": 3.635160402019317e-05, "loss": 0.048, "step": 15300 }, { "epoch": 4.005757278377494, "grad_norm": 0.6087900400161743, "learning_rate": 3.633318526380354e-05, "loss": 0.0535, "step": 15310 }, { "epoch": 4.008374223094537, "grad_norm": 0.5320557355880737, "learning_rate": 3.631475876128864e-05, "loss": 0.0396, "step": 15320 }, { "epoch": 4.01099116781158, "grad_norm": 0.44770991802215576, "learning_rate": 3.629632452524282e-05, "loss": 0.0434, "step": 15330 }, { "epoch": 4.0136081125286225, "grad_norm": 0.5166875123977661, "learning_rate": 3.627788256826571e-05, "loss": 0.0506, "step": 15340 }, { "epoch": 4.0162250572456655, "grad_norm": 0.6423705816268921, "learning_rate": 3.6259432902962195e-05, "loss": 0.0537, "step": 15350 }, { "epoch": 4.0188420019627085, "grad_norm": 0.62786465883255, "learning_rate": 3.624097554194248e-05, "loss": 0.0527, "step": 15360 }, { "epoch": 4.0214589466797515, "grad_norm": 0.6154875755310059, "learning_rate": 3.622251049782197e-05, "loss": 0.0542, "step": 15370 }, { "epoch": 4.0240758913967944, "grad_norm": 0.5390315651893616, "learning_rate": 3.6204037783221356e-05, "loss": 0.0504, "step": 15380 }, { "epoch": 4.026692836113837, "grad_norm": 0.5157982110977173, "learning_rate": 3.618555741076657e-05, "loss": 0.0549, "step": 15390 }, { "epoch": 4.0293097808308795, "grad_norm": 0.5032212138175964, "learning_rate": 3.6167069393088756e-05, "loss": 0.0515, "step": 15400 }, { "epoch": 4.0319267255479225, "grad_norm": 0.5536220073699951, "learning_rate": 3.614857374282432e-05, "loss": 0.0449, "step": 15410 }, { "epoch": 4.0345436702649655, "grad_norm": 0.41552475094795227, "learning_rate": 3.613007047261485e-05, "loss": 0.0522, "step": 15420 }, { "epoch": 4.0371606149820085, "grad_norm": 0.45113885402679443, "learning_rate": 3.6111559595107164e-05, "loss": 0.0508, "step": 15430 }, { "epoch": 4.0397775596990515, "grad_norm": 0.6253324151039124, "learning_rate": 3.609304112295328e-05, "loss": 0.0547, "step": 15440 }, { "epoch": 4.0423945044160945, "grad_norm": 0.5622521042823792, "learning_rate": 3.607451506881037e-05, "loss": 0.0471, "step": 15450 }, { "epoch": 4.0450114491331375, "grad_norm": 0.346451997756958, "learning_rate": 3.605598144534085e-05, "loss": 0.051, "step": 15460 }, { "epoch": 4.04762839385018, "grad_norm": 0.5430278182029724, "learning_rate": 3.603744026521227e-05, "loss": 0.0432, "step": 15470 }, { "epoch": 4.050245338567223, "grad_norm": 0.5486944317817688, "learning_rate": 3.6018891541097336e-05, "loss": 0.0515, "step": 15480 }, { "epoch": 4.052862283284266, "grad_norm": 0.4349093735218048, "learning_rate": 3.6000335285673934e-05, "loss": 0.054, "step": 15490 }, { "epoch": 4.055479228001309, "grad_norm": 0.6662614941596985, "learning_rate": 3.5981771511625094e-05, "loss": 0.0527, "step": 15500 }, { "epoch": 4.0580961727183515, "grad_norm": 0.5280335545539856, "learning_rate": 3.5963200231638976e-05, "loss": 0.0522, "step": 15510 }, { "epoch": 4.0607131174353945, "grad_norm": 0.46068698167800903, "learning_rate": 3.5944621458408883e-05, "loss": 0.0523, "step": 15520 }, { "epoch": 4.063330062152437, "grad_norm": 0.4579375386238098, "learning_rate": 3.5926035204633216e-05, "loss": 0.0476, "step": 15530 }, { "epoch": 4.06594700686948, "grad_norm": 0.6499910354614258, "learning_rate": 3.590744148301552e-05, "loss": 0.0523, "step": 15540 }, { "epoch": 4.068563951586523, "grad_norm": 0.5918278694152832, "learning_rate": 3.5888840306264424e-05, "loss": 0.0553, "step": 15550 }, { "epoch": 4.071180896303566, "grad_norm": 0.5414620041847229, "learning_rate": 3.5870231687093644e-05, "loss": 0.0538, "step": 15560 }, { "epoch": 4.073797841020609, "grad_norm": 0.6221410632133484, "learning_rate": 3.5851615638222014e-05, "loss": 0.0503, "step": 15570 }, { "epoch": 4.076414785737652, "grad_norm": 0.5084514617919922, "learning_rate": 3.583299217237341e-05, "loss": 0.0543, "step": 15580 }, { "epoch": 4.079031730454695, "grad_norm": 0.44841188192367554, "learning_rate": 3.581436130227682e-05, "loss": 0.045, "step": 15590 }, { "epoch": 4.081648675171737, "grad_norm": 0.43916547298431396, "learning_rate": 3.579572304066624e-05, "loss": 0.0475, "step": 15600 }, { "epoch": 4.08426561988878, "grad_norm": 0.4475759267807007, "learning_rate": 3.5777077400280765e-05, "loss": 0.0462, "step": 15610 }, { "epoch": 4.086882564605823, "grad_norm": 0.6395149827003479, "learning_rate": 3.575842439386451e-05, "loss": 0.0496, "step": 15620 }, { "epoch": 4.089499509322866, "grad_norm": 0.5050554275512695, "learning_rate": 3.573976403416662e-05, "loss": 0.0456, "step": 15630 }, { "epoch": 4.092116454039909, "grad_norm": 0.5360559821128845, "learning_rate": 3.57210963339413e-05, "loss": 0.0504, "step": 15640 }, { "epoch": 4.094733398756952, "grad_norm": 0.5057238936424255, "learning_rate": 3.5702421305947714e-05, "loss": 0.0528, "step": 15650 }, { "epoch": 4.097350343473994, "grad_norm": 0.4850975573062897, "learning_rate": 3.5683738962950086e-05, "loss": 0.0424, "step": 15660 }, { "epoch": 4.099967288191037, "grad_norm": 0.6539948582649231, "learning_rate": 3.566504931771762e-05, "loss": 0.0508, "step": 15670 }, { "epoch": 4.10258423290808, "grad_norm": 0.4550129473209381, "learning_rate": 3.5646352383024504e-05, "loss": 0.043, "step": 15680 }, { "epoch": 4.105201177625123, "grad_norm": 0.5807818174362183, "learning_rate": 3.562764817164994e-05, "loss": 0.0471, "step": 15690 }, { "epoch": 4.107818122342166, "grad_norm": 0.604076087474823, "learning_rate": 3.560893669637805e-05, "loss": 0.0472, "step": 15700 }, { "epoch": 4.110435067059209, "grad_norm": 0.4531025290489197, "learning_rate": 3.5590217969997964e-05, "loss": 0.0495, "step": 15710 }, { "epoch": 4.113052011776251, "grad_norm": 0.3466964364051819, "learning_rate": 3.557149200530376e-05, "loss": 0.0438, "step": 15720 }, { "epoch": 4.115668956493294, "grad_norm": 0.4820724427700043, "learning_rate": 3.555275881509445e-05, "loss": 0.0461, "step": 15730 }, { "epoch": 4.118285901210337, "grad_norm": 0.586309552192688, "learning_rate": 3.5534018412174e-05, "loss": 0.0472, "step": 15740 }, { "epoch": 4.12090284592738, "grad_norm": 0.46379247307777405, "learning_rate": 3.55152708093513e-05, "loss": 0.0522, "step": 15750 }, { "epoch": 4.123519790644423, "grad_norm": 0.438968688249588, "learning_rate": 3.549651601944014e-05, "loss": 0.0503, "step": 15760 }, { "epoch": 4.126136735361466, "grad_norm": 0.5727360844612122, "learning_rate": 3.547775405525927e-05, "loss": 0.0491, "step": 15770 }, { "epoch": 4.128753680078509, "grad_norm": 0.5099532604217529, "learning_rate": 3.54589849296323e-05, "loss": 0.0523, "step": 15780 }, { "epoch": 4.131370624795551, "grad_norm": 0.7122123837471008, "learning_rate": 3.5440208655387754e-05, "loss": 0.0575, "step": 15790 }, { "epoch": 4.133987569512594, "grad_norm": 0.437671422958374, "learning_rate": 3.542142524535903e-05, "loss": 0.0481, "step": 15800 }, { "epoch": 4.136604514229637, "grad_norm": 0.4661213159561157, "learning_rate": 3.540263471238443e-05, "loss": 0.0571, "step": 15810 }, { "epoch": 4.13922145894668, "grad_norm": 0.6541229486465454, "learning_rate": 3.538383706930709e-05, "loss": 0.0499, "step": 15820 }, { "epoch": 4.141838403663723, "grad_norm": 0.3765426278114319, "learning_rate": 3.5365032328975025e-05, "loss": 0.0494, "step": 15830 }, { "epoch": 4.144455348380766, "grad_norm": 1.0478640794754028, "learning_rate": 3.53462205042411e-05, "loss": 0.052, "step": 15840 }, { "epoch": 4.147072293097808, "grad_norm": 0.5365044474601746, "learning_rate": 3.532740160796302e-05, "loss": 0.0524, "step": 15850 }, { "epoch": 4.149689237814851, "grad_norm": 0.6117434501647949, "learning_rate": 3.5308575653003314e-05, "loss": 0.055, "step": 15860 }, { "epoch": 4.152306182531894, "grad_norm": 0.6938223838806152, "learning_rate": 3.5289742652229366e-05, "loss": 0.0508, "step": 15870 }, { "epoch": 4.154923127248937, "grad_norm": 0.5880016684532166, "learning_rate": 3.527090261851334e-05, "loss": 0.0494, "step": 15880 }, { "epoch": 4.15754007196598, "grad_norm": 0.4967406094074249, "learning_rate": 3.525205556473221e-05, "loss": 0.0539, "step": 15890 }, { "epoch": 4.160157016683023, "grad_norm": 0.6478481888771057, "learning_rate": 3.5233201503767786e-05, "loss": 0.0573, "step": 15900 }, { "epoch": 4.162773961400066, "grad_norm": 0.5297810435295105, "learning_rate": 3.5214340448506624e-05, "loss": 0.0485, "step": 15910 }, { "epoch": 4.165390906117108, "grad_norm": 0.4374099671840668, "learning_rate": 3.519547241184008e-05, "loss": 0.0537, "step": 15920 }, { "epoch": 4.168007850834151, "grad_norm": 0.5356457829475403, "learning_rate": 3.517659740666429e-05, "loss": 0.0482, "step": 15930 }, { "epoch": 4.170624795551194, "grad_norm": 0.36456596851348877, "learning_rate": 3.5157715445880114e-05, "loss": 0.0465, "step": 15940 }, { "epoch": 4.173241740268237, "grad_norm": 0.5124794244766235, "learning_rate": 3.513882654239322e-05, "loss": 0.0489, "step": 15950 }, { "epoch": 4.17585868498528, "grad_norm": 0.6307095289230347, "learning_rate": 3.511993070911399e-05, "loss": 0.0493, "step": 15960 }, { "epoch": 4.178475629702323, "grad_norm": 0.6663428544998169, "learning_rate": 3.510102795895755e-05, "loss": 0.0511, "step": 15970 }, { "epoch": 4.181092574419365, "grad_norm": 0.684826135635376, "learning_rate": 3.508211830484374e-05, "loss": 0.0533, "step": 15980 }, { "epoch": 4.183709519136408, "grad_norm": 0.3565167188644409, "learning_rate": 3.506320175969714e-05, "loss": 0.0504, "step": 15990 }, { "epoch": 4.186326463853451, "grad_norm": 0.544464111328125, "learning_rate": 3.504427833644702e-05, "loss": 0.0454, "step": 16000 }, { "epoch": 4.186326463853451, "eval_loss": 0.05761846851942311, "eval_runtime": 8.7972, "eval_samples_per_second": 116.4, "eval_steps_per_second": 1.819, "step": 16000 }, { "epoch": 4.188943408570494, "grad_norm": 0.40490591526031494, "learning_rate": 3.502534804802738e-05, "loss": 0.0448, "step": 16010 }, { "epoch": 4.191560353287537, "grad_norm": 0.4140538275241852, "learning_rate": 3.500641090737689e-05, "loss": 0.0494, "step": 16020 }, { "epoch": 4.19417729800458, "grad_norm": 0.4455014765262604, "learning_rate": 3.4987466927438875e-05, "loss": 0.0481, "step": 16030 }, { "epoch": 4.196794242721623, "grad_norm": 0.4680039584636688, "learning_rate": 3.49685161211614e-05, "loss": 0.0458, "step": 16040 }, { "epoch": 4.199411187438665, "grad_norm": 0.45873773097991943, "learning_rate": 3.4949558501497166e-05, "loss": 0.0508, "step": 16050 }, { "epoch": 4.202028132155708, "grad_norm": 0.5634317994117737, "learning_rate": 3.49305940814035e-05, "loss": 0.0506, "step": 16060 }, { "epoch": 4.204645076872751, "grad_norm": 0.5611090660095215, "learning_rate": 3.4911622873842434e-05, "loss": 0.0518, "step": 16070 }, { "epoch": 4.207262021589794, "grad_norm": 0.4446581304073334, "learning_rate": 3.4892644891780586e-05, "loss": 0.0516, "step": 16080 }, { "epoch": 4.209878966306837, "grad_norm": 0.6510340571403503, "learning_rate": 3.487366014818923e-05, "loss": 0.0548, "step": 16090 }, { "epoch": 4.21249591102388, "grad_norm": 0.6392655372619629, "learning_rate": 3.485466865604427e-05, "loss": 0.042, "step": 16100 }, { "epoch": 4.215112855740922, "grad_norm": 0.6639431118965149, "learning_rate": 3.483567042832622e-05, "loss": 0.0478, "step": 16110 }, { "epoch": 4.217729800457965, "grad_norm": 0.5047462582588196, "learning_rate": 3.481666547802017e-05, "loss": 0.0476, "step": 16120 }, { "epoch": 4.220346745175008, "grad_norm": 0.35533663630485535, "learning_rate": 3.479765381811583e-05, "loss": 0.0443, "step": 16130 }, { "epoch": 4.222963689892051, "grad_norm": 0.36323872208595276, "learning_rate": 3.4778635461607486e-05, "loss": 0.0472, "step": 16140 }, { "epoch": 4.225580634609094, "grad_norm": 0.44074222445487976, "learning_rate": 3.4759610421494016e-05, "loss": 0.0487, "step": 16150 }, { "epoch": 4.228197579326137, "grad_norm": 1.4504610300064087, "learning_rate": 3.4740578710778845e-05, "loss": 0.0494, "step": 16160 }, { "epoch": 4.230814524043179, "grad_norm": 0.7710520029067993, "learning_rate": 3.472154034246998e-05, "loss": 0.0486, "step": 16170 }, { "epoch": 4.233431468760222, "grad_norm": 0.4526599943637848, "learning_rate": 3.470249532957996e-05, "loss": 0.0493, "step": 16180 }, { "epoch": 4.236048413477265, "grad_norm": 0.4356165826320648, "learning_rate": 3.4683443685125864e-05, "loss": 0.0557, "step": 16190 }, { "epoch": 4.238665358194308, "grad_norm": 0.8690013885498047, "learning_rate": 3.466438542212934e-05, "loss": 0.0577, "step": 16200 }, { "epoch": 4.241282302911351, "grad_norm": 0.4734126627445221, "learning_rate": 3.4645320553616485e-05, "loss": 0.0509, "step": 16210 }, { "epoch": 4.243899247628394, "grad_norm": 0.5786865949630737, "learning_rate": 3.462624909261799e-05, "loss": 0.0556, "step": 16220 }, { "epoch": 4.246516192345437, "grad_norm": 0.5717179179191589, "learning_rate": 3.460717105216901e-05, "loss": 0.0524, "step": 16230 }, { "epoch": 4.249133137062479, "grad_norm": 0.3806012272834778, "learning_rate": 3.4588086445309205e-05, "loss": 0.0433, "step": 16240 }, { "epoch": 4.251750081779522, "grad_norm": 0.6166380643844604, "learning_rate": 3.4568995285082735e-05, "loss": 0.0511, "step": 16250 }, { "epoch": 4.254367026496565, "grad_norm": 0.4418392479419708, "learning_rate": 3.454989758453821e-05, "loss": 0.0541, "step": 16260 }, { "epoch": 4.256983971213608, "grad_norm": 0.8362012505531311, "learning_rate": 3.453079335672873e-05, "loss": 0.0483, "step": 16270 }, { "epoch": 4.259600915930651, "grad_norm": 0.49030205607414246, "learning_rate": 3.451168261471187e-05, "loss": 0.0529, "step": 16280 }, { "epoch": 4.262217860647694, "grad_norm": 0.8152894973754883, "learning_rate": 3.449256537154962e-05, "loss": 0.0556, "step": 16290 }, { "epoch": 4.264834805364737, "grad_norm": 0.5284848213195801, "learning_rate": 3.4473441640308464e-05, "loss": 0.0564, "step": 16300 }, { "epoch": 4.267451750081779, "grad_norm": 0.6003028750419617, "learning_rate": 3.4454311434059266e-05, "loss": 0.046, "step": 16310 }, { "epoch": 4.270068694798822, "grad_norm": 0.5223946571350098, "learning_rate": 3.443517476587735e-05, "loss": 0.0446, "step": 16320 }, { "epoch": 4.272685639515865, "grad_norm": 0.5389282703399658, "learning_rate": 3.441603164884246e-05, "loss": 0.0554, "step": 16330 }, { "epoch": 4.275302584232908, "grad_norm": 0.591784656047821, "learning_rate": 3.4396882096038717e-05, "loss": 0.0473, "step": 16340 }, { "epoch": 4.277919528949951, "grad_norm": 0.762988805770874, "learning_rate": 3.4377726120554675e-05, "loss": 0.0479, "step": 16350 }, { "epoch": 4.280536473666994, "grad_norm": 0.5459496378898621, "learning_rate": 3.4358563735483254e-05, "loss": 0.0529, "step": 16360 }, { "epoch": 4.283153418384036, "grad_norm": 0.3209206163883209, "learning_rate": 3.4339394953921765e-05, "loss": 0.0447, "step": 16370 }, { "epoch": 4.285770363101079, "grad_norm": 0.39667803049087524, "learning_rate": 3.4320219788971884e-05, "loss": 0.0493, "step": 16380 }, { "epoch": 4.288387307818122, "grad_norm": 0.3870331048965454, "learning_rate": 3.430103825373967e-05, "loss": 0.0482, "step": 16390 }, { "epoch": 4.291004252535165, "grad_norm": 0.4382264316082001, "learning_rate": 3.428185036133552e-05, "loss": 0.0465, "step": 16400 }, { "epoch": 4.293621197252208, "grad_norm": 0.6332788467407227, "learning_rate": 3.426265612487416e-05, "loss": 0.0526, "step": 16410 }, { "epoch": 4.296238141969251, "grad_norm": 0.4075234532356262, "learning_rate": 3.424345555747468e-05, "loss": 0.0447, "step": 16420 }, { "epoch": 4.298855086686293, "grad_norm": 0.451867938041687, "learning_rate": 3.422424867226049e-05, "loss": 0.0503, "step": 16430 }, { "epoch": 4.301472031403336, "grad_norm": 0.5655390620231628, "learning_rate": 3.420503548235931e-05, "loss": 0.0455, "step": 16440 }, { "epoch": 4.304088976120379, "grad_norm": 0.47900524735450745, "learning_rate": 3.418581600090318e-05, "loss": 0.0493, "step": 16450 }, { "epoch": 4.306705920837422, "grad_norm": 0.46969595551490784, "learning_rate": 3.416659024102842e-05, "loss": 0.0459, "step": 16460 }, { "epoch": 4.309322865554465, "grad_norm": 0.42236611247062683, "learning_rate": 3.414735821587568e-05, "loss": 0.0419, "step": 16470 }, { "epoch": 4.311939810271508, "grad_norm": 0.6245198249816895, "learning_rate": 3.4128119938589844e-05, "loss": 0.0541, "step": 16480 }, { "epoch": 4.314556754988551, "grad_norm": 0.46810778975486755, "learning_rate": 3.410887542232011e-05, "loss": 0.0515, "step": 16490 }, { "epoch": 4.317173699705593, "grad_norm": 0.4198428690433502, "learning_rate": 3.408962468021991e-05, "loss": 0.0449, "step": 16500 }, { "epoch": 4.319790644422636, "grad_norm": 0.8210209012031555, "learning_rate": 3.407036772544695e-05, "loss": 0.0478, "step": 16510 }, { "epoch": 4.322407589139679, "grad_norm": 0.4958540201187134, "learning_rate": 3.405110457116318e-05, "loss": 0.0493, "step": 16520 }, { "epoch": 4.325024533856722, "grad_norm": 0.5449163913726807, "learning_rate": 3.403183523053479e-05, "loss": 0.0473, "step": 16530 }, { "epoch": 4.327641478573765, "grad_norm": 0.7345625758171082, "learning_rate": 3.4012559716732176e-05, "loss": 0.0481, "step": 16540 }, { "epoch": 4.330258423290808, "grad_norm": 0.4123026430606842, "learning_rate": 3.3993278042929986e-05, "loss": 0.0459, "step": 16550 }, { "epoch": 4.3328753680078504, "grad_norm": 0.5084407925605774, "learning_rate": 3.397399022230705e-05, "loss": 0.0447, "step": 16560 }, { "epoch": 4.335492312724893, "grad_norm": 0.45943090319633484, "learning_rate": 3.395469626804642e-05, "loss": 0.0447, "step": 16570 }, { "epoch": 4.338109257441936, "grad_norm": 0.4610060751438141, "learning_rate": 3.393539619333533e-05, "loss": 0.0543, "step": 16580 }, { "epoch": 4.340726202158979, "grad_norm": 0.5210998058319092, "learning_rate": 3.3916090011365195e-05, "loss": 0.0479, "step": 16590 }, { "epoch": 4.343343146876022, "grad_norm": 0.4095534384250641, "learning_rate": 3.389677773533161e-05, "loss": 0.0548, "step": 16600 }, { "epoch": 4.345960091593065, "grad_norm": 0.5919458270072937, "learning_rate": 3.387745937843433e-05, "loss": 0.0484, "step": 16610 }, { "epoch": 4.3485770363101075, "grad_norm": 0.41136297583580017, "learning_rate": 3.385813495387728e-05, "loss": 0.0457, "step": 16620 }, { "epoch": 4.3511939810271505, "grad_norm": 0.5454357266426086, "learning_rate": 3.383880447486852e-05, "loss": 0.0517, "step": 16630 }, { "epoch": 4.3538109257441935, "grad_norm": 0.7451463937759399, "learning_rate": 3.381946795462024e-05, "loss": 0.046, "step": 16640 }, { "epoch": 4.3564278704612365, "grad_norm": 0.5025386214256287, "learning_rate": 3.380012540634878e-05, "loss": 0.0459, "step": 16650 }, { "epoch": 4.3590448151782795, "grad_norm": 0.6015411019325256, "learning_rate": 3.3780776843274575e-05, "loss": 0.0471, "step": 16660 }, { "epoch": 4.3616617598953225, "grad_norm": 0.6264476776123047, "learning_rate": 3.376142227862221e-05, "loss": 0.0534, "step": 16670 }, { "epoch": 4.3642787046123654, "grad_norm": 0.44910645484924316, "learning_rate": 3.3742061725620325e-05, "loss": 0.0444, "step": 16680 }, { "epoch": 4.3668956493294075, "grad_norm": 0.5679300427436829, "learning_rate": 3.372269519750168e-05, "loss": 0.0553, "step": 16690 }, { "epoch": 4.3695125940464505, "grad_norm": 0.5327921509742737, "learning_rate": 3.370332270750313e-05, "loss": 0.0464, "step": 16700 }, { "epoch": 4.3721295387634935, "grad_norm": 0.4204285442829132, "learning_rate": 3.368394426886556e-05, "loss": 0.0429, "step": 16710 }, { "epoch": 4.3747464834805365, "grad_norm": 0.5075750946998596, "learning_rate": 3.366455989483398e-05, "loss": 0.0438, "step": 16720 }, { "epoch": 4.3773634281975795, "grad_norm": 0.4906587302684784, "learning_rate": 3.364516959865741e-05, "loss": 0.0462, "step": 16730 }, { "epoch": 4.3799803729146225, "grad_norm": 0.531676709651947, "learning_rate": 3.3625773393588935e-05, "loss": 0.0462, "step": 16740 }, { "epoch": 4.3825973176316655, "grad_norm": 0.47353696823120117, "learning_rate": 3.360637129288569e-05, "loss": 0.0486, "step": 16750 }, { "epoch": 4.385214262348708, "grad_norm": 0.5241471529006958, "learning_rate": 3.358696330980881e-05, "loss": 0.0494, "step": 16760 }, { "epoch": 4.387831207065751, "grad_norm": 0.3669177293777466, "learning_rate": 3.356754945762348e-05, "loss": 0.0436, "step": 16770 }, { "epoch": 4.390448151782794, "grad_norm": 0.41063836216926575, "learning_rate": 3.354812974959889e-05, "loss": 0.0492, "step": 16780 }, { "epoch": 4.393065096499837, "grad_norm": 0.4066520631313324, "learning_rate": 3.352870419900821e-05, "loss": 0.0438, "step": 16790 }, { "epoch": 4.3956820412168796, "grad_norm": 0.46243414282798767, "learning_rate": 3.350927281912864e-05, "loss": 0.0508, "step": 16800 }, { "epoch": 4.3982989859339225, "grad_norm": 0.42753133177757263, "learning_rate": 3.348983562324133e-05, "loss": 0.0474, "step": 16810 }, { "epoch": 4.400915930650965, "grad_norm": 0.3843385577201843, "learning_rate": 3.3470392624631425e-05, "loss": 0.0496, "step": 16820 }, { "epoch": 4.403532875368008, "grad_norm": 0.453375905752182, "learning_rate": 3.3450943836588034e-05, "loss": 0.0501, "step": 16830 }, { "epoch": 4.406149820085051, "grad_norm": 0.46848195791244507, "learning_rate": 3.3431489272404213e-05, "loss": 0.0484, "step": 16840 }, { "epoch": 4.408766764802094, "grad_norm": 0.40131011605262756, "learning_rate": 3.341202894537699e-05, "loss": 0.0477, "step": 16850 }, { "epoch": 4.411383709519137, "grad_norm": 0.4040091037750244, "learning_rate": 3.33925628688073e-05, "loss": 0.045, "step": 16860 }, { "epoch": 4.41400065423618, "grad_norm": 0.42682337760925293, "learning_rate": 3.337309105600002e-05, "loss": 0.0456, "step": 16870 }, { "epoch": 4.416617598953222, "grad_norm": 0.5235479474067688, "learning_rate": 3.335361352026396e-05, "loss": 0.0452, "step": 16880 }, { "epoch": 4.419234543670265, "grad_norm": 0.49944835901260376, "learning_rate": 3.3334130274911826e-05, "loss": 0.0501, "step": 16890 }, { "epoch": 4.421851488387308, "grad_norm": 0.5998557806015015, "learning_rate": 3.331464133326024e-05, "loss": 0.0439, "step": 16900 }, { "epoch": 4.424468433104351, "grad_norm": 0.506262481212616, "learning_rate": 3.329514670862971e-05, "loss": 0.0496, "step": 16910 }, { "epoch": 4.427085377821394, "grad_norm": 0.7459492087364197, "learning_rate": 3.3275646414344614e-05, "loss": 0.0416, "step": 16920 }, { "epoch": 4.429702322538437, "grad_norm": 0.4433119297027588, "learning_rate": 3.325614046373323e-05, "loss": 0.0473, "step": 16930 }, { "epoch": 4.43231926725548, "grad_norm": 0.3905789256095886, "learning_rate": 3.3236628870127696e-05, "loss": 0.0398, "step": 16940 }, { "epoch": 4.434936211972522, "grad_norm": 0.48078566789627075, "learning_rate": 3.321711164686399e-05, "loss": 0.0433, "step": 16950 }, { "epoch": 4.437553156689565, "grad_norm": 0.4129769802093506, "learning_rate": 3.319758880728196e-05, "loss": 0.0494, "step": 16960 }, { "epoch": 4.440170101406608, "grad_norm": 0.6216508746147156, "learning_rate": 3.317806036472527e-05, "loss": 0.0431, "step": 16970 }, { "epoch": 4.442787046123651, "grad_norm": 0.5008850693702698, "learning_rate": 3.3158526332541444e-05, "loss": 0.0447, "step": 16980 }, { "epoch": 4.445403990840694, "grad_norm": 0.30528682470321655, "learning_rate": 3.31389867240818e-05, "loss": 0.0482, "step": 16990 }, { "epoch": 4.448020935557737, "grad_norm": 0.5385652184486389, "learning_rate": 3.311944155270147e-05, "loss": 0.0476, "step": 17000 }, { "epoch": 4.448020935557737, "eval_loss": 0.05004162187291969, "eval_runtime": 9.0744, "eval_samples_per_second": 112.844, "eval_steps_per_second": 1.763, "step": 17000 }, { "epoch": 4.450637880274779, "grad_norm": 0.5225152969360352, "learning_rate": 3.309989083175941e-05, "loss": 0.0464, "step": 17010 }, { "epoch": 4.453254824991822, "grad_norm": 0.41770097613334656, "learning_rate": 3.308033457461833e-05, "loss": 0.0511, "step": 17020 }, { "epoch": 4.455871769708865, "grad_norm": 0.4127200245857239, "learning_rate": 3.3060772794644776e-05, "loss": 0.0447, "step": 17030 }, { "epoch": 4.458488714425908, "grad_norm": 0.32362911105155945, "learning_rate": 3.304120550520902e-05, "loss": 0.0485, "step": 17040 }, { "epoch": 4.461105659142951, "grad_norm": 0.42366665601730347, "learning_rate": 3.3021632719685125e-05, "loss": 0.0416, "step": 17050 }, { "epoch": 4.463722603859994, "grad_norm": 0.5108731389045715, "learning_rate": 3.30020544514509e-05, "loss": 0.0502, "step": 17060 }, { "epoch": 4.466339548577036, "grad_norm": 0.26923635601997375, "learning_rate": 3.2982470713887916e-05, "loss": 0.0397, "step": 17070 }, { "epoch": 4.468956493294079, "grad_norm": 0.36798936128616333, "learning_rate": 3.296288152038147e-05, "loss": 0.0506, "step": 17080 }, { "epoch": 4.471573438011122, "grad_norm": 0.5822156667709351, "learning_rate": 3.294328688432059e-05, "loss": 0.0494, "step": 17090 }, { "epoch": 4.474190382728165, "grad_norm": 0.5101978778839111, "learning_rate": 3.2923686819098024e-05, "loss": 0.0453, "step": 17100 }, { "epoch": 4.476807327445208, "grad_norm": 0.6352287530899048, "learning_rate": 3.290408133811024e-05, "loss": 0.0554, "step": 17110 }, { "epoch": 4.479424272162251, "grad_norm": 0.37760841846466064, "learning_rate": 3.288447045475739e-05, "loss": 0.0424, "step": 17120 }, { "epoch": 4.482041216879294, "grad_norm": 0.5223073363304138, "learning_rate": 3.2864854182443326e-05, "loss": 0.0471, "step": 17130 }, { "epoch": 4.484658161596336, "grad_norm": 0.650477945804596, "learning_rate": 3.2845232534575594e-05, "loss": 0.0515, "step": 17140 }, { "epoch": 4.487275106313379, "grad_norm": 0.6777361035346985, "learning_rate": 3.28256055245654e-05, "loss": 0.0453, "step": 17150 }, { "epoch": 4.489892051030422, "grad_norm": 0.693477213382721, "learning_rate": 3.2805973165827614e-05, "loss": 0.0445, "step": 17160 }, { "epoch": 4.492508995747465, "grad_norm": 0.5595983266830444, "learning_rate": 3.2786335471780774e-05, "loss": 0.0462, "step": 17170 }, { "epoch": 4.495125940464508, "grad_norm": 0.46131110191345215, "learning_rate": 3.276669245584707e-05, "loss": 0.0496, "step": 17180 }, { "epoch": 4.497742885181551, "grad_norm": 0.5225169658660889, "learning_rate": 3.27470441314523e-05, "loss": 0.045, "step": 17190 }, { "epoch": 4.500359829898594, "grad_norm": 0.5240589380264282, "learning_rate": 3.272739051202592e-05, "loss": 0.0467, "step": 17200 }, { "epoch": 4.502976774615636, "grad_norm": 0.6839200258255005, "learning_rate": 3.270773161100099e-05, "loss": 0.0506, "step": 17210 }, { "epoch": 4.505593719332679, "grad_norm": 0.3838503658771515, "learning_rate": 3.268806744181419e-05, "loss": 0.0483, "step": 17220 }, { "epoch": 4.508210664049722, "grad_norm": 0.45105141401290894, "learning_rate": 3.266839801790578e-05, "loss": 0.0546, "step": 17230 }, { "epoch": 4.510827608766765, "grad_norm": 0.34984609484672546, "learning_rate": 3.264872335271963e-05, "loss": 0.0441, "step": 17240 }, { "epoch": 4.513444553483808, "grad_norm": 0.35363954305648804, "learning_rate": 3.26290434597032e-05, "loss": 0.0513, "step": 17250 }, { "epoch": 4.51606149820085, "grad_norm": 0.5348709225654602, "learning_rate": 3.2609358352307496e-05, "loss": 0.0456, "step": 17260 }, { "epoch": 4.518678442917893, "grad_norm": 0.3599573075771332, "learning_rate": 3.258966804398711e-05, "loss": 0.0445, "step": 17270 }, { "epoch": 4.521295387634936, "grad_norm": 0.5039171576499939, "learning_rate": 3.256997254820019e-05, "loss": 0.0458, "step": 17280 }, { "epoch": 4.523912332351979, "grad_norm": 0.45528119802474976, "learning_rate": 3.255027187840841e-05, "loss": 0.0458, "step": 17290 }, { "epoch": 4.526529277069022, "grad_norm": 0.6549640893936157, "learning_rate": 3.253056604807699e-05, "loss": 0.051, "step": 17300 }, { "epoch": 4.529146221786065, "grad_norm": 0.5074205994606018, "learning_rate": 3.251085507067469e-05, "loss": 0.0428, "step": 17310 }, { "epoch": 4.531763166503108, "grad_norm": 0.3896365761756897, "learning_rate": 3.2491138959673776e-05, "loss": 0.049, "step": 17320 }, { "epoch": 4.53438011122015, "grad_norm": 0.5313991904258728, "learning_rate": 3.2471417728550015e-05, "loss": 0.0463, "step": 17330 }, { "epoch": 4.536997055937193, "grad_norm": 0.2488325983285904, "learning_rate": 3.245169139078269e-05, "loss": 0.0416, "step": 17340 }, { "epoch": 4.539614000654236, "grad_norm": 0.5214577913284302, "learning_rate": 3.243195995985456e-05, "loss": 0.0453, "step": 17350 }, { "epoch": 4.542230945371279, "grad_norm": 0.49365049600601196, "learning_rate": 3.2412223449251887e-05, "loss": 0.0429, "step": 17360 }, { "epoch": 4.544847890088322, "grad_norm": 0.5702369809150696, "learning_rate": 3.239248187246437e-05, "loss": 0.0493, "step": 17370 }, { "epoch": 4.547464834805365, "grad_norm": 0.5522635579109192, "learning_rate": 3.237273524298521e-05, "loss": 0.0446, "step": 17380 }, { "epoch": 4.550081779522408, "grad_norm": 0.4819454550743103, "learning_rate": 3.2352983574311025e-05, "loss": 0.047, "step": 17390 }, { "epoch": 4.55269872423945, "grad_norm": 0.38399413228034973, "learning_rate": 3.23332268799419e-05, "loss": 0.0512, "step": 17400 }, { "epoch": 4.555315668956493, "grad_norm": 0.5839704871177673, "learning_rate": 3.2313465173381355e-05, "loss": 0.0471, "step": 17410 }, { "epoch": 4.557932613673536, "grad_norm": 0.5217947959899902, "learning_rate": 3.2293698468136326e-05, "loss": 0.0443, "step": 17420 }, { "epoch": 4.560549558390579, "grad_norm": 0.4841725528240204, "learning_rate": 3.227392677771716e-05, "loss": 0.0478, "step": 17430 }, { "epoch": 4.563166503107622, "grad_norm": 0.5399268865585327, "learning_rate": 3.225415011563764e-05, "loss": 0.0392, "step": 17440 }, { "epoch": 4.565783447824665, "grad_norm": 0.3765436112880707, "learning_rate": 3.223436849541491e-05, "loss": 0.0431, "step": 17450 }, { "epoch": 4.568400392541708, "grad_norm": 0.4995643198490143, "learning_rate": 3.221458193056955e-05, "loss": 0.0477, "step": 17460 }, { "epoch": 4.57101733725875, "grad_norm": 0.37771889567375183, "learning_rate": 3.219479043462545e-05, "loss": 0.0504, "step": 17470 }, { "epoch": 4.573634281975793, "grad_norm": 0.4961005747318268, "learning_rate": 3.217499402110993e-05, "loss": 0.0456, "step": 17480 }, { "epoch": 4.576251226692836, "grad_norm": 0.6124542951583862, "learning_rate": 3.215519270355366e-05, "loss": 0.0458, "step": 17490 }, { "epoch": 4.578868171409879, "grad_norm": 0.41645899415016174, "learning_rate": 3.2135386495490644e-05, "loss": 0.0417, "step": 17500 }, { "epoch": 4.581485116126922, "grad_norm": 0.39719998836517334, "learning_rate": 3.2115575410458254e-05, "loss": 0.0423, "step": 17510 }, { "epoch": 4.584102060843964, "grad_norm": 0.39007094502449036, "learning_rate": 3.2095759461997146e-05, "loss": 0.0511, "step": 17520 }, { "epoch": 4.586719005561007, "grad_norm": 0.6247320771217346, "learning_rate": 3.2075938663651364e-05, "loss": 0.0527, "step": 17530 }, { "epoch": 4.58933595027805, "grad_norm": 0.5982612371444702, "learning_rate": 3.2056113028968224e-05, "loss": 0.0484, "step": 17540 }, { "epoch": 4.591952894995093, "grad_norm": 0.44412627816200256, "learning_rate": 3.203628257149837e-05, "loss": 0.0464, "step": 17550 }, { "epoch": 4.594569839712136, "grad_norm": 0.6911810040473938, "learning_rate": 3.2016447304795735e-05, "loss": 0.0468, "step": 17560 }, { "epoch": 4.597186784429179, "grad_norm": 0.5549867153167725, "learning_rate": 3.1996607242417506e-05, "loss": 0.0519, "step": 17570 }, { "epoch": 4.599803729146222, "grad_norm": 0.7803630828857422, "learning_rate": 3.197676239792422e-05, "loss": 0.0503, "step": 17580 }, { "epoch": 4.602420673863264, "grad_norm": 0.4282356798648834, "learning_rate": 3.195691278487961e-05, "loss": 0.0524, "step": 17590 }, { "epoch": 4.605037618580307, "grad_norm": 0.5289554595947266, "learning_rate": 3.193705841685072e-05, "loss": 0.0461, "step": 17600 }, { "epoch": 4.60765456329735, "grad_norm": 0.7792943120002747, "learning_rate": 3.191719930740781e-05, "loss": 0.0498, "step": 17610 }, { "epoch": 4.610271508014393, "grad_norm": 0.31162726879119873, "learning_rate": 3.189733547012439e-05, "loss": 0.0392, "step": 17620 }, { "epoch": 4.612888452731436, "grad_norm": 0.4618692696094513, "learning_rate": 3.187746691857723e-05, "loss": 0.0518, "step": 17630 }, { "epoch": 4.615505397448479, "grad_norm": 0.7038789987564087, "learning_rate": 3.185759366634627e-05, "loss": 0.051, "step": 17640 }, { "epoch": 4.618122342165522, "grad_norm": 0.6297800540924072, "learning_rate": 3.183771572701471e-05, "loss": 0.0509, "step": 17650 }, { "epoch": 4.620739286882564, "grad_norm": 0.5747802257537842, "learning_rate": 3.1817833114168924e-05, "loss": 0.0535, "step": 17660 }, { "epoch": 4.623356231599607, "grad_norm": 0.6170051097869873, "learning_rate": 3.179794584139849e-05, "loss": 0.0402, "step": 17670 }, { "epoch": 4.62597317631665, "grad_norm": 0.5372450351715088, "learning_rate": 3.177805392229617e-05, "loss": 0.0447, "step": 17680 }, { "epoch": 4.628590121033693, "grad_norm": 0.5293178558349609, "learning_rate": 3.175815737045792e-05, "loss": 0.0533, "step": 17690 }, { "epoch": 4.631207065750736, "grad_norm": 0.6131226420402527, "learning_rate": 3.173825619948283e-05, "loss": 0.0467, "step": 17700 }, { "epoch": 4.633824010467779, "grad_norm": 0.37538790702819824, "learning_rate": 3.171835042297317e-05, "loss": 0.0432, "step": 17710 }, { "epoch": 4.636440955184821, "grad_norm": 0.3324880301952362, "learning_rate": 3.169844005453433e-05, "loss": 0.0433, "step": 17720 }, { "epoch": 4.639057899901864, "grad_norm": 0.592828094959259, "learning_rate": 3.167852510777487e-05, "loss": 0.0471, "step": 17730 }, { "epoch": 4.641674844618907, "grad_norm": 0.4177514612674713, "learning_rate": 3.16586055963065e-05, "loss": 0.0417, "step": 17740 }, { "epoch": 4.64429178933595, "grad_norm": 0.3623928129673004, "learning_rate": 3.1638681533743975e-05, "loss": 0.0436, "step": 17750 }, { "epoch": 4.646908734052993, "grad_norm": 0.346414178609848, "learning_rate": 3.161875293370523e-05, "loss": 0.0426, "step": 17760 }, { "epoch": 4.649525678770036, "grad_norm": 0.3698805272579193, "learning_rate": 3.159881980981126e-05, "loss": 0.0412, "step": 17770 }, { "epoch": 4.652142623487078, "grad_norm": 0.4758628010749817, "learning_rate": 3.157888217568617e-05, "loss": 0.046, "step": 17780 }, { "epoch": 4.654759568204121, "grad_norm": 0.4483131170272827, "learning_rate": 3.155894004495716e-05, "loss": 0.0474, "step": 17790 }, { "epoch": 4.657376512921164, "grad_norm": 0.5199103951454163, "learning_rate": 3.153899343125446e-05, "loss": 0.0447, "step": 17800 }, { "epoch": 4.659993457638207, "grad_norm": 0.47452351450920105, "learning_rate": 3.151904234821142e-05, "loss": 0.0476, "step": 17810 }, { "epoch": 4.66261040235525, "grad_norm": 0.36638322472572327, "learning_rate": 3.14990868094644e-05, "loss": 0.0444, "step": 17820 }, { "epoch": 4.665227347072293, "grad_norm": 0.43507784605026245, "learning_rate": 3.147912682865283e-05, "loss": 0.0512, "step": 17830 }, { "epoch": 4.667844291789336, "grad_norm": 0.4801563322544098, "learning_rate": 3.145916241941917e-05, "loss": 0.0475, "step": 17840 }, { "epoch": 4.6704612365063785, "grad_norm": 0.5879807472229004, "learning_rate": 3.14391935954089e-05, "loss": 0.0471, "step": 17850 }, { "epoch": 4.6730781812234214, "grad_norm": 0.49333956837654114, "learning_rate": 3.141922037027053e-05, "loss": 0.0471, "step": 17860 }, { "epoch": 4.675695125940464, "grad_norm": 0.42784637212753296, "learning_rate": 3.139924275765556e-05, "loss": 0.0495, "step": 17870 }, { "epoch": 4.678312070657507, "grad_norm": 0.428956538438797, "learning_rate": 3.137926077121851e-05, "loss": 0.0391, "step": 17880 }, { "epoch": 4.68092901537455, "grad_norm": 0.46614134311676025, "learning_rate": 3.135927442461688e-05, "loss": 0.0468, "step": 17890 }, { "epoch": 4.683545960091593, "grad_norm": 0.5212433934211731, "learning_rate": 3.133928373151114e-05, "loss": 0.0401, "step": 17900 }, { "epoch": 4.686162904808636, "grad_norm": 0.43236270546913147, "learning_rate": 3.131928870556474e-05, "loss": 0.0479, "step": 17910 }, { "epoch": 4.6887798495256785, "grad_norm": 0.41202881932258606, "learning_rate": 3.129928936044411e-05, "loss": 0.0494, "step": 17920 }, { "epoch": 4.6913967942427215, "grad_norm": 0.45419037342071533, "learning_rate": 3.127928570981859e-05, "loss": 0.0455, "step": 17930 }, { "epoch": 4.6940137389597645, "grad_norm": 0.48706430196762085, "learning_rate": 3.1259277767360504e-05, "loss": 0.0427, "step": 17940 }, { "epoch": 4.6966306836768075, "grad_norm": 0.40641650557518005, "learning_rate": 3.123926554674508e-05, "loss": 0.0426, "step": 17950 }, { "epoch": 4.6992476283938505, "grad_norm": 0.5983295440673828, "learning_rate": 3.121924906165049e-05, "loss": 0.0453, "step": 17960 }, { "epoch": 4.701864573110893, "grad_norm": 0.5954302549362183, "learning_rate": 3.1199228325757814e-05, "loss": 0.0501, "step": 17970 }, { "epoch": 4.7044815178279356, "grad_norm": 0.3794120252132416, "learning_rate": 3.117920335275102e-05, "loss": 0.0392, "step": 17980 }, { "epoch": 4.7070984625449785, "grad_norm": 0.45704203844070435, "learning_rate": 3.115917415631702e-05, "loss": 0.0414, "step": 17990 }, { "epoch": 4.7097154072620215, "grad_norm": 0.44982025027275085, "learning_rate": 3.113914075014555e-05, "loss": 0.0438, "step": 18000 }, { "epoch": 4.7097154072620215, "eval_loss": 0.05080666894939734, "eval_runtime": 9.0538, "eval_samples_per_second": 113.102, "eval_steps_per_second": 1.767, "step": 18000 }, { "epoch": 4.7123323519790645, "grad_norm": 0.36351096630096436, "learning_rate": 3.111910314792926e-05, "loss": 0.0458, "step": 18010 }, { "epoch": 4.7149492966961075, "grad_norm": 0.513700008392334, "learning_rate": 3.1099061363363685e-05, "loss": 0.0462, "step": 18020 }, { "epoch": 4.7175662414131505, "grad_norm": 0.34745171666145325, "learning_rate": 3.107901541014717e-05, "loss": 0.0424, "step": 18030 }, { "epoch": 4.720183186130193, "grad_norm": 0.44875892996788025, "learning_rate": 3.105896530198094e-05, "loss": 0.0499, "step": 18040 }, { "epoch": 4.722800130847236, "grad_norm": 0.3810875117778778, "learning_rate": 3.1038911052569055e-05, "loss": 0.0504, "step": 18050 }, { "epoch": 4.725417075564279, "grad_norm": 0.48473119735717773, "learning_rate": 3.101885267561841e-05, "loss": 0.0415, "step": 18060 }, { "epoch": 4.728034020281322, "grad_norm": 0.5180307626724243, "learning_rate": 3.0998790184838735e-05, "loss": 0.0459, "step": 18070 }, { "epoch": 4.730650964998365, "grad_norm": 0.3327622413635254, "learning_rate": 3.0978723593942516e-05, "loss": 0.0426, "step": 18080 }, { "epoch": 4.733267909715408, "grad_norm": 0.48294293880462646, "learning_rate": 3.0958652916645104e-05, "loss": 0.0463, "step": 18090 }, { "epoch": 4.7358848544324506, "grad_norm": 0.4532416760921478, "learning_rate": 3.0938578166664604e-05, "loss": 0.0471, "step": 18100 }, { "epoch": 4.738501799149493, "grad_norm": 0.3738783895969391, "learning_rate": 3.091849935772193e-05, "loss": 0.0391, "step": 18110 }, { "epoch": 4.741118743866536, "grad_norm": 0.5960399508476257, "learning_rate": 3.089841650354076e-05, "loss": 0.0442, "step": 18120 }, { "epoch": 4.743735688583579, "grad_norm": 0.7043212652206421, "learning_rate": 3.0878329617847514e-05, "loss": 0.0393, "step": 18130 }, { "epoch": 4.746352633300622, "grad_norm": 0.42389801144599915, "learning_rate": 3.0858238714371405e-05, "loss": 0.052, "step": 18140 }, { "epoch": 4.748969578017665, "grad_norm": 0.4757963716983795, "learning_rate": 3.0838143806844374e-05, "loss": 0.0474, "step": 18150 }, { "epoch": 4.751586522734708, "grad_norm": 0.5247064232826233, "learning_rate": 3.081804490900111e-05, "loss": 0.0402, "step": 18160 }, { "epoch": 4.75420346745175, "grad_norm": 0.6219999194145203, "learning_rate": 3.0797942034579016e-05, "loss": 0.0512, "step": 18170 }, { "epoch": 4.756820412168793, "grad_norm": 0.5547354221343994, "learning_rate": 3.077783519731819e-05, "loss": 0.0448, "step": 18180 }, { "epoch": 4.759437356885836, "grad_norm": 0.25582262873649597, "learning_rate": 3.075772441096151e-05, "loss": 0.0425, "step": 18190 }, { "epoch": 4.762054301602879, "grad_norm": 0.3228617310523987, "learning_rate": 3.0737609689254473e-05, "loss": 0.041, "step": 18200 }, { "epoch": 4.764671246319922, "grad_norm": 0.35130971670150757, "learning_rate": 3.071749104594533e-05, "loss": 0.0433, "step": 18210 }, { "epoch": 4.767288191036965, "grad_norm": 0.5520963668823242, "learning_rate": 3.0697368494784966e-05, "loss": 0.0513, "step": 18220 }, { "epoch": 4.769905135754007, "grad_norm": 0.4022964835166931, "learning_rate": 3.067724204952695e-05, "loss": 0.0438, "step": 18230 }, { "epoch": 4.77252208047105, "grad_norm": 0.5612972378730774, "learning_rate": 3.0657111723927535e-05, "loss": 0.0433, "step": 18240 }, { "epoch": 4.775139025188093, "grad_norm": 0.5245701670646667, "learning_rate": 3.0636977531745595e-05, "loss": 0.0453, "step": 18250 }, { "epoch": 4.777755969905136, "grad_norm": 0.8202319741249084, "learning_rate": 3.0616839486742667e-05, "loss": 0.0482, "step": 18260 }, { "epoch": 4.780372914622179, "grad_norm": 0.45992013812065125, "learning_rate": 3.059669760268292e-05, "loss": 0.0444, "step": 18270 }, { "epoch": 4.782989859339222, "grad_norm": 0.5470813512802124, "learning_rate": 3.0576551893333124e-05, "loss": 0.0432, "step": 18280 }, { "epoch": 4.785606804056265, "grad_norm": 0.37620311975479126, "learning_rate": 3.05564023724627e-05, "loss": 0.0451, "step": 18290 }, { "epoch": 4.788223748773307, "grad_norm": 0.4782375395298004, "learning_rate": 3.053624905384364e-05, "loss": 0.0507, "step": 18300 }, { "epoch": 4.79084069349035, "grad_norm": 0.31268492341041565, "learning_rate": 3.0516091951250563e-05, "loss": 0.0431, "step": 18310 }, { "epoch": 4.793457638207393, "grad_norm": 0.37621551752090454, "learning_rate": 3.0495931078460654e-05, "loss": 0.0456, "step": 18320 }, { "epoch": 4.796074582924436, "grad_norm": 0.42091211676597595, "learning_rate": 3.047576644925367e-05, "loss": 0.0486, "step": 18330 }, { "epoch": 4.798691527641479, "grad_norm": 0.5487850904464722, "learning_rate": 3.0455598077411952e-05, "loss": 0.0493, "step": 18340 }, { "epoch": 4.801308472358522, "grad_norm": 0.6485093832015991, "learning_rate": 3.0435425976720395e-05, "loss": 0.0437, "step": 18350 }, { "epoch": 4.803925417075565, "grad_norm": 0.5202974081039429, "learning_rate": 3.041525016096643e-05, "loss": 0.044, "step": 18360 }, { "epoch": 4.806542361792607, "grad_norm": 0.3426308035850525, "learning_rate": 3.0395070643940048e-05, "loss": 0.0466, "step": 18370 }, { "epoch": 4.80915930650965, "grad_norm": 0.647731602191925, "learning_rate": 3.0374887439433748e-05, "loss": 0.0547, "step": 18380 }, { "epoch": 4.811776251226693, "grad_norm": 0.8727405071258545, "learning_rate": 3.0354700561242573e-05, "loss": 0.0429, "step": 18390 }, { "epoch": 4.814393195943736, "grad_norm": 0.7195037007331848, "learning_rate": 3.0334510023164054e-05, "loss": 0.0481, "step": 18400 }, { "epoch": 4.817010140660779, "grad_norm": 0.4770624041557312, "learning_rate": 3.031431583899823e-05, "loss": 0.0526, "step": 18410 }, { "epoch": 4.819627085377821, "grad_norm": 0.5087447166442871, "learning_rate": 3.0294118022547645e-05, "loss": 0.0431, "step": 18420 }, { "epoch": 4.822244030094864, "grad_norm": 0.4283955693244934, "learning_rate": 3.027391658761731e-05, "loss": 0.0485, "step": 18430 }, { "epoch": 4.824860974811907, "grad_norm": 0.34631991386413574, "learning_rate": 3.025371154801472e-05, "loss": 0.0416, "step": 18440 }, { "epoch": 4.82747791952895, "grad_norm": 0.2821793258190155, "learning_rate": 3.0233502917549826e-05, "loss": 0.0442, "step": 18450 }, { "epoch": 4.830094864245993, "grad_norm": 0.631080687046051, "learning_rate": 3.0213290710035035e-05, "loss": 0.0473, "step": 18460 }, { "epoch": 4.832711808963036, "grad_norm": 0.4035530090332031, "learning_rate": 3.0193074939285206e-05, "loss": 0.0453, "step": 18470 }, { "epoch": 4.835328753680079, "grad_norm": 0.40414726734161377, "learning_rate": 3.0172855619117612e-05, "loss": 0.0399, "step": 18480 }, { "epoch": 4.837945698397121, "grad_norm": 0.3670026957988739, "learning_rate": 3.0152632763351995e-05, "loss": 0.0447, "step": 18490 }, { "epoch": 4.840562643114164, "grad_norm": 0.4316958487033844, "learning_rate": 3.0132406385810463e-05, "loss": 0.0453, "step": 18500 }, { "epoch": 4.843179587831207, "grad_norm": 0.3881956934928894, "learning_rate": 3.011217650031756e-05, "loss": 0.0478, "step": 18510 }, { "epoch": 4.84579653254825, "grad_norm": 0.513076901435852, "learning_rate": 3.0091943120700233e-05, "loss": 0.0454, "step": 18520 }, { "epoch": 4.848413477265293, "grad_norm": 0.5722450613975525, "learning_rate": 3.0071706260787792e-05, "loss": 0.0418, "step": 18530 }, { "epoch": 4.851030421982336, "grad_norm": 0.4285776913166046, "learning_rate": 3.0051465934411944e-05, "loss": 0.0439, "step": 18540 }, { "epoch": 4.853647366699379, "grad_norm": 0.4009416401386261, "learning_rate": 3.0031222155406763e-05, "loss": 0.047, "step": 18550 }, { "epoch": 4.856264311416421, "grad_norm": 0.35369279980659485, "learning_rate": 3.0010974937608677e-05, "loss": 0.0452, "step": 18560 }, { "epoch": 4.858881256133464, "grad_norm": 0.443196564912796, "learning_rate": 2.9990724294856475e-05, "loss": 0.0441, "step": 18570 }, { "epoch": 4.861498200850507, "grad_norm": 0.5534233450889587, "learning_rate": 2.9970470240991284e-05, "loss": 0.0468, "step": 18580 }, { "epoch": 4.86411514556755, "grad_norm": 0.39392098784446716, "learning_rate": 2.9950212789856535e-05, "loss": 0.0414, "step": 18590 }, { "epoch": 4.866732090284593, "grad_norm": 0.3481430411338806, "learning_rate": 2.9929951955298035e-05, "loss": 0.0465, "step": 18600 }, { "epoch": 4.869349035001636, "grad_norm": 0.4503512680530548, "learning_rate": 2.9909687751163855e-05, "loss": 0.0438, "step": 18610 }, { "epoch": 4.871965979718678, "grad_norm": 0.4893753230571747, "learning_rate": 2.9889420191304397e-05, "loss": 0.0432, "step": 18620 }, { "epoch": 4.874582924435721, "grad_norm": 0.4819509983062744, "learning_rate": 2.9869149289572347e-05, "loss": 0.0426, "step": 18630 }, { "epoch": 4.877199869152764, "grad_norm": 0.47873371839523315, "learning_rate": 2.9848875059822657e-05, "loss": 0.0437, "step": 18640 }, { "epoch": 4.879816813869807, "grad_norm": 0.49186429381370544, "learning_rate": 2.98285975159126e-05, "loss": 0.0362, "step": 18650 }, { "epoch": 4.88243375858685, "grad_norm": 1.002996802330017, "learning_rate": 2.9808316671701658e-05, "loss": 0.0444, "step": 18660 }, { "epoch": 4.885050703303893, "grad_norm": 0.48731791973114014, "learning_rate": 2.978803254105162e-05, "loss": 0.0382, "step": 18670 }, { "epoch": 4.887667648020935, "grad_norm": 0.45587220788002014, "learning_rate": 2.9767745137826487e-05, "loss": 0.0473, "step": 18680 }, { "epoch": 4.890284592737978, "grad_norm": 0.5397971868515015, "learning_rate": 2.9747454475892505e-05, "loss": 0.0456, "step": 18690 }, { "epoch": 4.892901537455021, "grad_norm": 0.6106773018836975, "learning_rate": 2.972716056911816e-05, "loss": 0.0475, "step": 18700 }, { "epoch": 4.895518482172064, "grad_norm": 0.9047291874885559, "learning_rate": 2.9706863431374138e-05, "loss": 0.0441, "step": 18710 }, { "epoch": 4.898135426889107, "grad_norm": 0.6912985444068909, "learning_rate": 2.9686563076533347e-05, "loss": 0.0469, "step": 18720 }, { "epoch": 4.90075237160615, "grad_norm": 0.40746009349823, "learning_rate": 2.9666259518470885e-05, "loss": 0.042, "step": 18730 }, { "epoch": 4.903369316323193, "grad_norm": 0.4547899067401886, "learning_rate": 2.9645952771064035e-05, "loss": 0.0439, "step": 18740 }, { "epoch": 4.905986261040235, "grad_norm": 0.651161789894104, "learning_rate": 2.9625642848192283e-05, "loss": 0.0454, "step": 18750 }, { "epoch": 4.908603205757278, "grad_norm": 0.47897329926490784, "learning_rate": 2.9605329763737254e-05, "loss": 0.0445, "step": 18760 }, { "epoch": 4.911220150474321, "grad_norm": 0.5273807644844055, "learning_rate": 2.958501353158276e-05, "loss": 0.0433, "step": 18770 }, { "epoch": 4.913837095191364, "grad_norm": 0.7630659937858582, "learning_rate": 2.956469416561476e-05, "loss": 0.0457, "step": 18780 }, { "epoch": 4.916454039908407, "grad_norm": 0.6563160419464111, "learning_rate": 2.9544371679721326e-05, "loss": 0.0414, "step": 18790 }, { "epoch": 4.91907098462545, "grad_norm": 0.45346808433532715, "learning_rate": 2.952404608779271e-05, "loss": 0.0418, "step": 18800 }, { "epoch": 4.921687929342493, "grad_norm": 0.3976815938949585, "learning_rate": 2.950371740372125e-05, "loss": 0.0449, "step": 18810 }, { "epoch": 4.924304874059535, "grad_norm": 0.3786884546279907, "learning_rate": 2.9483385641401407e-05, "loss": 0.0416, "step": 18820 }, { "epoch": 4.926921818776578, "grad_norm": 0.4601690173149109, "learning_rate": 2.946305081472976e-05, "loss": 0.0477, "step": 18830 }, { "epoch": 4.929538763493621, "grad_norm": 0.5352820754051208, "learning_rate": 2.9442712937604962e-05, "loss": 0.0455, "step": 18840 }, { "epoch": 4.932155708210664, "grad_norm": 0.3937074840068817, "learning_rate": 2.9422372023927764e-05, "loss": 0.0439, "step": 18850 }, { "epoch": 4.934772652927707, "grad_norm": 0.5810989737510681, "learning_rate": 2.9402028087600992e-05, "loss": 0.0462, "step": 18860 }, { "epoch": 4.937389597644749, "grad_norm": 0.5032179951667786, "learning_rate": 2.938168114252952e-05, "loss": 0.0386, "step": 18870 }, { "epoch": 4.940006542361792, "grad_norm": 0.39301612973213196, "learning_rate": 2.936133120262031e-05, "loss": 0.0412, "step": 18880 }, { "epoch": 4.942623487078835, "grad_norm": 0.39664873480796814, "learning_rate": 2.934097828178235e-05, "loss": 0.0418, "step": 18890 }, { "epoch": 4.945240431795878, "grad_norm": 0.39426127076148987, "learning_rate": 2.9320622393926667e-05, "loss": 0.0425, "step": 18900 }, { "epoch": 4.947857376512921, "grad_norm": 0.27260127663612366, "learning_rate": 2.9300263552966324e-05, "loss": 0.0369, "step": 18910 }, { "epoch": 4.950474321229964, "grad_norm": 0.4988190233707428, "learning_rate": 2.927990177281638e-05, "loss": 0.05, "step": 18920 }, { "epoch": 4.953091265947007, "grad_norm": 0.4185965359210968, "learning_rate": 2.9259537067393937e-05, "loss": 0.0483, "step": 18930 }, { "epoch": 4.955708210664049, "grad_norm": 0.36746054887771606, "learning_rate": 2.923916945061807e-05, "loss": 0.0406, "step": 18940 }, { "epoch": 4.958325155381092, "grad_norm": 0.4041653871536255, "learning_rate": 2.9218798936409868e-05, "loss": 0.0375, "step": 18950 }, { "epoch": 4.960942100098135, "grad_norm": 0.40771132707595825, "learning_rate": 2.9198425538692365e-05, "loss": 0.0432, "step": 18960 }, { "epoch": 4.963559044815178, "grad_norm": 0.5572474598884583, "learning_rate": 2.917804927139059e-05, "loss": 0.0475, "step": 18970 }, { "epoch": 4.966175989532221, "grad_norm": 0.48255524039268494, "learning_rate": 2.915767014843154e-05, "loss": 0.0455, "step": 18980 }, { "epoch": 4.968792934249264, "grad_norm": 0.5636023283004761, "learning_rate": 2.913728818374415e-05, "loss": 0.0423, "step": 18990 }, { "epoch": 4.971409878966307, "grad_norm": 0.48205071687698364, "learning_rate": 2.9116903391259305e-05, "loss": 0.043, "step": 19000 }, { "epoch": 4.971409878966307, "eval_loss": 0.04874729268246368, "eval_runtime": 9.2011, "eval_samples_per_second": 111.291, "eval_steps_per_second": 1.739, "step": 19000 }, { "epoch": 4.974026823683349, "grad_norm": 0.4441853165626526, "learning_rate": 2.90965157849098e-05, "loss": 0.0416, "step": 19010 }, { "epoch": 4.976643768400392, "grad_norm": 0.439701110124588, "learning_rate": 2.907612537863038e-05, "loss": 0.0378, "step": 19020 }, { "epoch": 4.979260713117435, "grad_norm": 0.44340062141418457, "learning_rate": 2.9055732186357716e-05, "loss": 0.0444, "step": 19030 }, { "epoch": 4.981877657834478, "grad_norm": 0.439158171415329, "learning_rate": 2.903533622203033e-05, "loss": 0.0488, "step": 19040 }, { "epoch": 4.984494602551521, "grad_norm": 0.4943934977054596, "learning_rate": 2.9014937499588703e-05, "loss": 0.0457, "step": 19050 }, { "epoch": 4.987111547268564, "grad_norm": 0.47403454780578613, "learning_rate": 2.8994536032975145e-05, "loss": 0.0458, "step": 19060 }, { "epoch": 4.989728491985606, "grad_norm": 0.37442466616630554, "learning_rate": 2.8974131836133865e-05, "loss": 0.0419, "step": 19070 }, { "epoch": 4.992345436702649, "grad_norm": 0.49841246008872986, "learning_rate": 2.8953724923010965e-05, "loss": 0.0449, "step": 19080 }, { "epoch": 4.994962381419692, "grad_norm": 0.33906620740890503, "learning_rate": 2.893331530755436e-05, "loss": 0.0412, "step": 19090 }, { "epoch": 4.997579326136735, "grad_norm": 0.5222365260124207, "learning_rate": 2.8912903003713827e-05, "loss": 0.0493, "step": 19100 }, { "epoch": 5.0, "grad_norm": 0.2986067533493042, "learning_rate": 2.8892488025440982e-05, "loss": 0.0475, "step": 19110 }, { "epoch": 5.002616944717043, "grad_norm": 0.4610765874385834, "learning_rate": 2.8872070386689276e-05, "loss": 0.0459, "step": 19120 }, { "epoch": 5.005233889434086, "grad_norm": 0.47639599442481995, "learning_rate": 2.885165010141398e-05, "loss": 0.043, "step": 19130 }, { "epoch": 5.007850834151129, "grad_norm": 0.5547071695327759, "learning_rate": 2.8831227183572158e-05, "loss": 0.0444, "step": 19140 }, { "epoch": 5.010467778868171, "grad_norm": 0.6149731278419495, "learning_rate": 2.881080164712268e-05, "loss": 0.0496, "step": 19150 }, { "epoch": 5.013084723585214, "grad_norm": 0.3835916221141815, "learning_rate": 2.8790373506026208e-05, "loss": 0.0455, "step": 19160 }, { "epoch": 5.015701668302257, "grad_norm": 0.46660664677619934, "learning_rate": 2.8769942774245186e-05, "loss": 0.0374, "step": 19170 }, { "epoch": 5.0183186130193, "grad_norm": 0.5138661861419678, "learning_rate": 2.874950946574383e-05, "loss": 0.0461, "step": 19180 }, { "epoch": 5.020935557736343, "grad_norm": 0.5261203050613403, "learning_rate": 2.8729073594488104e-05, "loss": 0.0429, "step": 19190 }, { "epoch": 5.023552502453386, "grad_norm": 0.5713613629341125, "learning_rate": 2.870863517444575e-05, "loss": 0.0496, "step": 19200 }, { "epoch": 5.026169447170428, "grad_norm": 0.3695147931575775, "learning_rate": 2.868819421958621e-05, "loss": 0.0424, "step": 19210 }, { "epoch": 5.028786391887471, "grad_norm": 0.3375604450702667, "learning_rate": 2.86677507438807e-05, "loss": 0.0444, "step": 19220 }, { "epoch": 5.031403336604514, "grad_norm": 0.3388157784938812, "learning_rate": 2.8647304761302158e-05, "loss": 0.0431, "step": 19230 }, { "epoch": 5.034020281321557, "grad_norm": 0.3800851106643677, "learning_rate": 2.8626856285825188e-05, "loss": 0.0406, "step": 19240 }, { "epoch": 5.0366372260386, "grad_norm": 0.5340576171875, "learning_rate": 2.8606405331426145e-05, "loss": 0.0456, "step": 19250 }, { "epoch": 5.039254170755643, "grad_norm": 0.5981658101081848, "learning_rate": 2.8585951912083075e-05, "loss": 0.0478, "step": 19260 }, { "epoch": 5.041871115472686, "grad_norm": 0.40278035402297974, "learning_rate": 2.8565496041775674e-05, "loss": 0.0375, "step": 19270 }, { "epoch": 5.044488060189728, "grad_norm": 0.473084419965744, "learning_rate": 2.854503773448537e-05, "loss": 0.0424, "step": 19280 }, { "epoch": 5.047105004906771, "grad_norm": 0.726222813129425, "learning_rate": 2.8524577004195187e-05, "loss": 0.0509, "step": 19290 }, { "epoch": 5.049721949623814, "grad_norm": 0.45083606243133545, "learning_rate": 2.8504113864889855e-05, "loss": 0.0458, "step": 19300 }, { "epoch": 5.052338894340857, "grad_norm": 0.48179444670677185, "learning_rate": 2.848364833055574e-05, "loss": 0.0448, "step": 19310 }, { "epoch": 5.0549558390579, "grad_norm": 0.39184945821762085, "learning_rate": 2.846318041518084e-05, "loss": 0.0458, "step": 19320 }, { "epoch": 5.057572783774943, "grad_norm": 0.5279496908187866, "learning_rate": 2.844271013275479e-05, "loss": 0.0459, "step": 19330 }, { "epoch": 5.060189728491985, "grad_norm": 0.5475168824195862, "learning_rate": 2.8422237497268816e-05, "loss": 0.042, "step": 19340 }, { "epoch": 5.062806673209028, "grad_norm": 0.6053759455680847, "learning_rate": 2.840176252271578e-05, "loss": 0.0444, "step": 19350 }, { "epoch": 5.065423617926071, "grad_norm": 0.44237303733825684, "learning_rate": 2.838128522309015e-05, "loss": 0.04, "step": 19360 }, { "epoch": 5.068040562643114, "grad_norm": 0.5803554058074951, "learning_rate": 2.8360805612387946e-05, "loss": 0.0403, "step": 19370 }, { "epoch": 5.070657507360157, "grad_norm": 0.47988569736480713, "learning_rate": 2.8340323704606797e-05, "loss": 0.0489, "step": 19380 }, { "epoch": 5.0732744520772, "grad_norm": 0.6084237098693848, "learning_rate": 2.8319839513745895e-05, "loss": 0.0451, "step": 19390 }, { "epoch": 5.075891396794242, "grad_norm": 0.4827035367488861, "learning_rate": 2.8299353053805983e-05, "loss": 0.0464, "step": 19400 }, { "epoch": 5.078508341511285, "grad_norm": 0.5168353319168091, "learning_rate": 2.827886433878938e-05, "loss": 0.0414, "step": 19410 }, { "epoch": 5.081125286228328, "grad_norm": 0.44367021322250366, "learning_rate": 2.825837338269991e-05, "loss": 0.042, "step": 19420 }, { "epoch": 5.083742230945371, "grad_norm": 0.5074538588523865, "learning_rate": 2.8237880199542966e-05, "loss": 0.041, "step": 19430 }, { "epoch": 5.086359175662414, "grad_norm": 0.6296460628509521, "learning_rate": 2.8217384803325432e-05, "loss": 0.0481, "step": 19440 }, { "epoch": 5.088976120379457, "grad_norm": 0.34975865483283997, "learning_rate": 2.8196887208055716e-05, "loss": 0.0408, "step": 19450 }, { "epoch": 5.0915930650965, "grad_norm": 0.34852832555770874, "learning_rate": 2.8176387427743755e-05, "loss": 0.0407, "step": 19460 }, { "epoch": 5.094210009813542, "grad_norm": 0.365998238325119, "learning_rate": 2.815588547640093e-05, "loss": 0.0411, "step": 19470 }, { "epoch": 5.096826954530585, "grad_norm": 0.463615357875824, "learning_rate": 2.8135381368040135e-05, "loss": 0.0406, "step": 19480 }, { "epoch": 5.099443899247628, "grad_norm": 0.4203396737575531, "learning_rate": 2.811487511667574e-05, "loss": 0.0433, "step": 19490 }, { "epoch": 5.102060843964671, "grad_norm": 0.36783847212791443, "learning_rate": 2.809436673632358e-05, "loss": 0.0397, "step": 19500 }, { "epoch": 5.104677788681714, "grad_norm": 0.500868558883667, "learning_rate": 2.807385624100094e-05, "loss": 0.0498, "step": 19510 }, { "epoch": 5.107294733398757, "grad_norm": 0.34654349088668823, "learning_rate": 2.8053343644726533e-05, "loss": 0.0444, "step": 19520 }, { "epoch": 5.109911678115799, "grad_norm": 0.45738351345062256, "learning_rate": 2.803282896152054e-05, "loss": 0.0461, "step": 19530 }, { "epoch": 5.112528622832842, "grad_norm": 0.41508185863494873, "learning_rate": 2.8012312205404543e-05, "loss": 0.0419, "step": 19540 }, { "epoch": 5.115145567549885, "grad_norm": 0.36502814292907715, "learning_rate": 2.7991793390401567e-05, "loss": 0.0402, "step": 19550 }, { "epoch": 5.117762512266928, "grad_norm": 0.45576921105384827, "learning_rate": 2.7971272530536025e-05, "loss": 0.0447, "step": 19560 }, { "epoch": 5.120379456983971, "grad_norm": 0.47386133670806885, "learning_rate": 2.7950749639833713e-05, "loss": 0.0392, "step": 19570 }, { "epoch": 5.122996401701014, "grad_norm": 0.3990080952644348, "learning_rate": 2.793022473232185e-05, "loss": 0.0435, "step": 19580 }, { "epoch": 5.125613346418057, "grad_norm": 0.3933347165584564, "learning_rate": 2.7909697822029012e-05, "loss": 0.0406, "step": 19590 }, { "epoch": 5.128230291135099, "grad_norm": 0.34663620591163635, "learning_rate": 2.7889168922985155e-05, "loss": 0.0405, "step": 19600 }, { "epoch": 5.130847235852142, "grad_norm": 0.5836282968521118, "learning_rate": 2.786863804922158e-05, "loss": 0.0423, "step": 19610 }, { "epoch": 5.133464180569185, "grad_norm": 0.49704045057296753, "learning_rate": 2.7848105214770942e-05, "loss": 0.0431, "step": 19620 }, { "epoch": 5.136081125286228, "grad_norm": 0.477346807718277, "learning_rate": 2.7827570433667254e-05, "loss": 0.0441, "step": 19630 }, { "epoch": 5.138698070003271, "grad_norm": 0.44346386194229126, "learning_rate": 2.7807033719945828e-05, "loss": 0.0383, "step": 19640 }, { "epoch": 5.141315014720314, "grad_norm": 0.33207669854164124, "learning_rate": 2.778649508764333e-05, "loss": 0.0432, "step": 19650 }, { "epoch": 5.1439319594373565, "grad_norm": 0.5000958442687988, "learning_rate": 2.7765954550797718e-05, "loss": 0.0418, "step": 19660 }, { "epoch": 5.1465489041543995, "grad_norm": 0.8191132545471191, "learning_rate": 2.7745412123448245e-05, "loss": 0.0474, "step": 19670 }, { "epoch": 5.1491658488714425, "grad_norm": 0.4225716292858124, "learning_rate": 2.7724867819635476e-05, "loss": 0.0465, "step": 19680 }, { "epoch": 5.1517827935884855, "grad_norm": 0.30504903197288513, "learning_rate": 2.7704321653401245e-05, "loss": 0.0372, "step": 19690 }, { "epoch": 5.1543997383055284, "grad_norm": 0.47394004464149475, "learning_rate": 2.7683773638788664e-05, "loss": 0.0409, "step": 19700 }, { "epoch": 5.157016683022571, "grad_norm": 0.4267828166484833, "learning_rate": 2.766322378984211e-05, "loss": 0.0377, "step": 19710 }, { "epoch": 5.159633627739614, "grad_norm": 0.48577219247817993, "learning_rate": 2.7642672120607204e-05, "loss": 0.0382, "step": 19720 }, { "epoch": 5.1622505724566565, "grad_norm": 0.45582279562950134, "learning_rate": 2.7622118645130823e-05, "loss": 0.0421, "step": 19730 }, { "epoch": 5.1648675171736995, "grad_norm": 0.3249519169330597, "learning_rate": 2.7601563377461082e-05, "loss": 0.0429, "step": 19740 }, { "epoch": 5.1674844618907425, "grad_norm": 0.4997425079345703, "learning_rate": 2.7581006331647292e-05, "loss": 0.0452, "step": 19750 }, { "epoch": 5.1701014066077855, "grad_norm": 0.594337522983551, "learning_rate": 2.7560447521740017e-05, "loss": 0.0417, "step": 19760 }, { "epoch": 5.1727183513248285, "grad_norm": 0.49194571375846863, "learning_rate": 2.7539886961791e-05, "loss": 0.042, "step": 19770 }, { "epoch": 5.1753352960418715, "grad_norm": 0.40315955877304077, "learning_rate": 2.75193246658532e-05, "loss": 0.0442, "step": 19780 }, { "epoch": 5.177952240758914, "grad_norm": 0.5103853940963745, "learning_rate": 2.749876064798075e-05, "loss": 0.0455, "step": 19790 }, { "epoch": 5.180569185475957, "grad_norm": 0.40776246786117554, "learning_rate": 2.7478194922228952e-05, "loss": 0.0432, "step": 19800 }, { "epoch": 5.183186130193, "grad_norm": 0.4205770194530487, "learning_rate": 2.7457627502654294e-05, "loss": 0.0435, "step": 19810 }, { "epoch": 5.1858030749100426, "grad_norm": 0.3220876157283783, "learning_rate": 2.743705840331441e-05, "loss": 0.0417, "step": 19820 }, { "epoch": 5.1884200196270855, "grad_norm": 0.495869904756546, "learning_rate": 2.741648763826809e-05, "loss": 0.0426, "step": 19830 }, { "epoch": 5.1910369643441285, "grad_norm": 0.45162034034729004, "learning_rate": 2.7395915221575258e-05, "loss": 0.0414, "step": 19840 }, { "epoch": 5.193653909061171, "grad_norm": 0.34791964292526245, "learning_rate": 2.737534116729696e-05, "loss": 0.0408, "step": 19850 }, { "epoch": 5.196270853778214, "grad_norm": 0.4020437002182007, "learning_rate": 2.7354765489495375e-05, "loss": 0.0442, "step": 19860 }, { "epoch": 5.198887798495257, "grad_norm": 0.4706566035747528, "learning_rate": 2.733418820223378e-05, "loss": 0.0433, "step": 19870 }, { "epoch": 5.2015047432123, "grad_norm": 0.3812898099422455, "learning_rate": 2.731360931957656e-05, "loss": 0.0418, "step": 19880 }, { "epoch": 5.204121687929343, "grad_norm": 0.48587170243263245, "learning_rate": 2.7293028855589187e-05, "loss": 0.0423, "step": 19890 }, { "epoch": 5.206738632646386, "grad_norm": 0.5795092582702637, "learning_rate": 2.727244682433821e-05, "loss": 0.0441, "step": 19900 }, { "epoch": 5.209355577363429, "grad_norm": 0.41178473830223083, "learning_rate": 2.7251863239891262e-05, "loss": 0.0452, "step": 19910 }, { "epoch": 5.211972522080471, "grad_norm": 0.47530198097229004, "learning_rate": 2.7231278116317015e-05, "loss": 0.0406, "step": 19920 }, { "epoch": 5.214589466797514, "grad_norm": 0.3284004330635071, "learning_rate": 2.7210691467685222e-05, "loss": 0.0453, "step": 19930 }, { "epoch": 5.217206411514557, "grad_norm": 0.5120458602905273, "learning_rate": 2.7190103308066656e-05, "loss": 0.0456, "step": 19940 }, { "epoch": 5.2198233562316, "grad_norm": 0.500260055065155, "learning_rate": 2.7169513651533125e-05, "loss": 0.0352, "step": 19950 }, { "epoch": 5.222440300948643, "grad_norm": 0.5112075209617615, "learning_rate": 2.7148922512157476e-05, "loss": 0.0463, "step": 19960 }, { "epoch": 5.225057245665686, "grad_norm": 0.516282320022583, "learning_rate": 2.712832990401355e-05, "loss": 0.0425, "step": 19970 }, { "epoch": 5.227674190382728, "grad_norm": 0.48279473185539246, "learning_rate": 2.7107735841176206e-05, "loss": 0.0413, "step": 19980 }, { "epoch": 5.230291135099771, "grad_norm": 0.5420926809310913, "learning_rate": 2.708714033772129e-05, "loss": 0.0386, "step": 19990 }, { "epoch": 5.232908079816814, "grad_norm": 0.4328511357307434, "learning_rate": 2.706654340772563e-05, "loss": 0.0395, "step": 20000 }, { "epoch": 5.232908079816814, "eval_loss": 0.04762020907485075, "eval_runtime": 8.9323, "eval_samples_per_second": 114.64, "eval_steps_per_second": 1.791, "step": 20000 }, { "epoch": 5.235525024533857, "grad_norm": 0.6541851162910461, "learning_rate": 2.704594506526704e-05, "loss": 0.0422, "step": 20010 }, { "epoch": 5.2381419692509, "grad_norm": 0.474886953830719, "learning_rate": 2.7025345324424288e-05, "loss": 0.0378, "step": 20020 }, { "epoch": 5.240758913967943, "grad_norm": 0.35807138681411743, "learning_rate": 2.7004744199277104e-05, "loss": 0.0415, "step": 20030 }, { "epoch": 5.243375858684986, "grad_norm": 0.5502864718437195, "learning_rate": 2.698414170390617e-05, "loss": 0.0424, "step": 20040 }, { "epoch": 5.245992803402028, "grad_norm": 0.36918938159942627, "learning_rate": 2.6963537852393085e-05, "loss": 0.0387, "step": 20050 }, { "epoch": 5.248609748119071, "grad_norm": 0.4112201929092407, "learning_rate": 2.694293265882039e-05, "loss": 0.0343, "step": 20060 }, { "epoch": 5.251226692836114, "grad_norm": 0.5169610977172852, "learning_rate": 2.6922326137271555e-05, "loss": 0.0415, "step": 20070 }, { "epoch": 5.253843637553157, "grad_norm": 0.3329308032989502, "learning_rate": 2.690171830183092e-05, "loss": 0.041, "step": 20080 }, { "epoch": 5.2564605822702, "grad_norm": 0.5257441401481628, "learning_rate": 2.688110916658376e-05, "loss": 0.0442, "step": 20090 }, { "epoch": 5.259077526987243, "grad_norm": 0.3802300691604614, "learning_rate": 2.6860498745616218e-05, "loss": 0.035, "step": 20100 }, { "epoch": 5.261694471704285, "grad_norm": 0.4359605610370636, "learning_rate": 2.683988705301534e-05, "loss": 0.0405, "step": 20110 }, { "epoch": 5.264311416421328, "grad_norm": 0.42070209980010986, "learning_rate": 2.6819274102869002e-05, "loss": 0.0402, "step": 20120 }, { "epoch": 5.266928361138371, "grad_norm": 0.4507392644882202, "learning_rate": 2.679865990926597e-05, "loss": 0.0426, "step": 20130 }, { "epoch": 5.269545305855414, "grad_norm": 0.6350913643836975, "learning_rate": 2.677804448629585e-05, "loss": 0.0427, "step": 20140 }, { "epoch": 5.272162250572457, "grad_norm": 0.7911574840545654, "learning_rate": 2.6757427848049088e-05, "loss": 0.0391, "step": 20150 }, { "epoch": 5.2747791952895, "grad_norm": 0.5109353065490723, "learning_rate": 2.673681000861697e-05, "loss": 0.0464, "step": 20160 }, { "epoch": 5.277396140006543, "grad_norm": 0.5190655589103699, "learning_rate": 2.6716190982091588e-05, "loss": 0.042, "step": 20170 }, { "epoch": 5.280013084723585, "grad_norm": 0.3754146099090576, "learning_rate": 2.6695570782565843e-05, "loss": 0.0466, "step": 20180 }, { "epoch": 5.282630029440628, "grad_norm": 0.4404628872871399, "learning_rate": 2.6674949424133468e-05, "loss": 0.042, "step": 20190 }, { "epoch": 5.285246974157671, "grad_norm": 0.30781108140945435, "learning_rate": 2.6654326920888946e-05, "loss": 0.0377, "step": 20200 }, { "epoch": 5.287863918874714, "grad_norm": 0.4061555862426758, "learning_rate": 2.6633703286927576e-05, "loss": 0.0426, "step": 20210 }, { "epoch": 5.290480863591757, "grad_norm": 0.4692903757095337, "learning_rate": 2.6613078536345414e-05, "loss": 0.0476, "step": 20220 }, { "epoch": 5.2930978083088, "grad_norm": 0.46763700246810913, "learning_rate": 2.659245268323928e-05, "loss": 0.0456, "step": 20230 }, { "epoch": 5.295714753025842, "grad_norm": 0.6062338948249817, "learning_rate": 2.6571825741706762e-05, "loss": 0.04, "step": 20240 }, { "epoch": 5.298331697742885, "grad_norm": 0.4335156977176666, "learning_rate": 2.655119772584616e-05, "loss": 0.0408, "step": 20250 }, { "epoch": 5.300948642459928, "grad_norm": 0.4853779077529907, "learning_rate": 2.653056864975655e-05, "loss": 0.0441, "step": 20260 }, { "epoch": 5.303565587176971, "grad_norm": 0.17079606652259827, "learning_rate": 2.65099385275377e-05, "loss": 0.0404, "step": 20270 }, { "epoch": 5.306182531894014, "grad_norm": 0.32185569405555725, "learning_rate": 2.6489307373290096e-05, "loss": 0.039, "step": 20280 }, { "epoch": 5.308799476611057, "grad_norm": 0.3419385254383087, "learning_rate": 2.646867520111495e-05, "loss": 0.0381, "step": 20290 }, { "epoch": 5.311416421328099, "grad_norm": 0.5202774405479431, "learning_rate": 2.644804202511415e-05, "loss": 0.0473, "step": 20300 }, { "epoch": 5.314033366045142, "grad_norm": 0.416843056678772, "learning_rate": 2.642740785939028e-05, "loss": 0.04, "step": 20310 }, { "epoch": 5.316650310762185, "grad_norm": 0.47133147716522217, "learning_rate": 2.6406772718046603e-05, "loss": 0.0404, "step": 20320 }, { "epoch": 5.319267255479228, "grad_norm": 0.37643080949783325, "learning_rate": 2.638613661518703e-05, "loss": 0.038, "step": 20330 }, { "epoch": 5.321884200196271, "grad_norm": 0.5968944430351257, "learning_rate": 2.6365499564916163e-05, "loss": 0.04, "step": 20340 }, { "epoch": 5.324501144913314, "grad_norm": 0.4017389416694641, "learning_rate": 2.6344861581339216e-05, "loss": 0.0424, "step": 20350 }, { "epoch": 5.327118089630357, "grad_norm": 0.5358704924583435, "learning_rate": 2.632422267856205e-05, "loss": 0.039, "step": 20360 }, { "epoch": 5.329735034347399, "grad_norm": 0.39304521679878235, "learning_rate": 2.6303582870691175e-05, "loss": 0.0434, "step": 20370 }, { "epoch": 5.332351979064442, "grad_norm": 0.3788582384586334, "learning_rate": 2.6282942171833695e-05, "loss": 0.0467, "step": 20380 }, { "epoch": 5.334968923781485, "grad_norm": 0.6729533672332764, "learning_rate": 2.626230059609735e-05, "loss": 0.0419, "step": 20390 }, { "epoch": 5.337585868498528, "grad_norm": 0.5997945070266724, "learning_rate": 2.6241658157590444e-05, "loss": 0.047, "step": 20400 }, { "epoch": 5.340202813215571, "grad_norm": 0.5033451318740845, "learning_rate": 2.6221014870421895e-05, "loss": 0.0393, "step": 20410 }, { "epoch": 5.342819757932614, "grad_norm": 0.5576150417327881, "learning_rate": 2.6200370748701196e-05, "loss": 0.034, "step": 20420 }, { "epoch": 5.345436702649657, "grad_norm": 0.6364089846611023, "learning_rate": 2.6179725806538407e-05, "loss": 0.0383, "step": 20430 }, { "epoch": 5.348053647366699, "grad_norm": 0.5496673583984375, "learning_rate": 2.615908005804416e-05, "loss": 0.0428, "step": 20440 }, { "epoch": 5.350670592083742, "grad_norm": 0.3670984208583832, "learning_rate": 2.613843351732962e-05, "loss": 0.0398, "step": 20450 }, { "epoch": 5.353287536800785, "grad_norm": 0.5018607974052429, "learning_rate": 2.61177861985065e-05, "loss": 0.0393, "step": 20460 }, { "epoch": 5.355904481517828, "grad_norm": 0.40461036562919617, "learning_rate": 2.6097138115687057e-05, "loss": 0.0376, "step": 20470 }, { "epoch": 5.358521426234871, "grad_norm": 0.5538284778594971, "learning_rate": 2.607648928298405e-05, "loss": 0.0434, "step": 20480 }, { "epoch": 5.361138370951914, "grad_norm": 0.40360862016677856, "learning_rate": 2.6055839714510782e-05, "loss": 0.0433, "step": 20490 }, { "epoch": 5.363755315668956, "grad_norm": 0.419687956571579, "learning_rate": 2.6035189424381024e-05, "loss": 0.0416, "step": 20500 }, { "epoch": 5.366372260385999, "grad_norm": 0.4900745451450348, "learning_rate": 2.6014538426709046e-05, "loss": 0.0361, "step": 20510 }, { "epoch": 5.368989205103042, "grad_norm": 0.48524007201194763, "learning_rate": 2.599388673560963e-05, "loss": 0.0398, "step": 20520 }, { "epoch": 5.371606149820085, "grad_norm": 0.5027137994766235, "learning_rate": 2.597323436519799e-05, "loss": 0.0391, "step": 20530 }, { "epoch": 5.374223094537128, "grad_norm": 0.31066951155662537, "learning_rate": 2.5952581329589848e-05, "loss": 0.0432, "step": 20540 }, { "epoch": 5.376840039254171, "grad_norm": 0.5412424206733704, "learning_rate": 2.593192764290135e-05, "loss": 0.0409, "step": 20550 }, { "epoch": 5.379456983971213, "grad_norm": 0.3911846876144409, "learning_rate": 2.591127331924909e-05, "loss": 0.0382, "step": 20560 }, { "epoch": 5.382073928688256, "grad_norm": 0.31931638717651367, "learning_rate": 2.5890618372750115e-05, "loss": 0.0428, "step": 20570 }, { "epoch": 5.384690873405299, "grad_norm": 0.5206882953643799, "learning_rate": 2.5869962817521876e-05, "loss": 0.0442, "step": 20580 }, { "epoch": 5.387307818122342, "grad_norm": 0.5111570954322815, "learning_rate": 2.5849306667682255e-05, "loss": 0.041, "step": 20590 }, { "epoch": 5.389924762839385, "grad_norm": 0.42905184626579285, "learning_rate": 2.5828649937349535e-05, "loss": 0.0406, "step": 20600 }, { "epoch": 5.392541707556428, "grad_norm": 0.5467479228973389, "learning_rate": 2.580799264064239e-05, "loss": 0.0422, "step": 20610 }, { "epoch": 5.395158652273471, "grad_norm": 0.5880939960479736, "learning_rate": 2.5787334791679906e-05, "loss": 0.0384, "step": 20620 }, { "epoch": 5.397775596990513, "grad_norm": 0.7603851556777954, "learning_rate": 2.5766676404581512e-05, "loss": 0.0398, "step": 20630 }, { "epoch": 5.400392541707556, "grad_norm": 0.5429633855819702, "learning_rate": 2.5746017493467023e-05, "loss": 0.0418, "step": 20640 }, { "epoch": 5.403009486424599, "grad_norm": 0.3459802269935608, "learning_rate": 2.5725358072456612e-05, "loss": 0.0374, "step": 20650 }, { "epoch": 5.405626431141642, "grad_norm": 0.5180562138557434, "learning_rate": 2.5704698155670797e-05, "loss": 0.042, "step": 20660 }, { "epoch": 5.408243375858685, "grad_norm": 0.4653855264186859, "learning_rate": 2.5684037757230444e-05, "loss": 0.0393, "step": 20670 }, { "epoch": 5.410860320575728, "grad_norm": 0.34826746582984924, "learning_rate": 2.566337689125673e-05, "loss": 0.0391, "step": 20680 }, { "epoch": 5.41347726529277, "grad_norm": 0.43290433287620544, "learning_rate": 2.5642715571871162e-05, "loss": 0.0443, "step": 20690 }, { "epoch": 5.416094210009813, "grad_norm": 0.3276048004627228, "learning_rate": 2.5622053813195568e-05, "loss": 0.0412, "step": 20700 }, { "epoch": 5.418711154726856, "grad_norm": 0.3624131381511688, "learning_rate": 2.560139162935205e-05, "loss": 0.0422, "step": 20710 }, { "epoch": 5.421328099443899, "grad_norm": 0.4569075107574463, "learning_rate": 2.5580729034463036e-05, "loss": 0.0422, "step": 20720 }, { "epoch": 5.423945044160942, "grad_norm": 0.35856783390045166, "learning_rate": 2.5560066042651192e-05, "loss": 0.0433, "step": 20730 }, { "epoch": 5.426561988877985, "grad_norm": 0.5207006335258484, "learning_rate": 2.553940266803949e-05, "loss": 0.0404, "step": 20740 }, { "epoch": 5.429178933595027, "grad_norm": 0.5183231234550476, "learning_rate": 2.5518738924751155e-05, "loss": 0.0358, "step": 20750 }, { "epoch": 5.43179587831207, "grad_norm": 0.450226753950119, "learning_rate": 2.549807482690965e-05, "loss": 0.0376, "step": 20760 }, { "epoch": 5.434412823029113, "grad_norm": 0.42255696654319763, "learning_rate": 2.547741038863871e-05, "loss": 0.0359, "step": 20770 }, { "epoch": 5.437029767746156, "grad_norm": 0.6406026482582092, "learning_rate": 2.545674562406226e-05, "loss": 0.0421, "step": 20780 }, { "epoch": 5.439646712463199, "grad_norm": 0.3330078721046448, "learning_rate": 2.5436080547304485e-05, "loss": 0.0397, "step": 20790 }, { "epoch": 5.442263657180242, "grad_norm": 0.32984280586242676, "learning_rate": 2.541541517248977e-05, "loss": 0.0357, "step": 20800 }, { "epoch": 5.444880601897285, "grad_norm": 0.2796110510826111, "learning_rate": 2.53947495137427e-05, "loss": 0.0369, "step": 20810 }, { "epoch": 5.447497546614327, "grad_norm": 0.3588564991950989, "learning_rate": 2.537408358518807e-05, "loss": 0.0405, "step": 20820 }, { "epoch": 5.45011449133137, "grad_norm": 0.41805946826934814, "learning_rate": 2.5353417400950825e-05, "loss": 0.0419, "step": 20830 }, { "epoch": 5.452731436048413, "grad_norm": 0.25932222604751587, "learning_rate": 2.5332750975156115e-05, "loss": 0.0359, "step": 20840 }, { "epoch": 5.455348380765456, "grad_norm": 0.37364596128463745, "learning_rate": 2.531208432192926e-05, "loss": 0.041, "step": 20850 }, { "epoch": 5.457965325482499, "grad_norm": 0.47812214493751526, "learning_rate": 2.529141745539571e-05, "loss": 0.0375, "step": 20860 }, { "epoch": 5.460582270199542, "grad_norm": 0.6243589520454407, "learning_rate": 2.527075038968108e-05, "loss": 0.0466, "step": 20870 }, { "epoch": 5.463199214916585, "grad_norm": 0.45652157068252563, "learning_rate": 2.5250083138911107e-05, "loss": 0.0359, "step": 20880 }, { "epoch": 5.465816159633627, "grad_norm": 0.4943414330482483, "learning_rate": 2.5229415717211667e-05, "loss": 0.0402, "step": 20890 }, { "epoch": 5.46843310435067, "grad_norm": 0.6210593581199646, "learning_rate": 2.5208748138708753e-05, "loss": 0.0427, "step": 20900 }, { "epoch": 5.471050049067713, "grad_norm": 0.391830712556839, "learning_rate": 2.5188080417528454e-05, "loss": 0.0438, "step": 20910 }, { "epoch": 5.473666993784756, "grad_norm": 0.4893028736114502, "learning_rate": 2.5167412567796968e-05, "loss": 0.0433, "step": 20920 }, { "epoch": 5.476283938501799, "grad_norm": 0.5769691467285156, "learning_rate": 2.5146744603640555e-05, "loss": 0.0374, "step": 20930 }, { "epoch": 5.478900883218842, "grad_norm": 0.4640278220176697, "learning_rate": 2.5126076539185593e-05, "loss": 0.0423, "step": 20940 }, { "epoch": 5.4815178279358845, "grad_norm": 0.535298228263855, "learning_rate": 2.510540838855852e-05, "loss": 0.0405, "step": 20950 }, { "epoch": 5.4841347726529275, "grad_norm": 0.5258188247680664, "learning_rate": 2.5084740165885795e-05, "loss": 0.0381, "step": 20960 }, { "epoch": 5.4867517173699705, "grad_norm": 0.3611849546432495, "learning_rate": 2.5064071885293964e-05, "loss": 0.0448, "step": 20970 }, { "epoch": 5.4893686620870135, "grad_norm": 0.693965494632721, "learning_rate": 2.5043403560909605e-05, "loss": 0.0463, "step": 20980 }, { "epoch": 5.4919856068040565, "grad_norm": 0.4176058769226074, "learning_rate": 2.5022735206859323e-05, "loss": 0.0416, "step": 20990 }, { "epoch": 5.494602551521099, "grad_norm": 0.3145519495010376, "learning_rate": 2.500206683726975e-05, "loss": 0.0377, "step": 21000 }, { "epoch": 5.494602551521099, "eval_loss": 0.04668252001432974, "eval_runtime": 9.0331, "eval_samples_per_second": 113.36, "eval_steps_per_second": 1.771, "step": 21000 }, { "epoch": 5.4972194962381415, "grad_norm": 0.4744648337364197, "learning_rate": 2.4981398466267496e-05, "loss": 0.0433, "step": 21010 }, { "epoch": 5.4998364409551845, "grad_norm": 0.48282021284103394, "learning_rate": 2.4960730107979233e-05, "loss": 0.0415, "step": 21020 }, { "epoch": 5.5024533856722275, "grad_norm": 0.34886685013771057, "learning_rate": 2.4940061776531565e-05, "loss": 0.0409, "step": 21030 }, { "epoch": 5.5050703303892705, "grad_norm": 0.5502273440361023, "learning_rate": 2.49193934860511e-05, "loss": 0.0451, "step": 21040 }, { "epoch": 5.5076872751063135, "grad_norm": 0.33859339356422424, "learning_rate": 2.4898725250664433e-05, "loss": 0.0389, "step": 21050 }, { "epoch": 5.5103042198233565, "grad_norm": 0.25911879539489746, "learning_rate": 2.48780570844981e-05, "loss": 0.036, "step": 21060 }, { "epoch": 5.5129211645403995, "grad_norm": 0.2707931697368622, "learning_rate": 2.4857389001678606e-05, "loss": 0.0385, "step": 21070 }, { "epoch": 5.515538109257442, "grad_norm": 0.3240604102611542, "learning_rate": 2.4836721016332374e-05, "loss": 0.0399, "step": 21080 }, { "epoch": 5.518155053974485, "grad_norm": 0.551508903503418, "learning_rate": 2.4816053142585792e-05, "loss": 0.0388, "step": 21090 }, { "epoch": 5.520771998691528, "grad_norm": 0.43382567167282104, "learning_rate": 2.4795385394565154e-05, "loss": 0.0383, "step": 21100 }, { "epoch": 5.523388943408571, "grad_norm": 0.4543988108634949, "learning_rate": 2.4774717786396666e-05, "loss": 0.039, "step": 21110 }, { "epoch": 5.5260058881256136, "grad_norm": 0.45710262656211853, "learning_rate": 2.4754050332206458e-05, "loss": 0.0348, "step": 21120 }, { "epoch": 5.5286228328426565, "grad_norm": 0.3298129439353943, "learning_rate": 2.4733383046120523e-05, "loss": 0.0423, "step": 21130 }, { "epoch": 5.5312397775596995, "grad_norm": 0.5027996897697449, "learning_rate": 2.471271594226476e-05, "loss": 0.0369, "step": 21140 }, { "epoch": 5.533856722276742, "grad_norm": 0.49993273615837097, "learning_rate": 2.469204903476494e-05, "loss": 0.0426, "step": 21150 }, { "epoch": 5.536473666993785, "grad_norm": 0.5039677619934082, "learning_rate": 2.46713823377467e-05, "loss": 0.0372, "step": 21160 }, { "epoch": 5.539090611710828, "grad_norm": 0.5910190939903259, "learning_rate": 2.465071586533554e-05, "loss": 0.0405, "step": 21170 }, { "epoch": 5.541707556427871, "grad_norm": 0.5953308939933777, "learning_rate": 2.4630049631656782e-05, "loss": 0.0379, "step": 21180 }, { "epoch": 5.544324501144914, "grad_norm": 0.31143149733543396, "learning_rate": 2.4609383650835616e-05, "loss": 0.0414, "step": 21190 }, { "epoch": 5.546941445861956, "grad_norm": 0.3030470609664917, "learning_rate": 2.4588717936997045e-05, "loss": 0.0434, "step": 21200 }, { "epoch": 5.549558390578999, "grad_norm": 0.2863839566707611, "learning_rate": 2.456805250426589e-05, "loss": 0.0352, "step": 21210 }, { "epoch": 5.552175335296042, "grad_norm": 0.37792912125587463, "learning_rate": 2.454738736676677e-05, "loss": 0.038, "step": 21220 }, { "epoch": 5.554792280013085, "grad_norm": 0.3757082521915436, "learning_rate": 2.4526722538624118e-05, "loss": 0.0414, "step": 21230 }, { "epoch": 5.557409224730128, "grad_norm": 0.3112926185131073, "learning_rate": 2.4506058033962146e-05, "loss": 0.0403, "step": 21240 }, { "epoch": 5.560026169447171, "grad_norm": 0.30436331033706665, "learning_rate": 2.448539386690485e-05, "loss": 0.0412, "step": 21250 }, { "epoch": 5.562643114164214, "grad_norm": 0.30948469042778015, "learning_rate": 2.4464730051575994e-05, "loss": 0.0351, "step": 21260 }, { "epoch": 5.565260058881256, "grad_norm": 0.3362123668193817, "learning_rate": 2.4444066602099102e-05, "loss": 0.0367, "step": 21270 }, { "epoch": 5.567877003598299, "grad_norm": 0.49534282088279724, "learning_rate": 2.4423403532597443e-05, "loss": 0.0352, "step": 21280 }, { "epoch": 5.570493948315342, "grad_norm": 0.4199749529361725, "learning_rate": 2.440274085719403e-05, "loss": 0.0385, "step": 21290 }, { "epoch": 5.573110893032385, "grad_norm": 0.4081372618675232, "learning_rate": 2.4382078590011622e-05, "loss": 0.0405, "step": 21300 }, { "epoch": 5.575727837749428, "grad_norm": 0.26627975702285767, "learning_rate": 2.4361416745172665e-05, "loss": 0.0434, "step": 21310 }, { "epoch": 5.578344782466471, "grad_norm": 0.43419206142425537, "learning_rate": 2.4340755336799337e-05, "loss": 0.0365, "step": 21320 }, { "epoch": 5.580961727183514, "grad_norm": 0.6178485155105591, "learning_rate": 2.4320094379013523e-05, "loss": 0.0397, "step": 21330 }, { "epoch": 5.583578671900556, "grad_norm": 0.3559263050556183, "learning_rate": 2.4299433885936784e-05, "loss": 0.0401, "step": 21340 }, { "epoch": 5.586195616617599, "grad_norm": 0.5083751082420349, "learning_rate": 2.4278773871690386e-05, "loss": 0.0426, "step": 21350 }, { "epoch": 5.588812561334642, "grad_norm": 0.5449803471565247, "learning_rate": 2.425811435039524e-05, "loss": 0.0408, "step": 21360 }, { "epoch": 5.591429506051685, "grad_norm": 0.5429864525794983, "learning_rate": 2.4237455336171944e-05, "loss": 0.0424, "step": 21370 }, { "epoch": 5.594046450768728, "grad_norm": 0.32913637161254883, "learning_rate": 2.421679684314073e-05, "loss": 0.0377, "step": 21380 }, { "epoch": 5.596663395485771, "grad_norm": 0.2501143217086792, "learning_rate": 2.4196138885421488e-05, "loss": 0.0342, "step": 21390 }, { "epoch": 5.599280340202813, "grad_norm": 0.3179362714290619, "learning_rate": 2.417548147713375e-05, "loss": 0.041, "step": 21400 }, { "epoch": 5.601897284919856, "grad_norm": 0.4324061870574951, "learning_rate": 2.4154824632396645e-05, "loss": 0.0394, "step": 21410 }, { "epoch": 5.604514229636899, "grad_norm": 0.49988454580307007, "learning_rate": 2.4134168365328925e-05, "loss": 0.0433, "step": 21420 }, { "epoch": 5.607131174353942, "grad_norm": 0.5067137479782104, "learning_rate": 2.411351269004897e-05, "loss": 0.0398, "step": 21430 }, { "epoch": 5.609748119070985, "grad_norm": 0.5921526551246643, "learning_rate": 2.4092857620674725e-05, "loss": 0.0411, "step": 21440 }, { "epoch": 5.612365063788028, "grad_norm": 0.3700776994228363, "learning_rate": 2.4072203171323748e-05, "loss": 0.0352, "step": 21450 }, { "epoch": 5.61498200850507, "grad_norm": 0.3668251633644104, "learning_rate": 2.405154935611315e-05, "loss": 0.0357, "step": 21460 }, { "epoch": 5.617598953222113, "grad_norm": 0.7859848141670227, "learning_rate": 2.403089618915963e-05, "loss": 0.0422, "step": 21470 }, { "epoch": 5.620215897939156, "grad_norm": 0.4612215459346771, "learning_rate": 2.401024368457942e-05, "loss": 0.0441, "step": 21480 }, { "epoch": 5.622832842656199, "grad_norm": 0.46146160364151, "learning_rate": 2.398959185648833e-05, "loss": 0.0383, "step": 21490 }, { "epoch": 5.625449787373242, "grad_norm": 0.5035638213157654, "learning_rate": 2.396894071900167e-05, "loss": 0.0337, "step": 21500 }, { "epoch": 5.628066732090285, "grad_norm": 0.37995830178260803, "learning_rate": 2.394829028623431e-05, "loss": 0.04, "step": 21510 }, { "epoch": 5.630683676807328, "grad_norm": 0.5449048280715942, "learning_rate": 2.3927640572300613e-05, "loss": 0.0452, "step": 21520 }, { "epoch": 5.63330062152437, "grad_norm": 0.4563062787055969, "learning_rate": 2.3906991591314485e-05, "loss": 0.0381, "step": 21530 }, { "epoch": 5.635917566241413, "grad_norm": 0.4141029417514801, "learning_rate": 2.388634335738929e-05, "loss": 0.0419, "step": 21540 }, { "epoch": 5.638534510958456, "grad_norm": 0.4503481984138489, "learning_rate": 2.386569588463791e-05, "loss": 0.0389, "step": 21550 }, { "epoch": 5.641151455675499, "grad_norm": 0.5653615593910217, "learning_rate": 2.3845049187172696e-05, "loss": 0.0381, "step": 21560 }, { "epoch": 5.643768400392542, "grad_norm": 0.36770060658454895, "learning_rate": 2.3824403279105474e-05, "loss": 0.0375, "step": 21570 }, { "epoch": 5.646385345109585, "grad_norm": 0.42274710536003113, "learning_rate": 2.380375817454754e-05, "loss": 0.0384, "step": 21580 }, { "epoch": 5.649002289826628, "grad_norm": 0.443455308675766, "learning_rate": 2.3783113887609595e-05, "loss": 0.0414, "step": 21590 }, { "epoch": 5.65161923454367, "grad_norm": 0.3747701644897461, "learning_rate": 2.376247043240184e-05, "loss": 0.0345, "step": 21600 }, { "epoch": 5.654236179260713, "grad_norm": 0.6288304924964905, "learning_rate": 2.3741827823033872e-05, "loss": 0.0398, "step": 21610 }, { "epoch": 5.656853123977756, "grad_norm": 0.29721206426620483, "learning_rate": 2.372118607361472e-05, "loss": 0.0399, "step": 21620 }, { "epoch": 5.659470068694799, "grad_norm": 0.4514649510383606, "learning_rate": 2.3700545198252836e-05, "loss": 0.0396, "step": 21630 }, { "epoch": 5.662087013411842, "grad_norm": 0.5440108180046082, "learning_rate": 2.367990521105605e-05, "loss": 0.0339, "step": 21640 }, { "epoch": 5.664703958128884, "grad_norm": 0.4818888008594513, "learning_rate": 2.365926612613161e-05, "loss": 0.0411, "step": 21650 }, { "epoch": 5.667320902845927, "grad_norm": 0.3079739511013031, "learning_rate": 2.3638627957586124e-05, "loss": 0.0367, "step": 21660 }, { "epoch": 5.66993784756297, "grad_norm": 0.4219807982444763, "learning_rate": 2.3617990719525594e-05, "loss": 0.0349, "step": 21670 }, { "epoch": 5.672554792280013, "grad_norm": 0.3461092710494995, "learning_rate": 2.3597354426055383e-05, "loss": 0.0423, "step": 21680 }, { "epoch": 5.675171736997056, "grad_norm": 0.3110405206680298, "learning_rate": 2.3576719091280193e-05, "loss": 0.033, "step": 21690 }, { "epoch": 5.677788681714099, "grad_norm": 0.6025128364562988, "learning_rate": 2.3556084729304074e-05, "loss": 0.0418, "step": 21700 }, { "epoch": 5.680405626431142, "grad_norm": 0.46335169672966003, "learning_rate": 2.353545135423044e-05, "loss": 0.0355, "step": 21710 }, { "epoch": 5.683022571148184, "grad_norm": 0.4924328625202179, "learning_rate": 2.3514818980161986e-05, "loss": 0.0411, "step": 21720 }, { "epoch": 5.685639515865227, "grad_norm": 0.3370501399040222, "learning_rate": 2.3494187621200757e-05, "loss": 0.0396, "step": 21730 }, { "epoch": 5.68825646058227, "grad_norm": 0.44423383474349976, "learning_rate": 2.347355729144809e-05, "loss": 0.0405, "step": 21740 }, { "epoch": 5.690873405299313, "grad_norm": 0.3318318724632263, "learning_rate": 2.3452928005004623e-05, "loss": 0.0389, "step": 21750 }, { "epoch": 5.693490350016356, "grad_norm": 0.5180429220199585, "learning_rate": 2.3432299775970274e-05, "loss": 0.0389, "step": 21760 }, { "epoch": 5.696107294733399, "grad_norm": 0.3516453504562378, "learning_rate": 2.3411672618444252e-05, "loss": 0.0374, "step": 21770 }, { "epoch": 5.698724239450442, "grad_norm": 0.4864489734172821, "learning_rate": 2.339104654652501e-05, "loss": 0.0367, "step": 21780 }, { "epoch": 5.701341184167484, "grad_norm": 0.3349211812019348, "learning_rate": 2.3370421574310286e-05, "loss": 0.0342, "step": 21790 }, { "epoch": 5.703958128884527, "grad_norm": 0.546351969242096, "learning_rate": 2.3349797715897044e-05, "loss": 0.0379, "step": 21800 }, { "epoch": 5.70657507360157, "grad_norm": 0.468004435300827, "learning_rate": 2.3329174985381514e-05, "loss": 0.0402, "step": 21810 }, { "epoch": 5.709192018318613, "grad_norm": 0.3514151871204376, "learning_rate": 2.3308553396859114e-05, "loss": 0.0371, "step": 21820 }, { "epoch": 5.711808963035656, "grad_norm": 0.28557074069976807, "learning_rate": 2.3287932964424526e-05, "loss": 0.0402, "step": 21830 }, { "epoch": 5.714425907752699, "grad_norm": 0.2928674519062042, "learning_rate": 2.326731370217161e-05, "loss": 0.0351, "step": 21840 }, { "epoch": 5.717042852469741, "grad_norm": 0.3764609396457672, "learning_rate": 2.3246695624193444e-05, "loss": 0.0391, "step": 21850 }, { "epoch": 5.719659797186784, "grad_norm": 0.37425652146339417, "learning_rate": 2.3226078744582287e-05, "loss": 0.039, "step": 21860 }, { "epoch": 5.722276741903827, "grad_norm": 0.5584398508071899, "learning_rate": 2.3205463077429578e-05, "loss": 0.0435, "step": 21870 }, { "epoch": 5.72489368662087, "grad_norm": 0.39990872144699097, "learning_rate": 2.318484863682593e-05, "loss": 0.0415, "step": 21880 }, { "epoch": 5.727510631337913, "grad_norm": 0.5545628070831299, "learning_rate": 2.316423543686113e-05, "loss": 0.0353, "step": 21890 }, { "epoch": 5.730127576054956, "grad_norm": 0.6198563575744629, "learning_rate": 2.314362349162409e-05, "loss": 0.0382, "step": 21900 }, { "epoch": 5.732744520771998, "grad_norm": 0.4582308530807495, "learning_rate": 2.3123012815202897e-05, "loss": 0.035, "step": 21910 }, { "epoch": 5.735361465489041, "grad_norm": 0.24980854988098145, "learning_rate": 2.3102403421684737e-05, "loss": 0.0358, "step": 21920 }, { "epoch": 5.737978410206084, "grad_norm": 0.32388317584991455, "learning_rate": 2.3081795325155955e-05, "loss": 0.0429, "step": 21930 }, { "epoch": 5.740595354923127, "grad_norm": 0.417156845331192, "learning_rate": 2.3061188539701973e-05, "loss": 0.0413, "step": 21940 }, { "epoch": 5.74321229964017, "grad_norm": 0.4360509216785431, "learning_rate": 2.3040583079407348e-05, "loss": 0.0358, "step": 21950 }, { "epoch": 5.745829244357213, "grad_norm": 0.26789259910583496, "learning_rate": 2.301997895835572e-05, "loss": 0.033, "step": 21960 }, { "epoch": 5.748446189074256, "grad_norm": 0.43363526463508606, "learning_rate": 2.2999376190629786e-05, "loss": 0.0419, "step": 21970 }, { "epoch": 5.751063133791298, "grad_norm": 0.46473145484924316, "learning_rate": 2.2978774790311365e-05, "loss": 0.0409, "step": 21980 }, { "epoch": 5.753680078508341, "grad_norm": 0.472726434469223, "learning_rate": 2.2958174771481324e-05, "loss": 0.038, "step": 21990 }, { "epoch": 5.756297023225384, "grad_norm": 0.35688552260398865, "learning_rate": 2.2937576148219564e-05, "loss": 0.0423, "step": 22000 }, { "epoch": 5.756297023225384, "eval_loss": 0.05000167867024996, "eval_runtime": 8.9268, "eval_samples_per_second": 114.711, "eval_steps_per_second": 1.792, "step": 22000 }, { "epoch": 5.758913967942427, "grad_norm": 0.4734850525856018, "learning_rate": 2.2916978934605065e-05, "loss": 0.0343, "step": 22010 }, { "epoch": 5.76153091265947, "grad_norm": 0.5695239901542664, "learning_rate": 2.289638314471582e-05, "loss": 0.0397, "step": 22020 }, { "epoch": 5.764147857376513, "grad_norm": 0.47718679904937744, "learning_rate": 2.287578879262886e-05, "loss": 0.0419, "step": 22030 }, { "epoch": 5.766764802093556, "grad_norm": 0.49803072214126587, "learning_rate": 2.285519589242023e-05, "loss": 0.0369, "step": 22040 }, { "epoch": 5.769381746810598, "grad_norm": 0.564346194267273, "learning_rate": 2.283460445816499e-05, "loss": 0.0372, "step": 22050 }, { "epoch": 5.771998691527641, "grad_norm": 0.2933105230331421, "learning_rate": 2.281401450393718e-05, "loss": 0.033, "step": 22060 }, { "epoch": 5.774615636244684, "grad_norm": 0.3057403862476349, "learning_rate": 2.279342604380984e-05, "loss": 0.0352, "step": 22070 }, { "epoch": 5.777232580961727, "grad_norm": 0.6289494037628174, "learning_rate": 2.277283909185499e-05, "loss": 0.0401, "step": 22080 }, { "epoch": 5.77984952567877, "grad_norm": 0.3429940640926361, "learning_rate": 2.275225366214363e-05, "loss": 0.0358, "step": 22090 }, { "epoch": 5.782466470395812, "grad_norm": 0.47569212317466736, "learning_rate": 2.2731669768745686e-05, "loss": 0.0364, "step": 22100 }, { "epoch": 5.785083415112855, "grad_norm": 0.5378376245498657, "learning_rate": 2.2711087425730077e-05, "loss": 0.0405, "step": 22110 }, { "epoch": 5.787700359829898, "grad_norm": 0.583967924118042, "learning_rate": 2.269050664716462e-05, "loss": 0.0357, "step": 22120 }, { "epoch": 5.790317304546941, "grad_norm": 0.374970406293869, "learning_rate": 2.2669927447116097e-05, "loss": 0.0401, "step": 22130 }, { "epoch": 5.792934249263984, "grad_norm": 0.4154004752635956, "learning_rate": 2.26493498396502e-05, "loss": 0.0381, "step": 22140 }, { "epoch": 5.795551193981027, "grad_norm": 0.29512980580329895, "learning_rate": 2.2628773838831512e-05, "loss": 0.038, "step": 22150 }, { "epoch": 5.79816813869807, "grad_norm": 0.5380761623382568, "learning_rate": 2.260819945872355e-05, "loss": 0.046, "step": 22160 }, { "epoch": 5.8007850834151125, "grad_norm": 0.36293846368789673, "learning_rate": 2.25876267133887e-05, "loss": 0.0348, "step": 22170 }, { "epoch": 5.803402028132155, "grad_norm": 0.3755585551261902, "learning_rate": 2.2567055616888244e-05, "loss": 0.0403, "step": 22180 }, { "epoch": 5.806018972849198, "grad_norm": 0.2739109694957733, "learning_rate": 2.2546486183282338e-05, "loss": 0.0334, "step": 22190 }, { "epoch": 5.808635917566241, "grad_norm": 0.6384417414665222, "learning_rate": 2.2525918426629984e-05, "loss": 0.0372, "step": 22200 }, { "epoch": 5.811252862283284, "grad_norm": 0.49684906005859375, "learning_rate": 2.2505352360989062e-05, "loss": 0.0418, "step": 22210 }, { "epoch": 5.813869807000327, "grad_norm": 0.24710462987422943, "learning_rate": 2.2484788000416275e-05, "loss": 0.0421, "step": 22220 }, { "epoch": 5.81648675171737, "grad_norm": 0.7345560789108276, "learning_rate": 2.2464225358967172e-05, "loss": 0.0359, "step": 22230 }, { "epoch": 5.8191036964344125, "grad_norm": 0.559100329875946, "learning_rate": 2.2443664450696136e-05, "loss": 0.0382, "step": 22240 }, { "epoch": 5.8217206411514555, "grad_norm": 0.42378759384155273, "learning_rate": 2.2423105289656332e-05, "loss": 0.034, "step": 22250 }, { "epoch": 5.8243375858684985, "grad_norm": 0.588720440864563, "learning_rate": 2.2402547889899766e-05, "loss": 0.0375, "step": 22260 }, { "epoch": 5.8269545305855415, "grad_norm": 0.518203616142273, "learning_rate": 2.2381992265477224e-05, "loss": 0.0349, "step": 22270 }, { "epoch": 5.8295714753025845, "grad_norm": 0.38674628734588623, "learning_rate": 2.236143843043828e-05, "loss": 0.0404, "step": 22280 }, { "epoch": 5.8321884200196275, "grad_norm": 0.4627288579940796, "learning_rate": 2.2340886398831294e-05, "loss": 0.0351, "step": 22290 }, { "epoch": 5.8348053647366696, "grad_norm": 0.32829442620277405, "learning_rate": 2.2320336184703373e-05, "loss": 0.044, "step": 22300 }, { "epoch": 5.8374223094537125, "grad_norm": 0.6168282628059387, "learning_rate": 2.229978780210041e-05, "loss": 0.0354, "step": 22310 }, { "epoch": 5.8400392541707555, "grad_norm": 0.3957426846027374, "learning_rate": 2.2279241265067015e-05, "loss": 0.0338, "step": 22320 }, { "epoch": 5.8426561988877985, "grad_norm": 0.4070344865322113, "learning_rate": 2.2258696587646573e-05, "loss": 0.0464, "step": 22330 }, { "epoch": 5.8452731436048415, "grad_norm": 0.43342939019203186, "learning_rate": 2.223815378388116e-05, "loss": 0.0362, "step": 22340 }, { "epoch": 5.8478900883218845, "grad_norm": 0.36159420013427734, "learning_rate": 2.221761286781159e-05, "loss": 0.0391, "step": 22350 }, { "epoch": 5.850507033038927, "grad_norm": 0.45029348134994507, "learning_rate": 2.2197073853477388e-05, "loss": 0.0376, "step": 22360 }, { "epoch": 5.85312397775597, "grad_norm": 0.39053642749786377, "learning_rate": 2.2176536754916775e-05, "loss": 0.0344, "step": 22370 }, { "epoch": 5.855740922473013, "grad_norm": 0.4565856456756592, "learning_rate": 2.2156001586166663e-05, "loss": 0.0412, "step": 22380 }, { "epoch": 5.858357867190056, "grad_norm": 0.49078816175460815, "learning_rate": 2.2135468361262656e-05, "loss": 0.0385, "step": 22390 }, { "epoch": 5.860974811907099, "grad_norm": 0.33995869755744934, "learning_rate": 2.211493709423901e-05, "loss": 0.0413, "step": 22400 }, { "epoch": 5.863591756624142, "grad_norm": 0.4810589849948883, "learning_rate": 2.2094407799128662e-05, "loss": 0.0372, "step": 22410 }, { "epoch": 5.8662087013411846, "grad_norm": 0.4321928918361664, "learning_rate": 2.207388048996319e-05, "loss": 0.045, "step": 22420 }, { "epoch": 5.868825646058227, "grad_norm": 0.3577482998371124, "learning_rate": 2.2053355180772797e-05, "loss": 0.0385, "step": 22430 }, { "epoch": 5.87144259077527, "grad_norm": 0.3033449947834015, "learning_rate": 2.203283188558636e-05, "loss": 0.0379, "step": 22440 }, { "epoch": 5.874059535492313, "grad_norm": 0.5386301279067993, "learning_rate": 2.201231061843135e-05, "loss": 0.0424, "step": 22450 }, { "epoch": 5.876676480209356, "grad_norm": 0.2471160590648651, "learning_rate": 2.1991791393333858e-05, "loss": 0.0344, "step": 22460 }, { "epoch": 5.879293424926399, "grad_norm": 0.2815709114074707, "learning_rate": 2.197127422431858e-05, "loss": 0.0374, "step": 22470 }, { "epoch": 5.881910369643442, "grad_norm": 0.38048800826072693, "learning_rate": 2.195075912540881e-05, "loss": 0.0351, "step": 22480 }, { "epoch": 5.884527314360485, "grad_norm": 0.2458164393901825, "learning_rate": 2.193024611062643e-05, "loss": 0.0333, "step": 22490 }, { "epoch": 5.887144259077527, "grad_norm": 0.4755036234855652, "learning_rate": 2.1909735193991887e-05, "loss": 0.0376, "step": 22500 }, { "epoch": 5.88976120379457, "grad_norm": 0.4101316034793854, "learning_rate": 2.1889226389524206e-05, "loss": 0.0351, "step": 22510 }, { "epoch": 5.892378148511613, "grad_norm": 0.4491457939147949, "learning_rate": 2.186871971124095e-05, "loss": 0.0396, "step": 22520 }, { "epoch": 5.894995093228656, "grad_norm": 0.39417290687561035, "learning_rate": 2.184821517315824e-05, "loss": 0.0368, "step": 22530 }, { "epoch": 5.897612037945699, "grad_norm": 0.3355071246623993, "learning_rate": 2.1827712789290746e-05, "loss": 0.0386, "step": 22540 }, { "epoch": 5.900228982662741, "grad_norm": 0.3364246189594269, "learning_rate": 2.1807212573651644e-05, "loss": 0.0394, "step": 22550 }, { "epoch": 5.902845927379784, "grad_norm": 0.41326799988746643, "learning_rate": 2.178671454025264e-05, "loss": 0.0345, "step": 22560 }, { "epoch": 5.905462872096827, "grad_norm": 0.41661393642425537, "learning_rate": 2.1766218703103948e-05, "loss": 0.0376, "step": 22570 }, { "epoch": 5.90807981681387, "grad_norm": 0.3999796509742737, "learning_rate": 2.174572507621428e-05, "loss": 0.0359, "step": 22580 }, { "epoch": 5.910696761530913, "grad_norm": 0.5220701694488525, "learning_rate": 2.172523367359084e-05, "loss": 0.0398, "step": 22590 }, { "epoch": 5.913313706247956, "grad_norm": 0.4379059374332428, "learning_rate": 2.17047445092393e-05, "loss": 0.0364, "step": 22600 }, { "epoch": 5.915930650964999, "grad_norm": 0.3713061213493347, "learning_rate": 2.1684257597163826e-05, "loss": 0.0397, "step": 22610 }, { "epoch": 5.918547595682041, "grad_norm": 0.3561409115791321, "learning_rate": 2.1663772951367014e-05, "loss": 0.0353, "step": 22620 }, { "epoch": 5.921164540399084, "grad_norm": 0.47510769963264465, "learning_rate": 2.1643290585849927e-05, "loss": 0.0423, "step": 22630 }, { "epoch": 5.923781485116127, "grad_norm": 0.5692808032035828, "learning_rate": 2.162281051461208e-05, "loss": 0.0362, "step": 22640 }, { "epoch": 5.92639842983317, "grad_norm": 0.5427606105804443, "learning_rate": 2.160233275165139e-05, "loss": 0.038, "step": 22650 }, { "epoch": 5.929015374550213, "grad_norm": 0.3027240037918091, "learning_rate": 2.1581857310964233e-05, "loss": 0.039, "step": 22660 }, { "epoch": 5.931632319267256, "grad_norm": 0.2940119504928589, "learning_rate": 2.156138420654537e-05, "loss": 0.0384, "step": 22670 }, { "epoch": 5.934249263984299, "grad_norm": 0.521510660648346, "learning_rate": 2.1540913452387972e-05, "loss": 0.0343, "step": 22680 }, { "epoch": 5.936866208701341, "grad_norm": 0.2505738139152527, "learning_rate": 2.1520445062483623e-05, "loss": 0.0379, "step": 22690 }, { "epoch": 5.939483153418384, "grad_norm": 0.4490431845188141, "learning_rate": 2.1499979050822268e-05, "loss": 0.0388, "step": 22700 }, { "epoch": 5.942100098135427, "grad_norm": 0.3115292191505432, "learning_rate": 2.1479515431392217e-05, "loss": 0.0399, "step": 22710 }, { "epoch": 5.94471704285247, "grad_norm": 0.3649538457393646, "learning_rate": 2.145905421818018e-05, "loss": 0.0374, "step": 22720 }, { "epoch": 5.947333987569513, "grad_norm": 0.2825697362422943, "learning_rate": 2.1438595425171188e-05, "loss": 0.0352, "step": 22730 }, { "epoch": 5.949950932286556, "grad_norm": 0.32974621653556824, "learning_rate": 2.1418139066348647e-05, "loss": 0.0351, "step": 22740 }, { "epoch": 5.952567877003599, "grad_norm": 0.3849112391471863, "learning_rate": 2.1397685155694274e-05, "loss": 0.0397, "step": 22750 }, { "epoch": 5.955184821720641, "grad_norm": 0.5038356781005859, "learning_rate": 2.1377233707188126e-05, "loss": 0.0408, "step": 22760 }, { "epoch": 5.957801766437684, "grad_norm": 0.3917370140552521, "learning_rate": 2.1356784734808588e-05, "loss": 0.037, "step": 22770 }, { "epoch": 5.960418711154727, "grad_norm": 0.34260237216949463, "learning_rate": 2.1336338252532324e-05, "loss": 0.0362, "step": 22780 }, { "epoch": 5.96303565587177, "grad_norm": 0.3475625813007355, "learning_rate": 2.131589427433433e-05, "loss": 0.0364, "step": 22790 }, { "epoch": 5.965652600588813, "grad_norm": 0.3870244324207306, "learning_rate": 2.1295452814187854e-05, "loss": 0.0357, "step": 22800 }, { "epoch": 5.968269545305855, "grad_norm": 0.3478778004646301, "learning_rate": 2.127501388606444e-05, "loss": 0.0324, "step": 22810 }, { "epoch": 5.970886490022898, "grad_norm": 0.4857855439186096, "learning_rate": 2.1254577503933916e-05, "loss": 0.0384, "step": 22820 }, { "epoch": 5.973503434739941, "grad_norm": 0.4679870009422302, "learning_rate": 2.123414368176435e-05, "loss": 0.028, "step": 22830 }, { "epoch": 5.976120379456984, "grad_norm": 0.32966911792755127, "learning_rate": 2.121371243352207e-05, "loss": 0.0338, "step": 22840 }, { "epoch": 5.978737324174027, "grad_norm": 0.4067830741405487, "learning_rate": 2.1193283773171636e-05, "loss": 0.0365, "step": 22850 }, { "epoch": 5.98135426889107, "grad_norm": 0.26921984553337097, "learning_rate": 2.117285771467584e-05, "loss": 0.0366, "step": 22860 }, { "epoch": 5.983971213608113, "grad_norm": 0.4890765845775604, "learning_rate": 2.115243427199572e-05, "loss": 0.0364, "step": 22870 }, { "epoch": 5.986588158325155, "grad_norm": 0.44049689173698425, "learning_rate": 2.113201345909049e-05, "loss": 0.0404, "step": 22880 }, { "epoch": 5.989205103042198, "grad_norm": 0.3951586186885834, "learning_rate": 2.1111595289917598e-05, "loss": 0.0369, "step": 22890 }, { "epoch": 5.991822047759241, "grad_norm": 0.40105995535850525, "learning_rate": 2.1091179778432655e-05, "loss": 0.0353, "step": 22900 }, { "epoch": 5.994438992476284, "grad_norm": 0.38641372323036194, "learning_rate": 2.107076693858947e-05, "loss": 0.0374, "step": 22910 }, { "epoch": 5.997055937193327, "grad_norm": 0.36292564868927, "learning_rate": 2.1050356784340035e-05, "loss": 0.0344, "step": 22920 }, { "epoch": 5.99967288191037, "grad_norm": 0.3365738093852997, "learning_rate": 2.102994932963449e-05, "loss": 0.0375, "step": 22930 }, { "epoch": 6.002093555773635, "grad_norm": 0.47521135210990906, "learning_rate": 2.1009544588421147e-05, "loss": 0.0386, "step": 22940 }, { "epoch": 6.004710500490678, "grad_norm": 0.5351961255073547, "learning_rate": 2.0989142574646447e-05, "loss": 0.0331, "step": 22950 }, { "epoch": 6.00732744520772, "grad_norm": 0.3139323890209198, "learning_rate": 2.096874330225498e-05, "loss": 0.0357, "step": 22960 }, { "epoch": 6.009944389924763, "grad_norm": 0.42696279287338257, "learning_rate": 2.0948346785189455e-05, "loss": 0.0342, "step": 22970 }, { "epoch": 6.012561334641806, "grad_norm": 0.44340917468070984, "learning_rate": 2.0927953037390702e-05, "loss": 0.0366, "step": 22980 }, { "epoch": 6.015178279358849, "grad_norm": 0.5112841725349426, "learning_rate": 2.0907562072797642e-05, "loss": 0.0391, "step": 22990 }, { "epoch": 6.017795224075892, "grad_norm": 0.5477259159088135, "learning_rate": 2.0887173905347322e-05, "loss": 0.0361, "step": 23000 }, { "epoch": 6.017795224075892, "eval_loss": 0.04335802614650672, "eval_runtime": 9.204, "eval_samples_per_second": 111.256, "eval_steps_per_second": 1.738, "step": 23000 }, { "epoch": 6.020412168792935, "grad_norm": 0.43827566504478455, "learning_rate": 2.086678854897485e-05, "loss": 0.0352, "step": 23010 }, { "epoch": 6.023029113509977, "grad_norm": 0.4269797205924988, "learning_rate": 2.0846406017613434e-05, "loss": 0.0371, "step": 23020 }, { "epoch": 6.02564605822702, "grad_norm": 0.43934446573257446, "learning_rate": 2.0826026325194337e-05, "loss": 0.0359, "step": 23030 }, { "epoch": 6.028263002944063, "grad_norm": 0.3212452232837677, "learning_rate": 2.0805649485646893e-05, "loss": 0.0385, "step": 23040 }, { "epoch": 6.030879947661106, "grad_norm": 0.3714478611946106, "learning_rate": 2.0785275512898467e-05, "loss": 0.0417, "step": 23050 }, { "epoch": 6.033496892378149, "grad_norm": 0.32258906960487366, "learning_rate": 2.0764904420874486e-05, "loss": 0.0378, "step": 23060 }, { "epoch": 6.036113837095192, "grad_norm": 0.3787921965122223, "learning_rate": 2.074453622349841e-05, "loss": 0.0318, "step": 23070 }, { "epoch": 6.038730781812234, "grad_norm": 0.6138239502906799, "learning_rate": 2.0724170934691698e-05, "loss": 0.0325, "step": 23080 }, { "epoch": 6.041347726529277, "grad_norm": 0.42927268147468567, "learning_rate": 2.0703808568373824e-05, "loss": 0.0408, "step": 23090 }, { "epoch": 6.04396467124632, "grad_norm": 0.6134794354438782, "learning_rate": 2.0683449138462287e-05, "loss": 0.0332, "step": 23100 }, { "epoch": 6.046581615963363, "grad_norm": 0.5098093748092651, "learning_rate": 2.066309265887256e-05, "loss": 0.0428, "step": 23110 }, { "epoch": 6.049198560680406, "grad_norm": 0.3562641143798828, "learning_rate": 2.064273914351811e-05, "loss": 0.0375, "step": 23120 }, { "epoch": 6.051815505397449, "grad_norm": 0.31301990151405334, "learning_rate": 2.0622388606310363e-05, "loss": 0.0354, "step": 23130 }, { "epoch": 6.054432450114492, "grad_norm": 0.35312601923942566, "learning_rate": 2.060204106115873e-05, "loss": 0.034, "step": 23140 }, { "epoch": 6.057049394831534, "grad_norm": 0.6812915802001953, "learning_rate": 2.0581696521970554e-05, "loss": 0.0374, "step": 23150 }, { "epoch": 6.059666339548577, "grad_norm": 0.6285141706466675, "learning_rate": 2.0561355002651145e-05, "loss": 0.0448, "step": 23160 }, { "epoch": 6.06228328426562, "grad_norm": 0.5887035727500916, "learning_rate": 2.054101651710375e-05, "loss": 0.0339, "step": 23170 }, { "epoch": 6.064900228982663, "grad_norm": 0.8369037508964539, "learning_rate": 2.0520681079229513e-05, "loss": 0.0394, "step": 23180 }, { "epoch": 6.067517173699706, "grad_norm": 0.3949189782142639, "learning_rate": 2.0500348702927512e-05, "loss": 0.0341, "step": 23190 }, { "epoch": 6.070134118416749, "grad_norm": 0.2599748969078064, "learning_rate": 2.0480019402094755e-05, "loss": 0.0379, "step": 23200 }, { "epoch": 6.072751063133791, "grad_norm": 0.3696807026863098, "learning_rate": 2.0459693190626107e-05, "loss": 0.0312, "step": 23210 }, { "epoch": 6.075368007850834, "grad_norm": 0.3513152599334717, "learning_rate": 2.043937008241436e-05, "loss": 0.0329, "step": 23220 }, { "epoch": 6.077984952567877, "grad_norm": 0.28093549609184265, "learning_rate": 2.0419050091350148e-05, "loss": 0.0325, "step": 23230 }, { "epoch": 6.08060189728492, "grad_norm": 0.37431958317756653, "learning_rate": 2.039873323132201e-05, "loss": 0.0351, "step": 23240 }, { "epoch": 6.083218842001963, "grad_norm": 0.44764524698257446, "learning_rate": 2.037841951621631e-05, "loss": 0.0374, "step": 23250 }, { "epoch": 6.085835786719006, "grad_norm": 0.39713025093078613, "learning_rate": 2.035810895991731e-05, "loss": 0.0345, "step": 23260 }, { "epoch": 6.088452731436049, "grad_norm": 0.35608890652656555, "learning_rate": 2.033780157630705e-05, "loss": 0.0398, "step": 23270 }, { "epoch": 6.091069676153091, "grad_norm": 0.18921782076358795, "learning_rate": 2.031749737926546e-05, "loss": 0.0311, "step": 23280 }, { "epoch": 6.093686620870134, "grad_norm": 0.3688729703426361, "learning_rate": 2.0297196382670253e-05, "loss": 0.0361, "step": 23290 }, { "epoch": 6.096303565587177, "grad_norm": 0.3176236152648926, "learning_rate": 2.0276898600396977e-05, "loss": 0.0325, "step": 23300 }, { "epoch": 6.09892051030422, "grad_norm": 0.4786258935928345, "learning_rate": 2.0256604046318963e-05, "loss": 0.0385, "step": 23310 }, { "epoch": 6.101537455021263, "grad_norm": 0.2971251308917999, "learning_rate": 2.0236312734307367e-05, "loss": 0.031, "step": 23320 }, { "epoch": 6.104154399738306, "grad_norm": 0.44764214754104614, "learning_rate": 2.021602467823109e-05, "loss": 0.0342, "step": 23330 }, { "epoch": 6.106771344455348, "grad_norm": 0.3997710049152374, "learning_rate": 2.0195739891956838e-05, "loss": 0.0359, "step": 23340 }, { "epoch": 6.109388289172391, "grad_norm": 0.5370054244995117, "learning_rate": 2.0175458389349077e-05, "loss": 0.0353, "step": 23350 }, { "epoch": 6.112005233889434, "grad_norm": 0.5448628067970276, "learning_rate": 2.0155180184270003e-05, "loss": 0.0347, "step": 23360 }, { "epoch": 6.114622178606477, "grad_norm": 0.5665889978408813, "learning_rate": 2.013490529057959e-05, "loss": 0.0358, "step": 23370 }, { "epoch": 6.11723912332352, "grad_norm": 0.3886018693447113, "learning_rate": 2.011463372213554e-05, "loss": 0.0355, "step": 23380 }, { "epoch": 6.119856068040563, "grad_norm": 0.3535195589065552, "learning_rate": 2.009436549279327e-05, "loss": 0.0353, "step": 23390 }, { "epoch": 6.122473012757606, "grad_norm": 0.26383087038993835, "learning_rate": 2.007410061640593e-05, "loss": 0.0333, "step": 23400 }, { "epoch": 6.125089957474648, "grad_norm": 0.46765440702438354, "learning_rate": 2.0053839106824368e-05, "loss": 0.0379, "step": 23410 }, { "epoch": 6.127706902191691, "grad_norm": 0.7617926597595215, "learning_rate": 2.003358097789714e-05, "loss": 0.0375, "step": 23420 }, { "epoch": 6.130323846908734, "grad_norm": 0.4366847276687622, "learning_rate": 2.001332624347048e-05, "loss": 0.0352, "step": 23430 }, { "epoch": 6.132940791625777, "grad_norm": 0.4177016317844391, "learning_rate": 1.999307491738832e-05, "loss": 0.0326, "step": 23440 }, { "epoch": 6.13555773634282, "grad_norm": 0.4015594720840454, "learning_rate": 1.997282701349224e-05, "loss": 0.0351, "step": 23450 }, { "epoch": 6.138174681059863, "grad_norm": 0.24507103860378265, "learning_rate": 1.9952582545621487e-05, "loss": 0.0334, "step": 23460 }, { "epoch": 6.140791625776905, "grad_norm": 0.2932671010494232, "learning_rate": 1.9932341527612968e-05, "loss": 0.0396, "step": 23470 }, { "epoch": 6.143408570493948, "grad_norm": 0.4256709814071655, "learning_rate": 1.9912103973301236e-05, "loss": 0.0403, "step": 23480 }, { "epoch": 6.146025515210991, "grad_norm": 0.4459935128688812, "learning_rate": 1.9891869896518455e-05, "loss": 0.0316, "step": 23490 }, { "epoch": 6.148642459928034, "grad_norm": 0.3941150903701782, "learning_rate": 1.987163931109444e-05, "loss": 0.0355, "step": 23500 }, { "epoch": 6.151259404645077, "grad_norm": 0.3339027166366577, "learning_rate": 1.985141223085659e-05, "loss": 0.0358, "step": 23510 }, { "epoch": 6.15387634936212, "grad_norm": 0.27572101354599, "learning_rate": 1.983118866962994e-05, "loss": 0.0369, "step": 23520 }, { "epoch": 6.156493294079162, "grad_norm": 0.5284891128540039, "learning_rate": 1.981096864123709e-05, "loss": 0.0408, "step": 23530 }, { "epoch": 6.159110238796205, "grad_norm": 0.4090319275856018, "learning_rate": 1.9790752159498255e-05, "loss": 0.0364, "step": 23540 }, { "epoch": 6.161727183513248, "grad_norm": 0.39981624484062195, "learning_rate": 1.977053923823119e-05, "loss": 0.0425, "step": 23550 }, { "epoch": 6.164344128230291, "grad_norm": 0.48455047607421875, "learning_rate": 1.9750329891251244e-05, "loss": 0.0376, "step": 23560 }, { "epoch": 6.166961072947334, "grad_norm": 0.4755387008190155, "learning_rate": 1.9730124132371312e-05, "loss": 0.0379, "step": 23570 }, { "epoch": 6.169578017664377, "grad_norm": 0.35988909006118774, "learning_rate": 1.9709921975401854e-05, "loss": 0.0317, "step": 23580 }, { "epoch": 6.17219496238142, "grad_norm": 0.38816285133361816, "learning_rate": 1.9689723434150835e-05, "loss": 0.0365, "step": 23590 }, { "epoch": 6.174811907098462, "grad_norm": 0.40134334564208984, "learning_rate": 1.966952852242378e-05, "loss": 0.0367, "step": 23600 }, { "epoch": 6.177428851815505, "grad_norm": 0.32368433475494385, "learning_rate": 1.9649337254023713e-05, "loss": 0.0328, "step": 23610 }, { "epoch": 6.180045796532548, "grad_norm": 0.4156053066253662, "learning_rate": 1.9629149642751185e-05, "loss": 0.0357, "step": 23620 }, { "epoch": 6.182662741249591, "grad_norm": 0.4975856840610504, "learning_rate": 1.9608965702404236e-05, "loss": 0.0358, "step": 23630 }, { "epoch": 6.185279685966634, "grad_norm": 0.4767300486564636, "learning_rate": 1.9588785446778384e-05, "loss": 0.033, "step": 23640 }, { "epoch": 6.187896630683677, "grad_norm": 0.3882130980491638, "learning_rate": 1.9568608889666663e-05, "loss": 0.0347, "step": 23650 }, { "epoch": 6.19051357540072, "grad_norm": 0.2682572603225708, "learning_rate": 1.9548436044859542e-05, "loss": 0.04, "step": 23660 }, { "epoch": 6.193130520117762, "grad_norm": 0.4771081209182739, "learning_rate": 1.952826692614498e-05, "loss": 0.042, "step": 23670 }, { "epoch": 6.195747464834805, "grad_norm": 0.3116135001182556, "learning_rate": 1.9508101547308384e-05, "loss": 0.0311, "step": 23680 }, { "epoch": 6.198364409551848, "grad_norm": 0.24202032387256622, "learning_rate": 1.948793992213259e-05, "loss": 0.031, "step": 23690 }, { "epoch": 6.200981354268891, "grad_norm": 0.4628809988498688, "learning_rate": 1.9467782064397886e-05, "loss": 0.0355, "step": 23700 }, { "epoch": 6.203598298985934, "grad_norm": 0.2606217563152313, "learning_rate": 1.9447627987881974e-05, "loss": 0.0342, "step": 23710 }, { "epoch": 6.206215243702977, "grad_norm": 0.4413122832775116, "learning_rate": 1.9427477706359982e-05, "loss": 0.0395, "step": 23720 }, { "epoch": 6.208832188420019, "grad_norm": 0.26698562502861023, "learning_rate": 1.9407331233604434e-05, "loss": 0.0294, "step": 23730 }, { "epoch": 6.211449133137062, "grad_norm": 0.3271200358867645, "learning_rate": 1.9387188583385242e-05, "loss": 0.0362, "step": 23740 }, { "epoch": 6.214066077854105, "grad_norm": 0.37548309564590454, "learning_rate": 1.9367049769469737e-05, "loss": 0.0383, "step": 23750 }, { "epoch": 6.216683022571148, "grad_norm": 0.34529152512550354, "learning_rate": 1.934691480562259e-05, "loss": 0.0312, "step": 23760 }, { "epoch": 6.219299967288191, "grad_norm": 0.24557435512542725, "learning_rate": 1.9326783705605868e-05, "loss": 0.034, "step": 23770 }, { "epoch": 6.221916912005234, "grad_norm": 0.3468078672885895, "learning_rate": 1.9306656483178993e-05, "loss": 0.0382, "step": 23780 }, { "epoch": 6.224533856722276, "grad_norm": 0.419986754655838, "learning_rate": 1.9286533152098724e-05, "loss": 0.0343, "step": 23790 }, { "epoch": 6.227150801439319, "grad_norm": 0.40486636757850647, "learning_rate": 1.926641372611917e-05, "loss": 0.0303, "step": 23800 }, { "epoch": 6.229767746156362, "grad_norm": 0.30131152272224426, "learning_rate": 1.9246298218991773e-05, "loss": 0.0373, "step": 23810 }, { "epoch": 6.232384690873405, "grad_norm": 0.3338201642036438, "learning_rate": 1.9226186644465293e-05, "loss": 0.0383, "step": 23820 }, { "epoch": 6.235001635590448, "grad_norm": 0.33441248536109924, "learning_rate": 1.9206079016285796e-05, "loss": 0.0371, "step": 23830 }, { "epoch": 6.237618580307491, "grad_norm": 0.365160197019577, "learning_rate": 1.918597534819665e-05, "loss": 0.0372, "step": 23840 }, { "epoch": 6.240235525024534, "grad_norm": 0.4978759288787842, "learning_rate": 1.9165875653938543e-05, "loss": 0.0372, "step": 23850 }, { "epoch": 6.242852469741576, "grad_norm": 0.37960320711135864, "learning_rate": 1.91457799472494e-05, "loss": 0.0372, "step": 23860 }, { "epoch": 6.245469414458619, "grad_norm": 0.4017830193042755, "learning_rate": 1.9125688241864464e-05, "loss": 0.0326, "step": 23870 }, { "epoch": 6.248086359175662, "grad_norm": 0.36038631200790405, "learning_rate": 1.9105600551516232e-05, "loss": 0.0338, "step": 23880 }, { "epoch": 6.250703303892705, "grad_norm": 0.5656988024711609, "learning_rate": 1.9085516889934433e-05, "loss": 0.0334, "step": 23890 }, { "epoch": 6.253320248609748, "grad_norm": 0.6395511627197266, "learning_rate": 1.9065437270846076e-05, "loss": 0.038, "step": 23900 }, { "epoch": 6.255937193326791, "grad_norm": 0.30840107798576355, "learning_rate": 1.904536170797539e-05, "loss": 0.0354, "step": 23910 }, { "epoch": 6.258554138043833, "grad_norm": 0.29775843024253845, "learning_rate": 1.9025290215043818e-05, "loss": 0.0313, "step": 23920 }, { "epoch": 6.261171082760876, "grad_norm": 0.28461727499961853, "learning_rate": 1.9005222805770048e-05, "loss": 0.0315, "step": 23930 }, { "epoch": 6.263788027477919, "grad_norm": 0.2945394814014435, "learning_rate": 1.898515949386996e-05, "loss": 0.0353, "step": 23940 }, { "epoch": 6.266404972194962, "grad_norm": 0.41512030363082886, "learning_rate": 1.8965100293056644e-05, "loss": 0.0352, "step": 23950 }, { "epoch": 6.269021916912005, "grad_norm": 0.4127851724624634, "learning_rate": 1.894504521704037e-05, "loss": 0.0386, "step": 23960 }, { "epoch": 6.271638861629048, "grad_norm": 0.5242753624916077, "learning_rate": 1.8924994279528597e-05, "loss": 0.0391, "step": 23970 }, { "epoch": 6.2742558063460905, "grad_norm": 0.41487598419189453, "learning_rate": 1.890494749422595e-05, "loss": 0.0358, "step": 23980 }, { "epoch": 6.2768727510631335, "grad_norm": 0.3981493413448334, "learning_rate": 1.8884904874834216e-05, "loss": 0.035, "step": 23990 }, { "epoch": 6.2794896957801765, "grad_norm": 0.315698504447937, "learning_rate": 1.886486643505234e-05, "loss": 0.0308, "step": 24000 }, { "epoch": 6.2794896957801765, "eval_loss": 0.043558861109126004, "eval_runtime": 9.1848, "eval_samples_per_second": 111.489, "eval_steps_per_second": 1.742, "step": 24000 }, { "epoch": 6.2821066404972195, "grad_norm": 0.4844658374786377, "learning_rate": 1.8844832188576416e-05, "loss": 0.0406, "step": 24010 }, { "epoch": 6.284723585214262, "grad_norm": 0.3282155990600586, "learning_rate": 1.8824802149099637e-05, "loss": 0.0365, "step": 24020 }, { "epoch": 6.287340529931305, "grad_norm": 0.3532108962535858, "learning_rate": 1.8804776330312364e-05, "loss": 0.0312, "step": 24030 }, { "epoch": 6.289957474648348, "grad_norm": 0.39708057045936584, "learning_rate": 1.878475474590205e-05, "loss": 0.032, "step": 24040 }, { "epoch": 6.2925744193653905, "grad_norm": 0.4074369966983795, "learning_rate": 1.876473740955326e-05, "loss": 0.0335, "step": 24050 }, { "epoch": 6.2951913640824335, "grad_norm": 0.42009133100509644, "learning_rate": 1.8744724334947662e-05, "loss": 0.0336, "step": 24060 }, { "epoch": 6.2978083087994765, "grad_norm": 0.48954713344573975, "learning_rate": 1.872471553576399e-05, "loss": 0.0358, "step": 24070 }, { "epoch": 6.3004252535165195, "grad_norm": 0.391671746969223, "learning_rate": 1.8704711025678082e-05, "loss": 0.0335, "step": 24080 }, { "epoch": 6.3030421982335625, "grad_norm": 0.6001835465431213, "learning_rate": 1.868471081836282e-05, "loss": 0.0343, "step": 24090 }, { "epoch": 6.3056591429506055, "grad_norm": 0.40074849128723145, "learning_rate": 1.866471492748818e-05, "loss": 0.0352, "step": 24100 }, { "epoch": 6.3082760876676485, "grad_norm": 0.3913089632987976, "learning_rate": 1.864472336672114e-05, "loss": 0.0326, "step": 24110 }, { "epoch": 6.310893032384691, "grad_norm": 0.26605844497680664, "learning_rate": 1.862473614972575e-05, "loss": 0.0331, "step": 24120 }, { "epoch": 6.313509977101734, "grad_norm": 0.3286307156085968, "learning_rate": 1.8604753290163086e-05, "loss": 0.0325, "step": 24130 }, { "epoch": 6.3161269218187766, "grad_norm": 0.30002516508102417, "learning_rate": 1.8584774801691244e-05, "loss": 0.0358, "step": 24140 }, { "epoch": 6.3187438665358195, "grad_norm": 0.39178580045700073, "learning_rate": 1.856480069796533e-05, "loss": 0.0296, "step": 24150 }, { "epoch": 6.3213608112528625, "grad_norm": 0.2837730050086975, "learning_rate": 1.8544830992637465e-05, "loss": 0.0333, "step": 24160 }, { "epoch": 6.3239777559699055, "grad_norm": 0.342427521944046, "learning_rate": 1.8524865699356745e-05, "loss": 0.0344, "step": 24170 }, { "epoch": 6.326594700686948, "grad_norm": 0.35260245203971863, "learning_rate": 1.8504904831769265e-05, "loss": 0.0341, "step": 24180 }, { "epoch": 6.329211645403991, "grad_norm": 0.37071171402931213, "learning_rate": 1.8484948403518095e-05, "loss": 0.034, "step": 24190 }, { "epoch": 6.331828590121034, "grad_norm": 0.32197433710098267, "learning_rate": 1.846499642824325e-05, "loss": 0.0374, "step": 24200 }, { "epoch": 6.334445534838077, "grad_norm": 0.31927716732025146, "learning_rate": 1.8445048919581724e-05, "loss": 0.0317, "step": 24210 }, { "epoch": 6.33706247955512, "grad_norm": 0.3790701925754547, "learning_rate": 1.8425105891167448e-05, "loss": 0.0365, "step": 24220 }, { "epoch": 6.339679424272163, "grad_norm": 0.47591888904571533, "learning_rate": 1.8405167356631304e-05, "loss": 0.0363, "step": 24230 }, { "epoch": 6.342296368989205, "grad_norm": 0.27775129675865173, "learning_rate": 1.838523332960108e-05, "loss": 0.0331, "step": 24240 }, { "epoch": 6.344913313706248, "grad_norm": 0.4258742928504944, "learning_rate": 1.8365303823701503e-05, "loss": 0.0286, "step": 24250 }, { "epoch": 6.347530258423291, "grad_norm": 0.2823595404624939, "learning_rate": 1.8345378852554208e-05, "loss": 0.0351, "step": 24260 }, { "epoch": 6.350147203140334, "grad_norm": 0.3638388216495514, "learning_rate": 1.832545842977771e-05, "loss": 0.0333, "step": 24270 }, { "epoch": 6.352764147857377, "grad_norm": 0.3300917148590088, "learning_rate": 1.8305542568987448e-05, "loss": 0.0355, "step": 24280 }, { "epoch": 6.35538109257442, "grad_norm": 0.3087752163410187, "learning_rate": 1.8285631283795714e-05, "loss": 0.032, "step": 24290 }, { "epoch": 6.357998037291463, "grad_norm": 0.32038813829421997, "learning_rate": 1.8265724587811676e-05, "loss": 0.0358, "step": 24300 }, { "epoch": 6.360614982008505, "grad_norm": 0.45256373286247253, "learning_rate": 1.8245822494641384e-05, "loss": 0.0428, "step": 24310 }, { "epoch": 6.363231926725548, "grad_norm": 0.519258975982666, "learning_rate": 1.822592501788773e-05, "loss": 0.0378, "step": 24320 }, { "epoch": 6.365848871442591, "grad_norm": 0.5432097315788269, "learning_rate": 1.8206032171150453e-05, "loss": 0.0311, "step": 24330 }, { "epoch": 6.368465816159634, "grad_norm": 0.3527891933917999, "learning_rate": 1.818614396802612e-05, "loss": 0.0329, "step": 24340 }, { "epoch": 6.371082760876677, "grad_norm": 0.6945310235023499, "learning_rate": 1.8166260422108132e-05, "loss": 0.0359, "step": 24350 }, { "epoch": 6.37369970559372, "grad_norm": 0.528319776058197, "learning_rate": 1.8146381546986712e-05, "loss": 0.0378, "step": 24360 }, { "epoch": 6.376316650310762, "grad_norm": 0.4661575257778168, "learning_rate": 1.8126507356248877e-05, "loss": 0.0368, "step": 24370 }, { "epoch": 6.378933595027805, "grad_norm": 0.3477420508861542, "learning_rate": 1.810663786347846e-05, "loss": 0.0323, "step": 24380 }, { "epoch": 6.381550539744848, "grad_norm": 0.38132211565971375, "learning_rate": 1.8086773082256054e-05, "loss": 0.0319, "step": 24390 }, { "epoch": 6.384167484461891, "grad_norm": 0.41567564010620117, "learning_rate": 1.8066913026159058e-05, "loss": 0.0346, "step": 24400 }, { "epoch": 6.386784429178934, "grad_norm": 0.38132917881011963, "learning_rate": 1.8047057708761637e-05, "loss": 0.0444, "step": 24410 }, { "epoch": 6.389401373895977, "grad_norm": 0.31965717673301697, "learning_rate": 1.8027207143634702e-05, "loss": 0.029, "step": 24420 }, { "epoch": 6.392018318613019, "grad_norm": 0.1878511756658554, "learning_rate": 1.800736134434594e-05, "loss": 0.0331, "step": 24430 }, { "epoch": 6.394635263330062, "grad_norm": 0.32016149163246155, "learning_rate": 1.798752032445976e-05, "loss": 0.0393, "step": 24440 }, { "epoch": 6.397252208047105, "grad_norm": 0.6267549395561218, "learning_rate": 1.7967684097537318e-05, "loss": 0.0399, "step": 24450 }, { "epoch": 6.399869152764148, "grad_norm": 0.5221057534217834, "learning_rate": 1.7947852677136485e-05, "loss": 0.0327, "step": 24460 }, { "epoch": 6.402486097481191, "grad_norm": 0.3823908865451813, "learning_rate": 1.7928026076811854e-05, "loss": 0.0277, "step": 24470 }, { "epoch": 6.405103042198234, "grad_norm": 0.31546902656555176, "learning_rate": 1.7908204310114707e-05, "loss": 0.0381, "step": 24480 }, { "epoch": 6.407719986915277, "grad_norm": 0.3193889260292053, "learning_rate": 1.788838739059305e-05, "loss": 0.0283, "step": 24490 }, { "epoch": 6.410336931632319, "grad_norm": 0.3339197337627411, "learning_rate": 1.786857533179154e-05, "loss": 0.0349, "step": 24500 }, { "epoch": 6.412953876349362, "grad_norm": 0.3514746427536011, "learning_rate": 1.7848768147251555e-05, "loss": 0.034, "step": 24510 }, { "epoch": 6.415570821066405, "grad_norm": 0.29451289772987366, "learning_rate": 1.7828965850511104e-05, "loss": 0.032, "step": 24520 }, { "epoch": 6.418187765783448, "grad_norm": 0.4204190671443939, "learning_rate": 1.780916845510488e-05, "loss": 0.0345, "step": 24530 }, { "epoch": 6.420804710500491, "grad_norm": 0.42853644490242004, "learning_rate": 1.7789375974564208e-05, "loss": 0.0308, "step": 24540 }, { "epoch": 6.423421655217534, "grad_norm": 0.35799336433410645, "learning_rate": 1.7769588422417063e-05, "loss": 0.0332, "step": 24550 }, { "epoch": 6.426038599934577, "grad_norm": 0.3346477746963501, "learning_rate": 1.7749805812188063e-05, "loss": 0.0308, "step": 24560 }, { "epoch": 6.428655544651619, "grad_norm": 0.3403894901275635, "learning_rate": 1.773002815739842e-05, "loss": 0.0353, "step": 24570 }, { "epoch": 6.431272489368662, "grad_norm": 0.3275148272514343, "learning_rate": 1.771025547156598e-05, "loss": 0.0341, "step": 24580 }, { "epoch": 6.433889434085705, "grad_norm": 0.4482429623603821, "learning_rate": 1.7690487768205182e-05, "loss": 0.0345, "step": 24590 }, { "epoch": 6.436506378802748, "grad_norm": 0.27314290404319763, "learning_rate": 1.7670725060827074e-05, "loss": 0.0307, "step": 24600 }, { "epoch": 6.439123323519791, "grad_norm": 0.4241769015789032, "learning_rate": 1.7650967362939273e-05, "loss": 0.0321, "step": 24610 }, { "epoch": 6.441740268236834, "grad_norm": 0.3373008668422699, "learning_rate": 1.7631214688045984e-05, "loss": 0.0304, "step": 24620 }, { "epoch": 6.444357212953876, "grad_norm": 0.4360610842704773, "learning_rate": 1.7611467049647974e-05, "loss": 0.036, "step": 24630 }, { "epoch": 6.446974157670919, "grad_norm": 0.39723801612854004, "learning_rate": 1.7591724461242564e-05, "loss": 0.0305, "step": 24640 }, { "epoch": 6.449591102387962, "grad_norm": 0.3017699718475342, "learning_rate": 1.7571986936323626e-05, "loss": 0.0283, "step": 24650 }, { "epoch": 6.452208047105005, "grad_norm": 0.2911607623100281, "learning_rate": 1.7552254488381588e-05, "loss": 0.0337, "step": 24660 }, { "epoch": 6.454824991822048, "grad_norm": 0.39926955103874207, "learning_rate": 1.753252713090337e-05, "loss": 0.0337, "step": 24670 }, { "epoch": 6.457441936539091, "grad_norm": 0.36836519837379456, "learning_rate": 1.751280487737244e-05, "loss": 0.0306, "step": 24680 }, { "epoch": 6.460058881256133, "grad_norm": 0.3723924160003662, "learning_rate": 1.7493087741268783e-05, "loss": 0.0306, "step": 24690 }, { "epoch": 6.462675825973176, "grad_norm": 0.27300289273262024, "learning_rate": 1.7473375736068862e-05, "loss": 0.033, "step": 24700 }, { "epoch": 6.465292770690219, "grad_norm": 0.4438270628452301, "learning_rate": 1.745366887524566e-05, "loss": 0.0359, "step": 24710 }, { "epoch": 6.467909715407262, "grad_norm": 0.4966143071651459, "learning_rate": 1.7433967172268618e-05, "loss": 0.0328, "step": 24720 }, { "epoch": 6.470526660124305, "grad_norm": 0.3728248178958893, "learning_rate": 1.7414270640603674e-05, "loss": 0.0261, "step": 24730 }, { "epoch": 6.473143604841348, "grad_norm": 0.530876874923706, "learning_rate": 1.7394579293713215e-05, "loss": 0.0329, "step": 24740 }, { "epoch": 6.475760549558391, "grad_norm": 0.46062782406806946, "learning_rate": 1.7374893145056103e-05, "loss": 0.0331, "step": 24750 }, { "epoch": 6.478377494275433, "grad_norm": 0.35258668661117554, "learning_rate": 1.735521220808762e-05, "loss": 0.0312, "step": 24760 }, { "epoch": 6.480994438992476, "grad_norm": 0.2958410084247589, "learning_rate": 1.733553649625951e-05, "loss": 0.0345, "step": 24770 }, { "epoch": 6.483611383709519, "grad_norm": 0.40343642234802246, "learning_rate": 1.731586602301992e-05, "loss": 0.0372, "step": 24780 }, { "epoch": 6.486228328426562, "grad_norm": 0.29527971148490906, "learning_rate": 1.7296200801813462e-05, "loss": 0.0329, "step": 24790 }, { "epoch": 6.488845273143605, "grad_norm": 0.36426976323127747, "learning_rate": 1.72765408460811e-05, "loss": 0.0335, "step": 24800 }, { "epoch": 6.491462217860648, "grad_norm": 0.3566276729106903, "learning_rate": 1.7256886169260255e-05, "loss": 0.0318, "step": 24810 }, { "epoch": 6.49407916257769, "grad_norm": 0.4963986277580261, "learning_rate": 1.7237236784784693e-05, "loss": 0.0363, "step": 24820 }, { "epoch": 6.496696107294733, "grad_norm": 0.7406085133552551, "learning_rate": 1.721759270608459e-05, "loss": 0.0375, "step": 24830 }, { "epoch": 6.499313052011776, "grad_norm": 0.4087287485599518, "learning_rate": 1.7197953946586497e-05, "loss": 0.0301, "step": 24840 }, { "epoch": 6.501929996728819, "grad_norm": 0.3173830211162567, "learning_rate": 1.7178320519713303e-05, "loss": 0.0323, "step": 24850 }, { "epoch": 6.504546941445862, "grad_norm": 0.5561368465423584, "learning_rate": 1.7158692438884284e-05, "loss": 0.04, "step": 24860 }, { "epoch": 6.507163886162905, "grad_norm": 0.41783007979393005, "learning_rate": 1.7139069717515042e-05, "loss": 0.0353, "step": 24870 }, { "epoch": 6.509780830879947, "grad_norm": 0.6484546661376953, "learning_rate": 1.711945236901752e-05, "loss": 0.0332, "step": 24880 }, { "epoch": 6.51239777559699, "grad_norm": 0.5079058408737183, "learning_rate": 1.70998404068e-05, "loss": 0.0353, "step": 24890 }, { "epoch": 6.515014720314033, "grad_norm": 0.392955482006073, "learning_rate": 1.7080233844267066e-05, "loss": 0.0303, "step": 24900 }, { "epoch": 6.517631665031076, "grad_norm": 0.5023982524871826, "learning_rate": 1.7060632694819624e-05, "loss": 0.0392, "step": 24910 }, { "epoch": 6.520248609748119, "grad_norm": 0.27257394790649414, "learning_rate": 1.7041036971854863e-05, "loss": 0.0336, "step": 24920 }, { "epoch": 6.522865554465162, "grad_norm": 0.3066208064556122, "learning_rate": 1.702144668876629e-05, "loss": 0.0377, "step": 24930 }, { "epoch": 6.525482499182205, "grad_norm": 0.45781561732292175, "learning_rate": 1.700186185894368e-05, "loss": 0.0349, "step": 24940 }, { "epoch": 6.528099443899247, "grad_norm": 0.3222908079624176, "learning_rate": 1.6982282495773062e-05, "loss": 0.0337, "step": 24950 }, { "epoch": 6.53071638861629, "grad_norm": 0.3585456609725952, "learning_rate": 1.6962708612636753e-05, "loss": 0.0342, "step": 24960 }, { "epoch": 6.533333333333333, "grad_norm": 0.292620450258255, "learning_rate": 1.6943140222913322e-05, "loss": 0.0353, "step": 24970 }, { "epoch": 6.535950278050376, "grad_norm": 0.40674445033073425, "learning_rate": 1.6923577339977577e-05, "loss": 0.0335, "step": 24980 }, { "epoch": 6.538567222767419, "grad_norm": 0.32033807039260864, "learning_rate": 1.6904019977200564e-05, "loss": 0.034, "step": 24990 }, { "epoch": 6.541184167484462, "grad_norm": 0.29049545526504517, "learning_rate": 1.6884468147949557e-05, "loss": 0.0306, "step": 25000 }, { "epoch": 6.541184167484462, "eval_loss": 0.03778619730581603, "eval_runtime": 9.3515, "eval_samples_per_second": 109.501, "eval_steps_per_second": 1.711, "step": 25000 }, { "epoch": 6.543801112201505, "grad_norm": 0.37644854187965393, "learning_rate": 1.6864921865588047e-05, "loss": 0.0351, "step": 25010 }, { "epoch": 6.546418056918547, "grad_norm": 0.3338150978088379, "learning_rate": 1.684538114347573e-05, "loss": 0.03, "step": 25020 }, { "epoch": 6.54903500163559, "grad_norm": 0.34916117787361145, "learning_rate": 1.6825845994968516e-05, "loss": 0.0352, "step": 25030 }, { "epoch": 6.551651946352633, "grad_norm": 0.29273179173469543, "learning_rate": 1.6806316433418484e-05, "loss": 0.0348, "step": 25040 }, { "epoch": 6.554268891069676, "grad_norm": 0.3352114260196686, "learning_rate": 1.67867924721739e-05, "loss": 0.0343, "step": 25050 }, { "epoch": 6.556885835786719, "grad_norm": 0.461609423160553, "learning_rate": 1.6767274124579217e-05, "loss": 0.0334, "step": 25060 }, { "epoch": 6.559502780503762, "grad_norm": 0.5588709712028503, "learning_rate": 1.6747761403975043e-05, "loss": 0.0318, "step": 25070 }, { "epoch": 6.562119725220804, "grad_norm": 0.35401543974876404, "learning_rate": 1.6728254323698135e-05, "loss": 0.035, "step": 25080 }, { "epoch": 6.564736669937847, "grad_norm": 0.3450101315975189, "learning_rate": 1.6708752897081397e-05, "loss": 0.0342, "step": 25090 }, { "epoch": 6.56735361465489, "grad_norm": 0.22737601399421692, "learning_rate": 1.6689257137453873e-05, "loss": 0.0298, "step": 25100 }, { "epoch": 6.569970559371933, "grad_norm": 0.34681960940361023, "learning_rate": 1.6669767058140735e-05, "loss": 0.0382, "step": 25110 }, { "epoch": 6.572587504088976, "grad_norm": 0.308298796415329, "learning_rate": 1.6650282672463265e-05, "loss": 0.0318, "step": 25120 }, { "epoch": 6.575204448806019, "grad_norm": 0.34104683995246887, "learning_rate": 1.663080399373885e-05, "loss": 0.032, "step": 25130 }, { "epoch": 6.577821393523061, "grad_norm": 0.4612085819244385, "learning_rate": 1.6611331035280985e-05, "loss": 0.0368, "step": 25140 }, { "epoch": 6.580438338240104, "grad_norm": 0.4950728416442871, "learning_rate": 1.659186381039926e-05, "loss": 0.0372, "step": 25150 }, { "epoch": 6.583055282957147, "grad_norm": 0.3914588391780853, "learning_rate": 1.6572402332399333e-05, "loss": 0.0295, "step": 25160 }, { "epoch": 6.58567222767419, "grad_norm": 0.33319687843322754, "learning_rate": 1.6552946614582947e-05, "loss": 0.0322, "step": 25170 }, { "epoch": 6.588289172391233, "grad_norm": 0.29961922764778137, "learning_rate": 1.65334966702479e-05, "loss": 0.0314, "step": 25180 }, { "epoch": 6.590906117108276, "grad_norm": 0.27232062816619873, "learning_rate": 1.6514052512688044e-05, "loss": 0.0379, "step": 25190 }, { "epoch": 6.593523061825319, "grad_norm": 0.3368346393108368, "learning_rate": 1.6494614155193276e-05, "loss": 0.0336, "step": 25200 }, { "epoch": 6.596140006542361, "grad_norm": 0.44280457496643066, "learning_rate": 1.6475181611049537e-05, "loss": 0.0304, "step": 25210 }, { "epoch": 6.598756951259404, "grad_norm": 0.35077938437461853, "learning_rate": 1.645575489353879e-05, "loss": 0.0332, "step": 25220 }, { "epoch": 6.601373895976447, "grad_norm": 0.48652276396751404, "learning_rate": 1.643633401593899e-05, "loss": 0.0347, "step": 25230 }, { "epoch": 6.60399084069349, "grad_norm": 0.24766449630260468, "learning_rate": 1.6416918991524145e-05, "loss": 0.0345, "step": 25240 }, { "epoch": 6.606607785410533, "grad_norm": 0.285058856010437, "learning_rate": 1.6397509833564234e-05, "loss": 0.0341, "step": 25250 }, { "epoch": 6.609224730127576, "grad_norm": 0.2656119465827942, "learning_rate": 1.6378106555325234e-05, "loss": 0.0331, "step": 25260 }, { "epoch": 6.611841674844619, "grad_norm": 0.36539924144744873, "learning_rate": 1.635870917006911e-05, "loss": 0.0347, "step": 25270 }, { "epoch": 6.6144586195616615, "grad_norm": 0.35211730003356934, "learning_rate": 1.633931769105378e-05, "loss": 0.0284, "step": 25280 }, { "epoch": 6.6170755642787045, "grad_norm": 0.3001176416873932, "learning_rate": 1.6319932131533148e-05, "loss": 0.0326, "step": 25290 }, { "epoch": 6.6196925089957475, "grad_norm": 0.32967913150787354, "learning_rate": 1.630055250475705e-05, "loss": 0.0314, "step": 25300 }, { "epoch": 6.6223094537127905, "grad_norm": 0.26268577575683594, "learning_rate": 1.6281178823971295e-05, "loss": 0.0298, "step": 25310 }, { "epoch": 6.624926398429833, "grad_norm": 0.5058441162109375, "learning_rate": 1.6261811102417597e-05, "loss": 0.0342, "step": 25320 }, { "epoch": 6.6275433431468755, "grad_norm": 0.5294371843338013, "learning_rate": 1.6242449353333607e-05, "loss": 0.0332, "step": 25330 }, { "epoch": 6.6301602878639185, "grad_norm": 0.411773681640625, "learning_rate": 1.6223093589952903e-05, "loss": 0.0344, "step": 25340 }, { "epoch": 6.6327772325809615, "grad_norm": 0.44051438570022583, "learning_rate": 1.6203743825504974e-05, "loss": 0.0341, "step": 25350 }, { "epoch": 6.6353941772980045, "grad_norm": 0.22843913733959198, "learning_rate": 1.6184400073215194e-05, "loss": 0.0324, "step": 25360 }, { "epoch": 6.6380111220150475, "grad_norm": 0.25327709317207336, "learning_rate": 1.6165062346304845e-05, "loss": 0.0287, "step": 25370 }, { "epoch": 6.6406280667320905, "grad_norm": 0.38789933919906616, "learning_rate": 1.6145730657991066e-05, "loss": 0.0363, "step": 25380 }, { "epoch": 6.6432450114491335, "grad_norm": 0.28665658831596375, "learning_rate": 1.6126405021486897e-05, "loss": 0.0292, "step": 25390 }, { "epoch": 6.645861956166176, "grad_norm": 0.3016824424266815, "learning_rate": 1.6107085450001228e-05, "loss": 0.031, "step": 25400 }, { "epoch": 6.648478900883219, "grad_norm": 0.3922291696071625, "learning_rate": 1.608777195673879e-05, "loss": 0.0341, "step": 25410 }, { "epoch": 6.651095845600262, "grad_norm": 0.29873526096343994, "learning_rate": 1.6068464554900186e-05, "loss": 0.0357, "step": 25420 }, { "epoch": 6.653712790317305, "grad_norm": 0.3894430696964264, "learning_rate": 1.6049163257681833e-05, "loss": 0.0363, "step": 25430 }, { "epoch": 6.6563297350343476, "grad_norm": 0.20600323379039764, "learning_rate": 1.6029868078275995e-05, "loss": 0.0265, "step": 25440 }, { "epoch": 6.6589466797513905, "grad_norm": 0.3169163763523102, "learning_rate": 1.6010579029870744e-05, "loss": 0.0356, "step": 25450 }, { "epoch": 6.6615636244684335, "grad_norm": 0.3466549217700958, "learning_rate": 1.5991296125649958e-05, "loss": 0.032, "step": 25460 }, { "epoch": 6.664180569185476, "grad_norm": 0.32664406299591064, "learning_rate": 1.5972019378793328e-05, "loss": 0.0338, "step": 25470 }, { "epoch": 6.666797513902519, "grad_norm": 0.47851356863975525, "learning_rate": 1.5952748802476316e-05, "loss": 0.0343, "step": 25480 }, { "epoch": 6.669414458619562, "grad_norm": 0.39265337586402893, "learning_rate": 1.5933484409870196e-05, "loss": 0.0363, "step": 25490 }, { "epoch": 6.672031403336605, "grad_norm": 0.5032387971878052, "learning_rate": 1.5914226214141993e-05, "loss": 0.0365, "step": 25500 }, { "epoch": 6.674648348053648, "grad_norm": 0.24106982350349426, "learning_rate": 1.589497422845449e-05, "loss": 0.0306, "step": 25510 }, { "epoch": 6.677265292770691, "grad_norm": 0.5653230547904968, "learning_rate": 1.587572846596625e-05, "loss": 0.0372, "step": 25520 }, { "epoch": 6.679882237487733, "grad_norm": 0.5159065127372742, "learning_rate": 1.585648893983156e-05, "loss": 0.0324, "step": 25530 }, { "epoch": 6.682499182204776, "grad_norm": 0.36855408549308777, "learning_rate": 1.5837255663200464e-05, "loss": 0.0319, "step": 25540 }, { "epoch": 6.685116126921819, "grad_norm": 0.39614546298980713, "learning_rate": 1.581802864921873e-05, "loss": 0.0349, "step": 25550 }, { "epoch": 6.687733071638862, "grad_norm": 0.48540249466896057, "learning_rate": 1.5798807911027826e-05, "loss": 0.036, "step": 25560 }, { "epoch": 6.690350016355905, "grad_norm": 0.3857311010360718, "learning_rate": 1.577959346176496e-05, "loss": 0.0347, "step": 25570 }, { "epoch": 6.692966961072948, "grad_norm": 0.3490583598613739, "learning_rate": 1.576038531456301e-05, "loss": 0.0298, "step": 25580 }, { "epoch": 6.69558390578999, "grad_norm": 0.6179901957511902, "learning_rate": 1.5741183482550585e-05, "loss": 0.0364, "step": 25590 }, { "epoch": 6.698200850507033, "grad_norm": 0.35921305418014526, "learning_rate": 1.572198797885194e-05, "loss": 0.0344, "step": 25600 }, { "epoch": 6.700817795224076, "grad_norm": 0.3145570158958435, "learning_rate": 1.5702798816587018e-05, "loss": 0.032, "step": 25610 }, { "epoch": 6.703434739941119, "grad_norm": 0.2709348201751709, "learning_rate": 1.5683616008871444e-05, "loss": 0.0311, "step": 25620 }, { "epoch": 6.706051684658162, "grad_norm": 0.2840203046798706, "learning_rate": 1.5664439568816474e-05, "loss": 0.0297, "step": 25630 }, { "epoch": 6.708668629375205, "grad_norm": 0.28185781836509705, "learning_rate": 1.564526950952903e-05, "loss": 0.0313, "step": 25640 }, { "epoch": 6.711285574092248, "grad_norm": 0.2776778042316437, "learning_rate": 1.5626105844111676e-05, "loss": 0.0297, "step": 25650 }, { "epoch": 6.71390251880929, "grad_norm": 0.2556576132774353, "learning_rate": 1.5606948585662577e-05, "loss": 0.0309, "step": 25660 }, { "epoch": 6.716519463526333, "grad_norm": 0.26259034872055054, "learning_rate": 1.5587797747275558e-05, "loss": 0.0306, "step": 25670 }, { "epoch": 6.719136408243376, "grad_norm": 0.25451749563217163, "learning_rate": 1.5568653342040022e-05, "loss": 0.0305, "step": 25680 }, { "epoch": 6.721753352960419, "grad_norm": 0.3301449716091156, "learning_rate": 1.5549515383040993e-05, "loss": 0.037, "step": 25690 }, { "epoch": 6.724370297677462, "grad_norm": 0.31745246052742004, "learning_rate": 1.553038388335909e-05, "loss": 0.0326, "step": 25700 }, { "epoch": 6.726987242394505, "grad_norm": 0.30288368463516235, "learning_rate": 1.5511258856070504e-05, "loss": 0.0323, "step": 25710 }, { "epoch": 6.729604187111548, "grad_norm": 0.35990169644355774, "learning_rate": 1.549214031424702e-05, "loss": 0.0345, "step": 25720 }, { "epoch": 6.73222113182859, "grad_norm": 0.24277259409427643, "learning_rate": 1.5473028270955976e-05, "loss": 0.0309, "step": 25730 }, { "epoch": 6.734838076545633, "grad_norm": 0.3610764443874359, "learning_rate": 1.5453922739260275e-05, "loss": 0.0301, "step": 25740 }, { "epoch": 6.737455021262676, "grad_norm": 0.4325360357761383, "learning_rate": 1.5434823732218373e-05, "loss": 0.0341, "step": 25750 }, { "epoch": 6.740071965979719, "grad_norm": 0.23844900727272034, "learning_rate": 1.5415731262884248e-05, "loss": 0.0266, "step": 25760 }, { "epoch": 6.742688910696762, "grad_norm": 0.3218114376068115, "learning_rate": 1.539664534430744e-05, "loss": 0.0324, "step": 25770 }, { "epoch": 6.745305855413804, "grad_norm": 0.31861692667007446, "learning_rate": 1.5377565989532984e-05, "loss": 0.0318, "step": 25780 }, { "epoch": 6.747922800130847, "grad_norm": 0.4393862783908844, "learning_rate": 1.535849321160143e-05, "loss": 0.0339, "step": 25790 }, { "epoch": 6.75053974484789, "grad_norm": 0.3930702209472656, "learning_rate": 1.533942702354886e-05, "loss": 0.038, "step": 25800 }, { "epoch": 6.753156689564933, "grad_norm": 0.2467467188835144, "learning_rate": 1.5320367438406818e-05, "loss": 0.0318, "step": 25810 }, { "epoch": 6.755773634281976, "grad_norm": 0.3010796308517456, "learning_rate": 1.5301314469202365e-05, "loss": 0.0276, "step": 25820 }, { "epoch": 6.758390578999019, "grad_norm": 0.29643282294273376, "learning_rate": 1.5282268128958015e-05, "loss": 0.0315, "step": 25830 }, { "epoch": 6.761007523716062, "grad_norm": 0.2423584759235382, "learning_rate": 1.5263228430691764e-05, "loss": 0.0318, "step": 25840 }, { "epoch": 6.763624468433104, "grad_norm": 0.287006139755249, "learning_rate": 1.5244195387417076e-05, "loss": 0.0333, "step": 25850 }, { "epoch": 6.766241413150147, "grad_norm": 0.21658548712730408, "learning_rate": 1.5225169012142842e-05, "loss": 0.0295, "step": 25860 }, { "epoch": 6.76885835786719, "grad_norm": 0.24003279209136963, "learning_rate": 1.5206149317873427e-05, "loss": 0.0308, "step": 25870 }, { "epoch": 6.771475302584233, "grad_norm": 0.310829758644104, "learning_rate": 1.51871363176086e-05, "loss": 0.0327, "step": 25880 }, { "epoch": 6.774092247301276, "grad_norm": 0.5693554282188416, "learning_rate": 1.5168130024343563e-05, "loss": 0.0346, "step": 25890 }, { "epoch": 6.776709192018319, "grad_norm": 0.24450276792049408, "learning_rate": 1.5149130451068948e-05, "loss": 0.0308, "step": 25900 }, { "epoch": 6.779326136735362, "grad_norm": 0.2665019929409027, "learning_rate": 1.5130137610770783e-05, "loss": 0.0335, "step": 25910 }, { "epoch": 6.781943081452404, "grad_norm": 0.3969002068042755, "learning_rate": 1.5111151516430494e-05, "loss": 0.0292, "step": 25920 }, { "epoch": 6.784560026169447, "grad_norm": 0.34969353675842285, "learning_rate": 1.5092172181024894e-05, "loss": 0.0306, "step": 25930 }, { "epoch": 6.78717697088649, "grad_norm": 0.49384376406669617, "learning_rate": 1.5073199617526184e-05, "loss": 0.0314, "step": 25940 }, { "epoch": 6.789793915603533, "grad_norm": 0.36186710000038147, "learning_rate": 1.5054233838901932e-05, "loss": 0.0291, "step": 25950 }, { "epoch": 6.792410860320576, "grad_norm": 0.3704721927642822, "learning_rate": 1.5035274858115078e-05, "loss": 0.029, "step": 25960 }, { "epoch": 6.795027805037619, "grad_norm": 0.5105478763580322, "learning_rate": 1.5016322688123885e-05, "loss": 0.0312, "step": 25970 }, { "epoch": 6.797644749754662, "grad_norm": 0.3988720178604126, "learning_rate": 1.4997377341882e-05, "loss": 0.0338, "step": 25980 }, { "epoch": 6.800261694471704, "grad_norm": 0.3886851668357849, "learning_rate": 1.4978438832338377e-05, "loss": 0.0365, "step": 25990 }, { "epoch": 6.802878639188747, "grad_norm": 0.3836858868598938, "learning_rate": 1.4959507172437318e-05, "loss": 0.0285, "step": 26000 }, { "epoch": 6.802878639188747, "eval_loss": 0.037352538310355815, "eval_runtime": 9.4742, "eval_samples_per_second": 108.083, "eval_steps_per_second": 1.689, "step": 26000 }, { "epoch": 6.80549558390579, "grad_norm": 0.33088138699531555, "learning_rate": 1.4940582375118429e-05, "loss": 0.0278, "step": 26010 }, { "epoch": 6.808112528622833, "grad_norm": 0.29455891251564026, "learning_rate": 1.492166445331663e-05, "loss": 0.0357, "step": 26020 }, { "epoch": 6.810729473339876, "grad_norm": 0.3399808406829834, "learning_rate": 1.4902753419962146e-05, "loss": 0.0332, "step": 26030 }, { "epoch": 6.813346418056918, "grad_norm": 0.3951043486595154, "learning_rate": 1.4883849287980484e-05, "loss": 0.0316, "step": 26040 }, { "epoch": 6.815963362773961, "grad_norm": 0.3731957674026489, "learning_rate": 1.4864952070292457e-05, "loss": 0.0305, "step": 26050 }, { "epoch": 6.818580307491004, "grad_norm": 0.3230712413787842, "learning_rate": 1.4846061779814117e-05, "loss": 0.0327, "step": 26060 }, { "epoch": 6.821197252208047, "grad_norm": 0.33193638920783997, "learning_rate": 1.4827178429456798e-05, "loss": 0.0351, "step": 26070 }, { "epoch": 6.82381419692509, "grad_norm": 0.34041762351989746, "learning_rate": 1.4808302032127103e-05, "loss": 0.0293, "step": 26080 }, { "epoch": 6.826431141642133, "grad_norm": 0.4517126679420471, "learning_rate": 1.4789432600726866e-05, "loss": 0.0293, "step": 26090 }, { "epoch": 6.829048086359176, "grad_norm": 0.49828705191612244, "learning_rate": 1.4770570148153167e-05, "loss": 0.0293, "step": 26100 }, { "epoch": 6.831665031076218, "grad_norm": 0.3268827497959137, "learning_rate": 1.4751714687298313e-05, "loss": 0.0327, "step": 26110 }, { "epoch": 6.834281975793261, "grad_norm": 0.49581748247146606, "learning_rate": 1.4732866231049835e-05, "loss": 0.033, "step": 26120 }, { "epoch": 6.836898920510304, "grad_norm": 0.4515392780303955, "learning_rate": 1.471402479229047e-05, "loss": 0.0324, "step": 26130 }, { "epoch": 6.839515865227347, "grad_norm": 0.37542739510536194, "learning_rate": 1.4695190383898172e-05, "loss": 0.0303, "step": 26140 }, { "epoch": 6.84213280994439, "grad_norm": 0.39024776220321655, "learning_rate": 1.4676363018746087e-05, "loss": 0.0283, "step": 26150 }, { "epoch": 6.844749754661433, "grad_norm": 0.3718578815460205, "learning_rate": 1.4657542709702526e-05, "loss": 0.0338, "step": 26160 }, { "epoch": 6.847366699378476, "grad_norm": 0.29767003655433655, "learning_rate": 1.4638729469630996e-05, "loss": 0.0317, "step": 26170 }, { "epoch": 6.849983644095518, "grad_norm": 0.2173587679862976, "learning_rate": 1.4619923311390179e-05, "loss": 0.0288, "step": 26180 }, { "epoch": 6.852600588812561, "grad_norm": 0.24532240629196167, "learning_rate": 1.4601124247833894e-05, "loss": 0.0265, "step": 26190 }, { "epoch": 6.855217533529604, "grad_norm": 0.29330992698669434, "learning_rate": 1.4582332291811134e-05, "loss": 0.0328, "step": 26200 }, { "epoch": 6.857834478246647, "grad_norm": 0.3279236853122711, "learning_rate": 1.4563547456166017e-05, "loss": 0.0289, "step": 26210 }, { "epoch": 6.86045142296369, "grad_norm": 0.35393086075782776, "learning_rate": 1.4544769753737803e-05, "loss": 0.0311, "step": 26220 }, { "epoch": 6.863068367680732, "grad_norm": 0.49016067385673523, "learning_rate": 1.4525999197360874e-05, "loss": 0.0342, "step": 26230 }, { "epoch": 6.865685312397775, "grad_norm": 0.33892664313316345, "learning_rate": 1.450723579986474e-05, "loss": 0.0321, "step": 26240 }, { "epoch": 6.868302257114818, "grad_norm": 0.28874385356903076, "learning_rate": 1.4488479574074e-05, "loss": 0.0316, "step": 26250 }, { "epoch": 6.870919201831861, "grad_norm": 0.35149651765823364, "learning_rate": 1.4469730532808337e-05, "loss": 0.0351, "step": 26260 }, { "epoch": 6.873536146548904, "grad_norm": 0.3583768904209137, "learning_rate": 1.4450988688882563e-05, "loss": 0.0337, "step": 26270 }, { "epoch": 6.876153091265947, "grad_norm": 0.23553206026554108, "learning_rate": 1.4432254055106547e-05, "loss": 0.0326, "step": 26280 }, { "epoch": 6.87877003598299, "grad_norm": 0.4123249053955078, "learning_rate": 1.4413526644285252e-05, "loss": 0.0286, "step": 26290 }, { "epoch": 6.881386980700032, "grad_norm": 0.3820636570453644, "learning_rate": 1.4394806469218658e-05, "loss": 0.0276, "step": 26300 }, { "epoch": 6.884003925417075, "grad_norm": 0.38106122612953186, "learning_rate": 1.4376093542701841e-05, "loss": 0.0315, "step": 26310 }, { "epoch": 6.886620870134118, "grad_norm": 0.3545074462890625, "learning_rate": 1.4357387877524909e-05, "loss": 0.0371, "step": 26320 }, { "epoch": 6.889237814851161, "grad_norm": 0.39840003848075867, "learning_rate": 1.433868948647302e-05, "loss": 0.0358, "step": 26330 }, { "epoch": 6.891854759568204, "grad_norm": 0.3619614839553833, "learning_rate": 1.4319998382326327e-05, "loss": 0.0345, "step": 26340 }, { "epoch": 6.894471704285247, "grad_norm": 0.2903418242931366, "learning_rate": 1.4301314577860042e-05, "loss": 0.0341, "step": 26350 }, { "epoch": 6.89708864900229, "grad_norm": 0.27816247940063477, "learning_rate": 1.4282638085844351e-05, "loss": 0.029, "step": 26360 }, { "epoch": 6.899705593719332, "grad_norm": 0.32325780391693115, "learning_rate": 1.4263968919044472e-05, "loss": 0.0295, "step": 26370 }, { "epoch": 6.902322538436375, "grad_norm": 0.4054308235645294, "learning_rate": 1.42453070902206e-05, "loss": 0.0314, "step": 26380 }, { "epoch": 6.904939483153418, "grad_norm": 0.4908261001110077, "learning_rate": 1.4226652612127933e-05, "loss": 0.0371, "step": 26390 }, { "epoch": 6.907556427870461, "grad_norm": 0.42528581619262695, "learning_rate": 1.4208005497516608e-05, "loss": 0.0309, "step": 26400 }, { "epoch": 6.910173372587504, "grad_norm": 0.36515799164772034, "learning_rate": 1.4189365759131762e-05, "loss": 0.0362, "step": 26410 }, { "epoch": 6.912790317304547, "grad_norm": 0.35099223256111145, "learning_rate": 1.417073340971348e-05, "loss": 0.0289, "step": 26420 }, { "epoch": 6.91540726202159, "grad_norm": 0.26511016488075256, "learning_rate": 1.4152108461996811e-05, "loss": 0.0303, "step": 26430 }, { "epoch": 6.918024206738632, "grad_norm": 0.4376407265663147, "learning_rate": 1.4133490928711706e-05, "loss": 0.0319, "step": 26440 }, { "epoch": 6.920641151455675, "grad_norm": 0.31073182821273804, "learning_rate": 1.4114880822583099e-05, "loss": 0.0296, "step": 26450 }, { "epoch": 6.923258096172718, "grad_norm": 0.2978487014770508, "learning_rate": 1.40962781563308e-05, "loss": 0.0307, "step": 26460 }, { "epoch": 6.925875040889761, "grad_norm": 0.37724074721336365, "learning_rate": 1.4077682942669562e-05, "loss": 0.0288, "step": 26470 }, { "epoch": 6.928491985606804, "grad_norm": 0.31989142298698425, "learning_rate": 1.4059095194309047e-05, "loss": 0.0349, "step": 26480 }, { "epoch": 6.9311089303238465, "grad_norm": 0.2881931662559509, "learning_rate": 1.4040514923953807e-05, "loss": 0.0307, "step": 26490 }, { "epoch": 6.933725875040889, "grad_norm": 0.3236912488937378, "learning_rate": 1.4021942144303262e-05, "loss": 0.0312, "step": 26500 }, { "epoch": 6.936342819757932, "grad_norm": 0.27450498938560486, "learning_rate": 1.4003376868051748e-05, "loss": 0.0338, "step": 26510 }, { "epoch": 6.938959764474975, "grad_norm": 0.40014001727104187, "learning_rate": 1.3984819107888459e-05, "loss": 0.032, "step": 26520 }, { "epoch": 6.941576709192018, "grad_norm": 0.3042270243167877, "learning_rate": 1.3966268876497435e-05, "loss": 0.0309, "step": 26530 }, { "epoch": 6.944193653909061, "grad_norm": 0.4195344150066376, "learning_rate": 1.3947726186557592e-05, "loss": 0.0258, "step": 26540 }, { "epoch": 6.946810598626104, "grad_norm": 0.43801119923591614, "learning_rate": 1.3929191050742695e-05, "loss": 0.0303, "step": 26550 }, { "epoch": 6.9494275433431465, "grad_norm": 0.5042412281036377, "learning_rate": 1.3910663481721314e-05, "loss": 0.0346, "step": 26560 }, { "epoch": 6.9520444880601895, "grad_norm": 0.5035489797592163, "learning_rate": 1.3892143492156872e-05, "loss": 0.0355, "step": 26570 }, { "epoch": 6.9546614327772325, "grad_norm": 0.39491507411003113, "learning_rate": 1.3873631094707618e-05, "loss": 0.0369, "step": 26580 }, { "epoch": 6.9572783774942755, "grad_norm": 0.3853635787963867, "learning_rate": 1.3855126302026602e-05, "loss": 0.0326, "step": 26590 }, { "epoch": 6.9598953222113185, "grad_norm": 0.4158419370651245, "learning_rate": 1.383662912676166e-05, "loss": 0.0329, "step": 26600 }, { "epoch": 6.9625122669283614, "grad_norm": 0.36834490299224854, "learning_rate": 1.3818139581555456e-05, "loss": 0.0354, "step": 26610 }, { "epoch": 6.965129211645404, "grad_norm": 0.2967318296432495, "learning_rate": 1.3799657679045397e-05, "loss": 0.0271, "step": 26620 }, { "epoch": 6.9677461563624465, "grad_norm": 0.3893934190273285, "learning_rate": 1.3781183431863703e-05, "loss": 0.0308, "step": 26630 }, { "epoch": 6.9703631010794895, "grad_norm": 0.5401083827018738, "learning_rate": 1.3762716852637348e-05, "loss": 0.0384, "step": 26640 }, { "epoch": 6.9729800457965325, "grad_norm": 0.5646620988845825, "learning_rate": 1.374425795398807e-05, "loss": 0.0312, "step": 26650 }, { "epoch": 6.9755969905135755, "grad_norm": 0.31603124737739563, "learning_rate": 1.3725806748532338e-05, "loss": 0.0317, "step": 26660 }, { "epoch": 6.9782139352306185, "grad_norm": 0.42641425132751465, "learning_rate": 1.3707363248881383e-05, "loss": 0.0321, "step": 26670 }, { "epoch": 6.9808308799476615, "grad_norm": 0.5660805106163025, "learning_rate": 1.368892746764116e-05, "loss": 0.0321, "step": 26680 }, { "epoch": 6.983447824664704, "grad_norm": 0.4165215492248535, "learning_rate": 1.3670499417412373e-05, "loss": 0.0285, "step": 26690 }, { "epoch": 6.986064769381747, "grad_norm": 0.4457850754261017, "learning_rate": 1.3652079110790388e-05, "loss": 0.029, "step": 26700 }, { "epoch": 6.98868171409879, "grad_norm": 0.32575851678848267, "learning_rate": 1.3633666560365337e-05, "loss": 0.0289, "step": 26710 }, { "epoch": 6.991298658815833, "grad_norm": 0.24893328547477722, "learning_rate": 1.3615261778722008e-05, "loss": 0.0331, "step": 26720 }, { "epoch": 6.993915603532876, "grad_norm": 0.2833508551120758, "learning_rate": 1.3596864778439899e-05, "loss": 0.03, "step": 26730 }, { "epoch": 6.9965325482499185, "grad_norm": 0.4072076380252838, "learning_rate": 1.3578475572093185e-05, "loss": 0.0344, "step": 26740 }, { "epoch": 6.999149492966961, "grad_norm": 0.30101343989372253, "learning_rate": 1.3560094172250737e-05, "loss": 0.0324, "step": 26750 }, { "epoch": 7.001570166830226, "grad_norm": 0.3735860586166382, "learning_rate": 1.3541720591476033e-05, "loss": 0.0292, "step": 26760 }, { "epoch": 7.004187111547268, "grad_norm": 0.3974541425704956, "learning_rate": 1.3523354842327263e-05, "loss": 0.0309, "step": 26770 }, { "epoch": 7.006804056264311, "grad_norm": 0.32980749011039734, "learning_rate": 1.350499693735724e-05, "loss": 0.0335, "step": 26780 }, { "epoch": 7.009421000981354, "grad_norm": 0.3228886127471924, "learning_rate": 1.3486646889113427e-05, "loss": 0.032, "step": 26790 }, { "epoch": 7.012037945698397, "grad_norm": 0.2811286449432373, "learning_rate": 1.3468304710137902e-05, "loss": 0.0305, "step": 26800 }, { "epoch": 7.01465489041544, "grad_norm": 0.4297829270362854, "learning_rate": 1.344997041296736e-05, "loss": 0.031, "step": 26810 }, { "epoch": 7.017271835132483, "grad_norm": 0.3122144937515259, "learning_rate": 1.3431644010133132e-05, "loss": 0.032, "step": 26820 }, { "epoch": 7.019888779849525, "grad_norm": 0.4094059467315674, "learning_rate": 1.341332551416114e-05, "loss": 0.03, "step": 26830 }, { "epoch": 7.022505724566568, "grad_norm": 0.29214930534362793, "learning_rate": 1.33950149375719e-05, "loss": 0.0286, "step": 26840 }, { "epoch": 7.025122669283611, "grad_norm": 0.2737705111503601, "learning_rate": 1.3376712292880533e-05, "loss": 0.0274, "step": 26850 }, { "epoch": 7.027739614000654, "grad_norm": 0.3263177275657654, "learning_rate": 1.3358417592596705e-05, "loss": 0.0364, "step": 26860 }, { "epoch": 7.030356558717697, "grad_norm": 0.5150085091590881, "learning_rate": 1.334013084922468e-05, "loss": 0.0309, "step": 26870 }, { "epoch": 7.03297350343474, "grad_norm": 0.35235485434532166, "learning_rate": 1.3321852075263269e-05, "loss": 0.0258, "step": 26880 }, { "epoch": 7.035590448151782, "grad_norm": 0.26451367139816284, "learning_rate": 1.3303581283205858e-05, "loss": 0.0335, "step": 26890 }, { "epoch": 7.038207392868825, "grad_norm": 0.20942290127277374, "learning_rate": 1.3285318485540348e-05, "loss": 0.0241, "step": 26900 }, { "epoch": 7.040824337585868, "grad_norm": 0.24867643415927887, "learning_rate": 1.3267063694749182e-05, "loss": 0.0305, "step": 26910 }, { "epoch": 7.043441282302911, "grad_norm": 0.3994474411010742, "learning_rate": 1.3248816923309348e-05, "loss": 0.0256, "step": 26920 }, { "epoch": 7.046058227019954, "grad_norm": 0.2341812402009964, "learning_rate": 1.3230578183692339e-05, "loss": 0.0266, "step": 26930 }, { "epoch": 7.048675171736997, "grad_norm": 0.3208339512348175, "learning_rate": 1.3212347488364158e-05, "loss": 0.0302, "step": 26940 }, { "epoch": 7.05129211645404, "grad_norm": 0.28134050965309143, "learning_rate": 1.3194124849785334e-05, "loss": 0.0284, "step": 26950 }, { "epoch": 7.053909061171082, "grad_norm": 0.20203755795955658, "learning_rate": 1.3175910280410836e-05, "loss": 0.0292, "step": 26960 }, { "epoch": 7.056526005888125, "grad_norm": 0.2081657201051712, "learning_rate": 1.315770379269017e-05, "loss": 0.0287, "step": 26970 }, { "epoch": 7.059142950605168, "grad_norm": 0.37536320090293884, "learning_rate": 1.3139505399067298e-05, "loss": 0.0296, "step": 26980 }, { "epoch": 7.061759895322211, "grad_norm": 0.351087749004364, "learning_rate": 1.3121315111980653e-05, "loss": 0.0306, "step": 26990 }, { "epoch": 7.064376840039254, "grad_norm": 0.5465127229690552, "learning_rate": 1.3103132943863122e-05, "loss": 0.0336, "step": 27000 }, { "epoch": 7.064376840039254, "eval_loss": 0.03515085224030179, "eval_runtime": 9.687, "eval_samples_per_second": 105.709, "eval_steps_per_second": 1.652, "step": 27000 }, { "epoch": 7.066993784756297, "grad_norm": 0.19655290246009827, "learning_rate": 1.3084958907142033e-05, "loss": 0.0257, "step": 27010 }, { "epoch": 7.0696107294733395, "grad_norm": 0.4432132840156555, "learning_rate": 1.3066793014239182e-05, "loss": 0.0298, "step": 27020 }, { "epoch": 7.0722276741903825, "grad_norm": 0.2571084499359131, "learning_rate": 1.3048635277570776e-05, "loss": 0.0315, "step": 27030 }, { "epoch": 7.074844618907425, "grad_norm": 0.2586584687232971, "learning_rate": 1.303048570954747e-05, "loss": 0.0316, "step": 27040 }, { "epoch": 7.077461563624468, "grad_norm": 0.29508477449417114, "learning_rate": 1.3012344322574322e-05, "loss": 0.0291, "step": 27050 }, { "epoch": 7.080078508341511, "grad_norm": 0.3852759599685669, "learning_rate": 1.2994211129050782e-05, "loss": 0.0302, "step": 27060 }, { "epoch": 7.082695453058554, "grad_norm": 0.30965447425842285, "learning_rate": 1.2976086141370727e-05, "loss": 0.0326, "step": 27070 }, { "epoch": 7.085312397775597, "grad_norm": 0.5237151384353638, "learning_rate": 1.2957969371922427e-05, "loss": 0.0346, "step": 27080 }, { "epoch": 7.0879293424926395, "grad_norm": 0.27156862616539, "learning_rate": 1.2939860833088501e-05, "loss": 0.029, "step": 27090 }, { "epoch": 7.0905462872096825, "grad_norm": 0.29252901673316956, "learning_rate": 1.2921760537245986e-05, "loss": 0.0294, "step": 27100 }, { "epoch": 7.0931632319267255, "grad_norm": 0.35140424966812134, "learning_rate": 1.2903668496766244e-05, "loss": 0.0283, "step": 27110 }, { "epoch": 7.0957801766437685, "grad_norm": 0.34542346000671387, "learning_rate": 1.288558472401502e-05, "loss": 0.0303, "step": 27120 }, { "epoch": 7.0983971213608115, "grad_norm": 0.2882545590400696, "learning_rate": 1.2867509231352409e-05, "loss": 0.0302, "step": 27130 }, { "epoch": 7.1010140660778545, "grad_norm": 0.28216639161109924, "learning_rate": 1.2849442031132832e-05, "loss": 0.0281, "step": 27140 }, { "epoch": 7.103631010794897, "grad_norm": 0.2185831218957901, "learning_rate": 1.2831383135705067e-05, "loss": 0.0257, "step": 27150 }, { "epoch": 7.1062479555119396, "grad_norm": 0.2274857759475708, "learning_rate": 1.2813332557412171e-05, "loss": 0.0278, "step": 27160 }, { "epoch": 7.1088649002289825, "grad_norm": 0.22652925550937653, "learning_rate": 1.2795290308591574e-05, "loss": 0.0253, "step": 27170 }, { "epoch": 7.1114818449460255, "grad_norm": 0.3402152955532074, "learning_rate": 1.2777256401574956e-05, "loss": 0.0326, "step": 27180 }, { "epoch": 7.1140987896630685, "grad_norm": 0.43154576420783997, "learning_rate": 1.2759230848688331e-05, "loss": 0.0322, "step": 27190 }, { "epoch": 7.1167157343801115, "grad_norm": 0.4719350039958954, "learning_rate": 1.274121366225201e-05, "loss": 0.028, "step": 27200 }, { "epoch": 7.1193326790971545, "grad_norm": 0.30630409717559814, "learning_rate": 1.2723204854580548e-05, "loss": 0.029, "step": 27210 }, { "epoch": 7.121949623814197, "grad_norm": 0.39883729815483093, "learning_rate": 1.2705204437982805e-05, "loss": 0.0306, "step": 27220 }, { "epoch": 7.12456656853124, "grad_norm": 0.2656046748161316, "learning_rate": 1.268721242476189e-05, "loss": 0.0295, "step": 27230 }, { "epoch": 7.127183513248283, "grad_norm": 0.32729870080947876, "learning_rate": 1.2669228827215186e-05, "loss": 0.0284, "step": 27240 }, { "epoch": 7.129800457965326, "grad_norm": 0.3247520923614502, "learning_rate": 1.2651253657634315e-05, "loss": 0.0282, "step": 27250 }, { "epoch": 7.132417402682369, "grad_norm": 0.4259008467197418, "learning_rate": 1.2633286928305127e-05, "loss": 0.0338, "step": 27260 }, { "epoch": 7.135034347399412, "grad_norm": 0.2542956471443176, "learning_rate": 1.2615328651507701e-05, "loss": 0.029, "step": 27270 }, { "epoch": 7.137651292116454, "grad_norm": 0.26459887623786926, "learning_rate": 1.2597378839516364e-05, "loss": 0.0275, "step": 27280 }, { "epoch": 7.140268236833497, "grad_norm": 0.3292798399925232, "learning_rate": 1.2579437504599639e-05, "loss": 0.0319, "step": 27290 }, { "epoch": 7.14288518155054, "grad_norm": 0.35263168811798096, "learning_rate": 1.2561504659020269e-05, "loss": 0.0307, "step": 27300 }, { "epoch": 7.145502126267583, "grad_norm": 0.2072511911392212, "learning_rate": 1.254358031503517e-05, "loss": 0.0252, "step": 27310 }, { "epoch": 7.148119070984626, "grad_norm": 0.39903518557548523, "learning_rate": 1.2525664484895467e-05, "loss": 0.0312, "step": 27320 }, { "epoch": 7.150736015701669, "grad_norm": 0.33573272824287415, "learning_rate": 1.250775718084646e-05, "loss": 0.029, "step": 27330 }, { "epoch": 7.153352960418712, "grad_norm": 0.2959030568599701, "learning_rate": 1.2489858415127628e-05, "loss": 0.0279, "step": 27340 }, { "epoch": 7.155969905135754, "grad_norm": 0.26307299733161926, "learning_rate": 1.2471968199972616e-05, "loss": 0.0314, "step": 27350 }, { "epoch": 7.158586849852797, "grad_norm": 0.26677271723747253, "learning_rate": 1.2454086547609206e-05, "loss": 0.0266, "step": 27360 }, { "epoch": 7.16120379456984, "grad_norm": 0.3760434687137604, "learning_rate": 1.2436213470259331e-05, "loss": 0.0316, "step": 27370 }, { "epoch": 7.163820739286883, "grad_norm": 0.2940519452095032, "learning_rate": 1.2418348980139078e-05, "loss": 0.0253, "step": 27380 }, { "epoch": 7.166437684003926, "grad_norm": 0.33235180377960205, "learning_rate": 1.240049308945866e-05, "loss": 0.0274, "step": 27390 }, { "epoch": 7.169054628720969, "grad_norm": 0.31940221786499023, "learning_rate": 1.2382645810422418e-05, "loss": 0.0245, "step": 27400 }, { "epoch": 7.171671573438011, "grad_norm": 0.20587033033370972, "learning_rate": 1.236480715522878e-05, "loss": 0.0311, "step": 27410 }, { "epoch": 7.174288518155054, "grad_norm": 0.2504628896713257, "learning_rate": 1.2346977136070311e-05, "loss": 0.0296, "step": 27420 }, { "epoch": 7.176905462872097, "grad_norm": 0.23364555835723877, "learning_rate": 1.2329155765133658e-05, "loss": 0.0282, "step": 27430 }, { "epoch": 7.17952240758914, "grad_norm": 0.23926670849323273, "learning_rate": 1.2311343054599562e-05, "loss": 0.0315, "step": 27440 }, { "epoch": 7.182139352306183, "grad_norm": 0.4970473349094391, "learning_rate": 1.2293539016642847e-05, "loss": 0.0359, "step": 27450 }, { "epoch": 7.184756297023226, "grad_norm": 0.3282844126224518, "learning_rate": 1.22757436634324e-05, "loss": 0.0317, "step": 27460 }, { "epoch": 7.187373241740268, "grad_norm": 0.27741098403930664, "learning_rate": 1.2257957007131168e-05, "loss": 0.0339, "step": 27470 }, { "epoch": 7.189990186457311, "grad_norm": 0.3691469728946686, "learning_rate": 1.2240179059896172e-05, "loss": 0.0267, "step": 27480 }, { "epoch": 7.192607131174354, "grad_norm": 0.3223341107368469, "learning_rate": 1.2222409833878471e-05, "loss": 0.0323, "step": 27490 }, { "epoch": 7.195224075891397, "grad_norm": 0.37310001254081726, "learning_rate": 1.2204649341223173e-05, "loss": 0.0247, "step": 27500 }, { "epoch": 7.19784102060844, "grad_norm": 0.3943912386894226, "learning_rate": 1.2186897594069385e-05, "loss": 0.0292, "step": 27510 }, { "epoch": 7.200457965325483, "grad_norm": 0.4825931191444397, "learning_rate": 1.2169154604550274e-05, "loss": 0.0309, "step": 27520 }, { "epoch": 7.203074910042526, "grad_norm": 0.4375806450843811, "learning_rate": 1.2151420384793002e-05, "loss": 0.0304, "step": 27530 }, { "epoch": 7.205691854759568, "grad_norm": 0.44367581605911255, "learning_rate": 1.213369494691875e-05, "loss": 0.0283, "step": 27540 }, { "epoch": 7.208308799476611, "grad_norm": 0.28335586190223694, "learning_rate": 1.2115978303042671e-05, "loss": 0.0303, "step": 27550 }, { "epoch": 7.210925744193654, "grad_norm": 0.3478769361972809, "learning_rate": 1.2098270465273945e-05, "loss": 0.0269, "step": 27560 }, { "epoch": 7.213542688910697, "grad_norm": 0.1998811960220337, "learning_rate": 1.2080571445715687e-05, "loss": 0.0342, "step": 27570 }, { "epoch": 7.21615963362774, "grad_norm": 0.32851898670196533, "learning_rate": 1.2062881256465024e-05, "loss": 0.0259, "step": 27580 }, { "epoch": 7.218776578344783, "grad_norm": 0.24480850994586945, "learning_rate": 1.2045199909613034e-05, "loss": 0.027, "step": 27590 }, { "epoch": 7.221393523061825, "grad_norm": 0.34246158599853516, "learning_rate": 1.2027527417244757e-05, "loss": 0.0294, "step": 27600 }, { "epoch": 7.224010467778868, "grad_norm": 0.2054915428161621, "learning_rate": 1.200986379143916e-05, "loss": 0.0281, "step": 27610 }, { "epoch": 7.226627412495911, "grad_norm": 0.38791176676750183, "learning_rate": 1.199220904426917e-05, "loss": 0.0301, "step": 27620 }, { "epoch": 7.229244357212954, "grad_norm": 0.3540157973766327, "learning_rate": 1.1974563187801644e-05, "loss": 0.0309, "step": 27630 }, { "epoch": 7.231861301929997, "grad_norm": 0.3500435948371887, "learning_rate": 1.1956926234097362e-05, "loss": 0.0325, "step": 27640 }, { "epoch": 7.23447824664704, "grad_norm": 0.3816050887107849, "learning_rate": 1.1939298195211005e-05, "loss": 0.0315, "step": 27650 }, { "epoch": 7.237095191364083, "grad_norm": 0.3657297194004059, "learning_rate": 1.1921679083191184e-05, "loss": 0.0279, "step": 27660 }, { "epoch": 7.239712136081125, "grad_norm": 0.3235689401626587, "learning_rate": 1.1904068910080379e-05, "loss": 0.032, "step": 27670 }, { "epoch": 7.242329080798168, "grad_norm": 0.3269582986831665, "learning_rate": 1.1886467687914988e-05, "loss": 0.0281, "step": 27680 }, { "epoch": 7.244946025515211, "grad_norm": 0.3353860080242157, "learning_rate": 1.1868875428725276e-05, "loss": 0.033, "step": 27690 }, { "epoch": 7.247562970232254, "grad_norm": 0.3661133348941803, "learning_rate": 1.1851292144535403e-05, "loss": 0.0267, "step": 27700 }, { "epoch": 7.250179914949297, "grad_norm": 0.37683379650115967, "learning_rate": 1.183371784736335e-05, "loss": 0.0265, "step": 27710 }, { "epoch": 7.25279685966634, "grad_norm": 0.2772020995616913, "learning_rate": 1.1816152549221002e-05, "loss": 0.036, "step": 27720 }, { "epoch": 7.255413804383382, "grad_norm": 0.42098507285118103, "learning_rate": 1.1798596262114078e-05, "loss": 0.0357, "step": 27730 }, { "epoch": 7.258030749100425, "grad_norm": 0.3395950198173523, "learning_rate": 1.178104899804212e-05, "loss": 0.0275, "step": 27740 }, { "epoch": 7.260647693817468, "grad_norm": 0.37505412101745605, "learning_rate": 1.176351076899852e-05, "loss": 0.0303, "step": 27750 }, { "epoch": 7.263264638534511, "grad_norm": 0.25512251257896423, "learning_rate": 1.1745981586970509e-05, "loss": 0.0305, "step": 27760 }, { "epoch": 7.265881583251554, "grad_norm": 0.3966490924358368, "learning_rate": 1.1728461463939098e-05, "loss": 0.0271, "step": 27770 }, { "epoch": 7.268498527968597, "grad_norm": 0.36627981066703796, "learning_rate": 1.1710950411879129e-05, "loss": 0.0277, "step": 27780 }, { "epoch": 7.27111547268564, "grad_norm": 0.3093130588531494, "learning_rate": 1.1693448442759248e-05, "loss": 0.0308, "step": 27790 }, { "epoch": 7.273732417402682, "grad_norm": 0.36231744289398193, "learning_rate": 1.167595556854189e-05, "loss": 0.033, "step": 27800 }, { "epoch": 7.276349362119725, "grad_norm": 0.2014455944299698, "learning_rate": 1.1658471801183255e-05, "loss": 0.0274, "step": 27810 }, { "epoch": 7.278966306836768, "grad_norm": 0.3836444020271301, "learning_rate": 1.1640997152633351e-05, "loss": 0.0309, "step": 27820 }, { "epoch": 7.281583251553811, "grad_norm": 0.2450692355632782, "learning_rate": 1.1623531634835913e-05, "loss": 0.0341, "step": 27830 }, { "epoch": 7.284200196270854, "grad_norm": 0.23850251734256744, "learning_rate": 1.1606075259728474e-05, "loss": 0.0237, "step": 27840 }, { "epoch": 7.286817140987897, "grad_norm": 0.3725256025791168, "learning_rate": 1.1588628039242294e-05, "loss": 0.0262, "step": 27850 }, { "epoch": 7.289434085704939, "grad_norm": 0.31703150272369385, "learning_rate": 1.1571189985302399e-05, "loss": 0.025, "step": 27860 }, { "epoch": 7.292051030421982, "grad_norm": 0.3323570191860199, "learning_rate": 1.1553761109827513e-05, "loss": 0.029, "step": 27870 }, { "epoch": 7.294667975139025, "grad_norm": 0.30076855421066284, "learning_rate": 1.1536341424730118e-05, "loss": 0.0299, "step": 27880 }, { "epoch": 7.297284919856068, "grad_norm": 0.3726050853729248, "learning_rate": 1.1518930941916405e-05, "loss": 0.0264, "step": 27890 }, { "epoch": 7.299901864573111, "grad_norm": 0.30087316036224365, "learning_rate": 1.1501529673286286e-05, "loss": 0.0363, "step": 27900 }, { "epoch": 7.302518809290154, "grad_norm": 0.2716040313243866, "learning_rate": 1.1484137630733338e-05, "loss": 0.0288, "step": 27910 }, { "epoch": 7.305135754007196, "grad_norm": 0.28562942147254944, "learning_rate": 1.1466754826144885e-05, "loss": 0.0342, "step": 27920 }, { "epoch": 7.307752698724239, "grad_norm": 0.26102012395858765, "learning_rate": 1.1449381271401888e-05, "loss": 0.0295, "step": 27930 }, { "epoch": 7.310369643441282, "grad_norm": 0.2839173972606659, "learning_rate": 1.1432016978379015e-05, "loss": 0.0266, "step": 27940 }, { "epoch": 7.312986588158325, "grad_norm": 0.2480892837047577, "learning_rate": 1.14146619589446e-05, "loss": 0.0292, "step": 27950 }, { "epoch": 7.315603532875368, "grad_norm": 0.26429545879364014, "learning_rate": 1.1397316224960643e-05, "loss": 0.0292, "step": 27960 }, { "epoch": 7.318220477592411, "grad_norm": 0.38657790422439575, "learning_rate": 1.1379979788282775e-05, "loss": 0.0299, "step": 27970 }, { "epoch": 7.320837422309454, "grad_norm": 0.34630465507507324, "learning_rate": 1.136265266076029e-05, "loss": 0.0272, "step": 27980 }, { "epoch": 7.323454367026496, "grad_norm": 0.39583563804626465, "learning_rate": 1.1345334854236116e-05, "loss": 0.0269, "step": 27990 }, { "epoch": 7.326071311743539, "grad_norm": 0.30565205216407776, "learning_rate": 1.1328026380546828e-05, "loss": 0.0312, "step": 28000 }, { "epoch": 7.326071311743539, "eval_loss": 0.03350659035862492, "eval_runtime": 9.5497, "eval_samples_per_second": 107.228, "eval_steps_per_second": 1.675, "step": 28000 }, { "epoch": 7.328688256460582, "grad_norm": 0.4849902093410492, "learning_rate": 1.1310727251522585e-05, "loss": 0.0315, "step": 28010 }, { "epoch": 7.331305201177625, "grad_norm": 0.3135029375553131, "learning_rate": 1.1293437478987176e-05, "loss": 0.0295, "step": 28020 }, { "epoch": 7.333922145894668, "grad_norm": 0.25514817237854004, "learning_rate": 1.1276157074758006e-05, "loss": 0.0277, "step": 28030 }, { "epoch": 7.336539090611711, "grad_norm": 0.31504568457603455, "learning_rate": 1.1258886050646067e-05, "loss": 0.0272, "step": 28040 }, { "epoch": 7.339156035328753, "grad_norm": 0.2858773469924927, "learning_rate": 1.124162441845594e-05, "loss": 0.0293, "step": 28050 }, { "epoch": 7.341772980045796, "grad_norm": 0.29029589891433716, "learning_rate": 1.12243721899858e-05, "loss": 0.0356, "step": 28060 }, { "epoch": 7.344389924762839, "grad_norm": 0.3983522951602936, "learning_rate": 1.120712937702736e-05, "loss": 0.0287, "step": 28070 }, { "epoch": 7.347006869479882, "grad_norm": 0.430837482213974, "learning_rate": 1.1189895991365934e-05, "loss": 0.0291, "step": 28080 }, { "epoch": 7.349623814196925, "grad_norm": 0.37221601605415344, "learning_rate": 1.1172672044780378e-05, "loss": 0.0285, "step": 28090 }, { "epoch": 7.352240758913968, "grad_norm": 0.29085683822631836, "learning_rate": 1.1155457549043103e-05, "loss": 0.0345, "step": 28100 }, { "epoch": 7.35485770363101, "grad_norm": 0.3261438012123108, "learning_rate": 1.1138252515920045e-05, "loss": 0.0299, "step": 28110 }, { "epoch": 7.357474648348053, "grad_norm": 0.2531701922416687, "learning_rate": 1.1121056957170679e-05, "loss": 0.0287, "step": 28120 }, { "epoch": 7.360091593065096, "grad_norm": 0.39535531401634216, "learning_rate": 1.110387088454801e-05, "loss": 0.0308, "step": 28130 }, { "epoch": 7.362708537782139, "grad_norm": 0.18832387030124664, "learning_rate": 1.1086694309798557e-05, "loss": 0.0312, "step": 28140 }, { "epoch": 7.365325482499182, "grad_norm": 0.2911202013492584, "learning_rate": 1.106952724466235e-05, "loss": 0.0259, "step": 28150 }, { "epoch": 7.367942427216225, "grad_norm": 0.4606091380119324, "learning_rate": 1.1052369700872924e-05, "loss": 0.0321, "step": 28160 }, { "epoch": 7.370559371933268, "grad_norm": 0.3079313039779663, "learning_rate": 1.103522169015728e-05, "loss": 0.0283, "step": 28170 }, { "epoch": 7.37317631665031, "grad_norm": 0.3586598336696625, "learning_rate": 1.101808322423593e-05, "loss": 0.0303, "step": 28180 }, { "epoch": 7.375793261367353, "grad_norm": 0.497734934091568, "learning_rate": 1.1000954314822856e-05, "loss": 0.0278, "step": 28190 }, { "epoch": 7.378410206084396, "grad_norm": 0.2749795615673065, "learning_rate": 1.0983834973625512e-05, "loss": 0.0274, "step": 28200 }, { "epoch": 7.381027150801439, "grad_norm": 0.4311588704586029, "learning_rate": 1.0966725212344791e-05, "loss": 0.0235, "step": 28210 }, { "epoch": 7.383644095518482, "grad_norm": 0.23907941579818726, "learning_rate": 1.0949625042675071e-05, "loss": 0.0263, "step": 28220 }, { "epoch": 7.386261040235525, "grad_norm": 0.32718607783317566, "learning_rate": 1.0932534476304138e-05, "loss": 0.0264, "step": 28230 }, { "epoch": 7.388877984952568, "grad_norm": 0.260172963142395, "learning_rate": 1.0915453524913243e-05, "loss": 0.0257, "step": 28240 }, { "epoch": 7.39149492966961, "grad_norm": 0.4206393361091614, "learning_rate": 1.0898382200177055e-05, "loss": 0.0328, "step": 28250 }, { "epoch": 7.394111874386653, "grad_norm": 0.43060967326164246, "learning_rate": 1.0881320513763668e-05, "loss": 0.0303, "step": 28260 }, { "epoch": 7.396728819103696, "grad_norm": 0.575630784034729, "learning_rate": 1.0864268477334571e-05, "loss": 0.0377, "step": 28270 }, { "epoch": 7.399345763820739, "grad_norm": 0.3237912356853485, "learning_rate": 1.084722610254468e-05, "loss": 0.0278, "step": 28280 }, { "epoch": 7.401962708537782, "grad_norm": 0.25009626150131226, "learning_rate": 1.0830193401042305e-05, "loss": 0.028, "step": 28290 }, { "epoch": 7.404579653254825, "grad_norm": 0.2242855280637741, "learning_rate": 1.0813170384469115e-05, "loss": 0.0248, "step": 28300 }, { "epoch": 7.4071965979718675, "grad_norm": 0.3656257390975952, "learning_rate": 1.0796157064460194e-05, "loss": 0.0301, "step": 28310 }, { "epoch": 7.4098135426889105, "grad_norm": 0.4277788996696472, "learning_rate": 1.0779153452643995e-05, "loss": 0.0304, "step": 28320 }, { "epoch": 7.4124304874059534, "grad_norm": 0.31005632877349854, "learning_rate": 1.0762159560642307e-05, "loss": 0.0265, "step": 28330 }, { "epoch": 7.415047432122996, "grad_norm": 0.3337995409965515, "learning_rate": 1.0745175400070303e-05, "loss": 0.0306, "step": 28340 }, { "epoch": 7.417664376840039, "grad_norm": 0.47239336371421814, "learning_rate": 1.0728200982536501e-05, "loss": 0.0272, "step": 28350 }, { "epoch": 7.420281321557082, "grad_norm": 0.3395785689353943, "learning_rate": 1.0711236319642762e-05, "loss": 0.0278, "step": 28360 }, { "epoch": 7.4228982662741245, "grad_norm": 0.4349222779273987, "learning_rate": 1.069428142298425e-05, "loss": 0.029, "step": 28370 }, { "epoch": 7.4255152109911675, "grad_norm": 0.3717556297779083, "learning_rate": 1.0677336304149505e-05, "loss": 0.025, "step": 28380 }, { "epoch": 7.4281321557082105, "grad_norm": 0.17544744908809662, "learning_rate": 1.0660400974720327e-05, "loss": 0.0312, "step": 28390 }, { "epoch": 7.4307491004252535, "grad_norm": 0.21056394279003143, "learning_rate": 1.0643475446271872e-05, "loss": 0.0272, "step": 28400 }, { "epoch": 7.4333660451422965, "grad_norm": 0.38550424575805664, "learning_rate": 1.0626559730372575e-05, "loss": 0.0261, "step": 28410 }, { "epoch": 7.4359829898593395, "grad_norm": 0.33460870385169983, "learning_rate": 1.0609653838584177e-05, "loss": 0.0241, "step": 28420 }, { "epoch": 7.4385999345763825, "grad_norm": 0.2785506546497345, "learning_rate": 1.059275778246168e-05, "loss": 0.0254, "step": 28430 }, { "epoch": 7.441216879293425, "grad_norm": 0.3150511682033539, "learning_rate": 1.0575871573553387e-05, "loss": 0.0236, "step": 28440 }, { "epoch": 7.443833824010468, "grad_norm": 0.21259582042694092, "learning_rate": 1.055899522340086e-05, "loss": 0.0271, "step": 28450 }, { "epoch": 7.4464507687275105, "grad_norm": 0.2286917269229889, "learning_rate": 1.0542128743538937e-05, "loss": 0.0268, "step": 28460 }, { "epoch": 7.4490677134445535, "grad_norm": 0.4267086684703827, "learning_rate": 1.0525272145495679e-05, "loss": 0.0294, "step": 28470 }, { "epoch": 7.4516846581615965, "grad_norm": 0.3371066749095917, "learning_rate": 1.050842544079243e-05, "loss": 0.0278, "step": 28480 }, { "epoch": 7.4543016028786395, "grad_norm": 0.1786905974149704, "learning_rate": 1.0491588640943736e-05, "loss": 0.0283, "step": 28490 }, { "epoch": 7.4569185475956825, "grad_norm": 0.35492804646492004, "learning_rate": 1.0474761757457399e-05, "loss": 0.028, "step": 28500 }, { "epoch": 7.459535492312725, "grad_norm": 0.3589496910572052, "learning_rate": 1.0457944801834438e-05, "loss": 0.0322, "step": 28510 }, { "epoch": 7.462152437029768, "grad_norm": 0.22844111919403076, "learning_rate": 1.0441137785569088e-05, "loss": 0.0292, "step": 28520 }, { "epoch": 7.464769381746811, "grad_norm": 0.26975587010383606, "learning_rate": 1.0424340720148773e-05, "loss": 0.0227, "step": 28530 }, { "epoch": 7.467386326463854, "grad_norm": 0.8781476020812988, "learning_rate": 1.0407553617054135e-05, "loss": 0.0237, "step": 28540 }, { "epoch": 7.470003271180897, "grad_norm": 0.2750764787197113, "learning_rate": 1.0390776487759001e-05, "loss": 0.0287, "step": 28550 }, { "epoch": 7.47262021589794, "grad_norm": 0.44443485140800476, "learning_rate": 1.037400934373039e-05, "loss": 0.025, "step": 28560 }, { "epoch": 7.475237160614982, "grad_norm": 0.3391941785812378, "learning_rate": 1.0357252196428477e-05, "loss": 0.0294, "step": 28570 }, { "epoch": 7.477854105332025, "grad_norm": 0.3009742200374603, "learning_rate": 1.0340505057306608e-05, "loss": 0.0301, "step": 28580 }, { "epoch": 7.480471050049068, "grad_norm": 0.36080223321914673, "learning_rate": 1.0323767937811299e-05, "loss": 0.0294, "step": 28590 }, { "epoch": 7.483087994766111, "grad_norm": 0.4406573474407196, "learning_rate": 1.0307040849382216e-05, "loss": 0.0301, "step": 28600 }, { "epoch": 7.485704939483154, "grad_norm": 0.2862299978733063, "learning_rate": 1.0290323803452167e-05, "loss": 0.0248, "step": 28610 }, { "epoch": 7.488321884200197, "grad_norm": 0.4238396883010864, "learning_rate": 1.0273616811447104e-05, "loss": 0.0256, "step": 28620 }, { "epoch": 7.490938828917239, "grad_norm": 0.3760383129119873, "learning_rate": 1.0256919884786078e-05, "loss": 0.0296, "step": 28630 }, { "epoch": 7.493555773634282, "grad_norm": 0.26118260622024536, "learning_rate": 1.0240233034881292e-05, "loss": 0.027, "step": 28640 }, { "epoch": 7.496172718351325, "grad_norm": 0.39909711480140686, "learning_rate": 1.0223556273138052e-05, "loss": 0.0288, "step": 28650 }, { "epoch": 7.498789663068368, "grad_norm": 0.27352455258369446, "learning_rate": 1.0206889610954774e-05, "loss": 0.028, "step": 28660 }, { "epoch": 7.501406607785411, "grad_norm": 0.2847869098186493, "learning_rate": 1.0190233059722956e-05, "loss": 0.0275, "step": 28670 }, { "epoch": 7.504023552502454, "grad_norm": 0.37741467356681824, "learning_rate": 1.0173586630827186e-05, "loss": 0.0316, "step": 28680 }, { "epoch": 7.506640497219497, "grad_norm": 0.33624228835105896, "learning_rate": 1.015695033564515e-05, "loss": 0.0277, "step": 28690 }, { "epoch": 7.509257441936539, "grad_norm": 0.31950801610946655, "learning_rate": 1.0140324185547594e-05, "loss": 0.0275, "step": 28700 }, { "epoch": 7.511874386653582, "grad_norm": 0.24474136531352997, "learning_rate": 1.0123708191898343e-05, "loss": 0.0288, "step": 28710 }, { "epoch": 7.514491331370625, "grad_norm": 0.38690003752708435, "learning_rate": 1.0107102366054274e-05, "loss": 0.0259, "step": 28720 }, { "epoch": 7.517108276087668, "grad_norm": 0.1933925300836563, "learning_rate": 1.0090506719365298e-05, "loss": 0.0284, "step": 28730 }, { "epoch": 7.519725220804711, "grad_norm": 0.27855780720710754, "learning_rate": 1.0073921263174391e-05, "loss": 0.0263, "step": 28740 }, { "epoch": 7.522342165521754, "grad_norm": 0.23865076899528503, "learning_rate": 1.0057346008817556e-05, "loss": 0.0266, "step": 28750 }, { "epoch": 7.524959110238796, "grad_norm": 0.34086933732032776, "learning_rate": 1.0040780967623833e-05, "loss": 0.0349, "step": 28760 }, { "epoch": 7.527576054955839, "grad_norm": 0.29268211126327515, "learning_rate": 1.0024226150915261e-05, "loss": 0.0283, "step": 28770 }, { "epoch": 7.530192999672882, "grad_norm": 0.24367311596870422, "learning_rate": 1.0007681570006894e-05, "loss": 0.0277, "step": 28780 }, { "epoch": 7.532809944389925, "grad_norm": 0.36311009526252747, "learning_rate": 9.991147236206803e-06, "loss": 0.03, "step": 28790 }, { "epoch": 7.535426889106968, "grad_norm": 0.3049049973487854, "learning_rate": 9.974623160816051e-06, "loss": 0.0272, "step": 28800 }, { "epoch": 7.538043833824011, "grad_norm": 0.4554224908351898, "learning_rate": 9.958109355128689e-06, "loss": 0.0286, "step": 28810 }, { "epoch": 7.540660778541053, "grad_norm": 0.24151623249053955, "learning_rate": 9.941605830431756e-06, "loss": 0.0295, "step": 28820 }, { "epoch": 7.543277723258096, "grad_norm": 0.3120863139629364, "learning_rate": 9.925112598005234e-06, "loss": 0.031, "step": 28830 }, { "epoch": 7.545894667975139, "grad_norm": 0.29075419902801514, "learning_rate": 9.908629669122104e-06, "loss": 0.0255, "step": 28840 }, { "epoch": 7.548511612692182, "grad_norm": 0.20289865136146545, "learning_rate": 9.892157055048304e-06, "loss": 0.0311, "step": 28850 }, { "epoch": 7.551128557409225, "grad_norm": 0.40245625376701355, "learning_rate": 9.875694767042687e-06, "loss": 0.0306, "step": 28860 }, { "epoch": 7.553745502126268, "grad_norm": 0.25056540966033936, "learning_rate": 9.859242816357092e-06, "loss": 0.0276, "step": 28870 }, { "epoch": 7.556362446843311, "grad_norm": 0.2368532121181488, "learning_rate": 9.842801214236255e-06, "loss": 0.0303, "step": 28880 }, { "epoch": 7.558979391560353, "grad_norm": 0.32772502303123474, "learning_rate": 9.826369971917865e-06, "loss": 0.0279, "step": 28890 }, { "epoch": 7.561596336277396, "grad_norm": 0.3745412528514862, "learning_rate": 9.80994910063252e-06, "loss": 0.0268, "step": 28900 }, { "epoch": 7.564213280994439, "grad_norm": 0.2986398935317993, "learning_rate": 9.79353861160373e-06, "loss": 0.0293, "step": 28910 }, { "epoch": 7.566830225711482, "grad_norm": 0.364200621843338, "learning_rate": 9.77713851604792e-06, "loss": 0.0301, "step": 28920 }, { "epoch": 7.569447170428525, "grad_norm": 0.417913556098938, "learning_rate": 9.760748825174382e-06, "loss": 0.0265, "step": 28930 }, { "epoch": 7.572064115145568, "grad_norm": 0.36317500472068787, "learning_rate": 9.744369550185334e-06, "loss": 0.0274, "step": 28940 }, { "epoch": 7.574681059862611, "grad_norm": 0.4232650101184845, "learning_rate": 9.728000702275839e-06, "loss": 0.027, "step": 28950 }, { "epoch": 7.577298004579653, "grad_norm": 0.4161834716796875, "learning_rate": 9.711642292633854e-06, "loss": 0.0313, "step": 28960 }, { "epoch": 7.579914949296696, "grad_norm": 0.40490856766700745, "learning_rate": 9.695294332440214e-06, "loss": 0.0326, "step": 28970 }, { "epoch": 7.582531894013739, "grad_norm": 0.26717275381088257, "learning_rate": 9.678956832868572e-06, "loss": 0.0258, "step": 28980 }, { "epoch": 7.585148838730782, "grad_norm": 0.3033079206943512, "learning_rate": 9.662629805085466e-06, "loss": 0.0271, "step": 28990 }, { "epoch": 7.587765783447825, "grad_norm": 0.4634299874305725, "learning_rate": 9.646313260250267e-06, "loss": 0.0287, "step": 29000 }, { "epoch": 7.587765783447825, "eval_loss": 0.03173584048044022, "eval_runtime": 9.2653, "eval_samples_per_second": 110.519, "eval_steps_per_second": 1.727, "step": 29000 }, { "epoch": 7.590382728164867, "grad_norm": 0.30492672324180603, "learning_rate": 9.630007209515177e-06, "loss": 0.03, "step": 29010 }, { "epoch": 7.59299967288191, "grad_norm": 0.5153994560241699, "learning_rate": 9.613711664025237e-06, "loss": 0.0258, "step": 29020 }, { "epoch": 7.595616617598953, "grad_norm": 0.4768589437007904, "learning_rate": 9.597426634918291e-06, "loss": 0.0279, "step": 29030 }, { "epoch": 7.598233562315996, "grad_norm": 0.3797157406806946, "learning_rate": 9.581152133324994e-06, "loss": 0.0275, "step": 29040 }, { "epoch": 7.600850507033039, "grad_norm": 0.2744821608066559, "learning_rate": 9.564888170368825e-06, "loss": 0.0275, "step": 29050 }, { "epoch": 7.603467451750082, "grad_norm": 0.3670021891593933, "learning_rate": 9.548634757166041e-06, "loss": 0.034, "step": 29060 }, { "epoch": 7.606084396467125, "grad_norm": 0.2540300190448761, "learning_rate": 9.532391904825716e-06, "loss": 0.03, "step": 29070 }, { "epoch": 7.608701341184167, "grad_norm": 0.2962121367454529, "learning_rate": 9.516159624449667e-06, "loss": 0.025, "step": 29080 }, { "epoch": 7.61131828590121, "grad_norm": 0.2839810848236084, "learning_rate": 9.499937927132508e-06, "loss": 0.027, "step": 29090 }, { "epoch": 7.613935230618253, "grad_norm": 0.3168564736843109, "learning_rate": 9.483726823961616e-06, "loss": 0.0262, "step": 29100 }, { "epoch": 7.616552175335296, "grad_norm": 0.3472195863723755, "learning_rate": 9.467526326017135e-06, "loss": 0.0275, "step": 29110 }, { "epoch": 7.619169120052339, "grad_norm": 0.289442777633667, "learning_rate": 9.451336444371953e-06, "loss": 0.0296, "step": 29120 }, { "epoch": 7.621786064769382, "grad_norm": 0.35899093747138977, "learning_rate": 9.435157190091698e-06, "loss": 0.0294, "step": 29130 }, { "epoch": 7.624403009486425, "grad_norm": 0.4903254210948944, "learning_rate": 9.418988574234724e-06, "loss": 0.0282, "step": 29140 }, { "epoch": 7.627019954203467, "grad_norm": 0.32704609632492065, "learning_rate": 9.402830607852145e-06, "loss": 0.031, "step": 29150 }, { "epoch": 7.62963689892051, "grad_norm": 0.3716278374195099, "learning_rate": 9.38668330198777e-06, "loss": 0.029, "step": 29160 }, { "epoch": 7.632253843637553, "grad_norm": 0.3026614487171173, "learning_rate": 9.37054666767814e-06, "loss": 0.0289, "step": 29170 }, { "epoch": 7.634870788354596, "grad_norm": 0.23007415235042572, "learning_rate": 9.35442071595248e-06, "loss": 0.0288, "step": 29180 }, { "epoch": 7.637487733071639, "grad_norm": 0.36757123470306396, "learning_rate": 9.33830545783273e-06, "loss": 0.0278, "step": 29190 }, { "epoch": 7.640104677788682, "grad_norm": 0.3388332426548004, "learning_rate": 9.32220090433352e-06, "loss": 0.0247, "step": 29200 }, { "epoch": 7.642721622505725, "grad_norm": 0.25686201453208923, "learning_rate": 9.306107066462159e-06, "loss": 0.0252, "step": 29210 }, { "epoch": 7.645338567222767, "grad_norm": 0.18186792731285095, "learning_rate": 9.29002395521864e-06, "loss": 0.0273, "step": 29220 }, { "epoch": 7.64795551193981, "grad_norm": 0.26176708936691284, "learning_rate": 9.273951581595614e-06, "loss": 0.0251, "step": 29230 }, { "epoch": 7.650572456656853, "grad_norm": 0.3464919924736023, "learning_rate": 9.257889956578383e-06, "loss": 0.0323, "step": 29240 }, { "epoch": 7.653189401373896, "grad_norm": 0.31770631670951843, "learning_rate": 9.24183909114493e-06, "loss": 0.0272, "step": 29250 }, { "epoch": 7.655806346090939, "grad_norm": 0.39476248621940613, "learning_rate": 9.225798996265867e-06, "loss": 0.0277, "step": 29260 }, { "epoch": 7.658423290807981, "grad_norm": 0.28465256094932556, "learning_rate": 9.209769682904457e-06, "loss": 0.0256, "step": 29270 }, { "epoch": 7.661040235525024, "grad_norm": 0.4322209358215332, "learning_rate": 9.193751162016565e-06, "loss": 0.0288, "step": 29280 }, { "epoch": 7.663657180242067, "grad_norm": 0.3503320515155792, "learning_rate": 9.17774344455071e-06, "loss": 0.0284, "step": 29290 }, { "epoch": 7.66627412495911, "grad_norm": 0.36152294278144836, "learning_rate": 9.161746541448019e-06, "loss": 0.027, "step": 29300 }, { "epoch": 7.668891069676153, "grad_norm": 0.2110384702682495, "learning_rate": 9.145760463642227e-06, "loss": 0.0276, "step": 29310 }, { "epoch": 7.671508014393196, "grad_norm": 0.32499822974205017, "learning_rate": 9.129785222059653e-06, "loss": 0.026, "step": 29320 }, { "epoch": 7.674124959110239, "grad_norm": 0.286526083946228, "learning_rate": 9.113820827619244e-06, "loss": 0.0283, "step": 29330 }, { "epoch": 7.676741903827281, "grad_norm": 0.24248047173023224, "learning_rate": 9.09786729123249e-06, "loss": 0.0253, "step": 29340 }, { "epoch": 7.679358848544324, "grad_norm": 0.38848909735679626, "learning_rate": 9.081924623803495e-06, "loss": 0.032, "step": 29350 }, { "epoch": 7.681975793261367, "grad_norm": 0.3085334002971649, "learning_rate": 9.06599283622892e-06, "loss": 0.0291, "step": 29360 }, { "epoch": 7.68459273797841, "grad_norm": 0.2823046147823334, "learning_rate": 9.050071939398003e-06, "loss": 0.0244, "step": 29370 }, { "epoch": 7.687209682695453, "grad_norm": 0.3811781406402588, "learning_rate": 9.034161944192508e-06, "loss": 0.0268, "step": 29380 }, { "epoch": 7.689826627412496, "grad_norm": 0.2050655484199524, "learning_rate": 9.018262861486776e-06, "loss": 0.0235, "step": 29390 }, { "epoch": 7.692443572129539, "grad_norm": 0.2599548399448395, "learning_rate": 9.002374702147676e-06, "loss": 0.0291, "step": 29400 }, { "epoch": 7.695060516846581, "grad_norm": 0.2197568714618683, "learning_rate": 8.98649747703463e-06, "loss": 0.0246, "step": 29410 }, { "epoch": 7.697677461563624, "grad_norm": 0.23569698631763458, "learning_rate": 8.970631196999552e-06, "loss": 0.0289, "step": 29420 }, { "epoch": 7.700294406280667, "grad_norm": 0.48520246148109436, "learning_rate": 8.954775872886908e-06, "loss": 0.0294, "step": 29430 }, { "epoch": 7.70291135099771, "grad_norm": 0.23008428514003754, "learning_rate": 8.938931515533652e-06, "loss": 0.0306, "step": 29440 }, { "epoch": 7.705528295714753, "grad_norm": 0.4600384831428528, "learning_rate": 8.923098135769258e-06, "loss": 0.022, "step": 29450 }, { "epoch": 7.708145240431795, "grad_norm": 0.39224159717559814, "learning_rate": 8.907275744415692e-06, "loss": 0.0263, "step": 29460 }, { "epoch": 7.710762185148838, "grad_norm": 0.24442845582962036, "learning_rate": 8.891464352287418e-06, "loss": 0.028, "step": 29470 }, { "epoch": 7.713379129865881, "grad_norm": 0.25245606899261475, "learning_rate": 8.875663970191356e-06, "loss": 0.0281, "step": 29480 }, { "epoch": 7.715996074582924, "grad_norm": 0.28361907601356506, "learning_rate": 8.859874608926928e-06, "loss": 0.0254, "step": 29490 }, { "epoch": 7.718613019299967, "grad_norm": 0.23908400535583496, "learning_rate": 8.844096279286019e-06, "loss": 0.0278, "step": 29500 }, { "epoch": 7.72122996401701, "grad_norm": 0.22012092173099518, "learning_rate": 8.828328992052953e-06, "loss": 0.0245, "step": 29510 }, { "epoch": 7.723846908734053, "grad_norm": 0.4234012961387634, "learning_rate": 8.812572758004534e-06, "loss": 0.0319, "step": 29520 }, { "epoch": 7.726463853451095, "grad_norm": 0.3428882360458374, "learning_rate": 8.796827587910003e-06, "loss": 0.0263, "step": 29530 }, { "epoch": 7.729080798168138, "grad_norm": 0.4102848768234253, "learning_rate": 8.781093492531023e-06, "loss": 0.0262, "step": 29540 }, { "epoch": 7.731697742885181, "grad_norm": 0.39069345593452454, "learning_rate": 8.765370482621701e-06, "loss": 0.0251, "step": 29550 }, { "epoch": 7.734314687602224, "grad_norm": 0.31306493282318115, "learning_rate": 8.749658568928577e-06, "loss": 0.024, "step": 29560 }, { "epoch": 7.736931632319267, "grad_norm": 0.29695364832878113, "learning_rate": 8.733957762190592e-06, "loss": 0.0275, "step": 29570 }, { "epoch": 7.73954857703631, "grad_norm": 0.3999333679676056, "learning_rate": 8.718268073139091e-06, "loss": 0.0277, "step": 29580 }, { "epoch": 7.742165521753353, "grad_norm": 0.2966053783893585, "learning_rate": 8.702589512497844e-06, "loss": 0.0286, "step": 29590 }, { "epoch": 7.7447824664703955, "grad_norm": 0.4280104339122772, "learning_rate": 8.68692209098298e-06, "loss": 0.024, "step": 29600 }, { "epoch": 7.7473994111874385, "grad_norm": 0.2608658969402313, "learning_rate": 8.671265819303046e-06, "loss": 0.0272, "step": 29610 }, { "epoch": 7.7500163559044815, "grad_norm": 0.24662499129772186, "learning_rate": 8.655620708158948e-06, "loss": 0.023, "step": 29620 }, { "epoch": 7.7526333006215244, "grad_norm": 0.23771394789218903, "learning_rate": 8.639986768243991e-06, "loss": 0.026, "step": 29630 }, { "epoch": 7.755250245338567, "grad_norm": 0.2451602816581726, "learning_rate": 8.624364010243805e-06, "loss": 0.0312, "step": 29640 }, { "epoch": 7.75786719005561, "grad_norm": 0.19674725830554962, "learning_rate": 8.608752444836401e-06, "loss": 0.0256, "step": 29650 }, { "epoch": 7.760484134772653, "grad_norm": 0.2612418532371521, "learning_rate": 8.593152082692143e-06, "loss": 0.0278, "step": 29660 }, { "epoch": 7.7631010794896955, "grad_norm": 0.2845847010612488, "learning_rate": 8.577562934473737e-06, "loss": 0.0232, "step": 29670 }, { "epoch": 7.7657180242067385, "grad_norm": 0.2566286325454712, "learning_rate": 8.561985010836202e-06, "loss": 0.0235, "step": 29680 }, { "epoch": 7.7683349689237815, "grad_norm": 0.20483295619487762, "learning_rate": 8.54641832242692e-06, "loss": 0.0298, "step": 29690 }, { "epoch": 7.7709519136408245, "grad_norm": 0.22542567551136017, "learning_rate": 8.530862879885556e-06, "loss": 0.0257, "step": 29700 }, { "epoch": 7.7735688583578675, "grad_norm": 0.21155819296836853, "learning_rate": 8.515318693844124e-06, "loss": 0.0292, "step": 29710 }, { "epoch": 7.77618580307491, "grad_norm": 0.27790671586990356, "learning_rate": 8.499785774926918e-06, "loss": 0.0226, "step": 29720 }, { "epoch": 7.778802747791953, "grad_norm": 0.37075120210647583, "learning_rate": 8.48426413375056e-06, "loss": 0.0259, "step": 29730 }, { "epoch": 7.781419692508996, "grad_norm": 0.23859499394893646, "learning_rate": 8.468753780923922e-06, "loss": 0.0244, "step": 29740 }, { "epoch": 7.784036637226039, "grad_norm": 0.3619495630264282, "learning_rate": 8.453254727048193e-06, "loss": 0.0262, "step": 29750 }, { "epoch": 7.7866535819430815, "grad_norm": 0.2726457715034485, "learning_rate": 8.437766982716835e-06, "loss": 0.0277, "step": 29760 }, { "epoch": 7.7892705266601245, "grad_norm": 0.3399900794029236, "learning_rate": 8.422290558515577e-06, "loss": 0.0293, "step": 29770 }, { "epoch": 7.7918874713771675, "grad_norm": 0.2570652663707733, "learning_rate": 8.406825465022405e-06, "loss": 0.0258, "step": 29780 }, { "epoch": 7.79450441609421, "grad_norm": 0.3375178575515747, "learning_rate": 8.391371712807556e-06, "loss": 0.0293, "step": 29790 }, { "epoch": 7.797121360811253, "grad_norm": 0.4316467344760895, "learning_rate": 8.37592931243353e-06, "loss": 0.0282, "step": 29800 }, { "epoch": 7.799738305528296, "grad_norm": 0.292636513710022, "learning_rate": 8.360498274455064e-06, "loss": 0.0302, "step": 29810 }, { "epoch": 7.802355250245339, "grad_norm": 0.3243941366672516, "learning_rate": 8.345078609419124e-06, "loss": 0.0228, "step": 29820 }, { "epoch": 7.804972194962382, "grad_norm": 0.2369060516357422, "learning_rate": 8.32967032786492e-06, "loss": 0.0261, "step": 29830 }, { "epoch": 7.807589139679425, "grad_norm": 0.5797624588012695, "learning_rate": 8.314273440323844e-06, "loss": 0.0255, "step": 29840 }, { "epoch": 7.810206084396468, "grad_norm": 0.3765929043292999, "learning_rate": 8.298887957319538e-06, "loss": 0.027, "step": 29850 }, { "epoch": 7.81282302911351, "grad_norm": 0.2738799452781677, "learning_rate": 8.283513889367827e-06, "loss": 0.0302, "step": 29860 }, { "epoch": 7.815439973830553, "grad_norm": 0.3794039189815521, "learning_rate": 8.268151246976755e-06, "loss": 0.0282, "step": 29870 }, { "epoch": 7.818056918547596, "grad_norm": 0.3417680263519287, "learning_rate": 8.252800040646536e-06, "loss": 0.0278, "step": 29880 }, { "epoch": 7.820673863264639, "grad_norm": 0.2978939414024353, "learning_rate": 8.23746028086956e-06, "loss": 0.026, "step": 29890 }, { "epoch": 7.823290807981682, "grad_norm": 0.2458307147026062, "learning_rate": 8.222131978130424e-06, "loss": 0.0314, "step": 29900 }, { "epoch": 7.825907752698724, "grad_norm": 0.47884535789489746, "learning_rate": 8.206815142905875e-06, "loss": 0.0277, "step": 29910 }, { "epoch": 7.828524697415767, "grad_norm": 0.5162719488143921, "learning_rate": 8.191509785664825e-06, "loss": 0.0256, "step": 29920 }, { "epoch": 7.83114164213281, "grad_norm": 0.41363653540611267, "learning_rate": 8.176215916868351e-06, "loss": 0.031, "step": 29930 }, { "epoch": 7.833758586849853, "grad_norm": 0.455657422542572, "learning_rate": 8.160933546969649e-06, "loss": 0.0289, "step": 29940 }, { "epoch": 7.836375531566896, "grad_norm": 0.39834895730018616, "learning_rate": 8.145662686414086e-06, "loss": 0.0271, "step": 29950 }, { "epoch": 7.838992476283939, "grad_norm": 0.23709778487682343, "learning_rate": 8.13040334563915e-06, "loss": 0.0278, "step": 29960 }, { "epoch": 7.841609421000982, "grad_norm": 0.46160802245140076, "learning_rate": 8.115155535074465e-06, "loss": 0.026, "step": 29970 }, { "epoch": 7.844226365718024, "grad_norm": 0.2947714626789093, "learning_rate": 8.099919265141755e-06, "loss": 0.0247, "step": 29980 }, { "epoch": 7.846843310435067, "grad_norm": 0.31289541721343994, "learning_rate": 8.084694546254862e-06, "loss": 0.0234, "step": 29990 }, { "epoch": 7.84946025515211, "grad_norm": 0.4130816161632538, "learning_rate": 8.069481388819747e-06, "loss": 0.0246, "step": 30000 }, { "epoch": 7.84946025515211, "eval_loss": 0.030050184655183172, "eval_runtime": 9.5757, "eval_samples_per_second": 106.938, "eval_steps_per_second": 1.671, "step": 30000 }, { "epoch": 7.852077199869153, "grad_norm": 0.3342052102088928, "learning_rate": 8.054279803234455e-06, "loss": 0.0265, "step": 30010 }, { "epoch": 7.854694144586196, "grad_norm": 0.265415757894516, "learning_rate": 8.03908979988913e-06, "loss": 0.0313, "step": 30020 }, { "epoch": 7.857311089303239, "grad_norm": 0.2864171266555786, "learning_rate": 8.023911389166002e-06, "loss": 0.0284, "step": 30030 }, { "epoch": 7.859928034020282, "grad_norm": 0.551828920841217, "learning_rate": 8.008744581439357e-06, "loss": 0.029, "step": 30040 }, { "epoch": 7.862544978737324, "grad_norm": 0.2644766569137573, "learning_rate": 7.993589387075574e-06, "loss": 0.025, "step": 30050 }, { "epoch": 7.865161923454367, "grad_norm": 0.34633108973503113, "learning_rate": 7.978445816433092e-06, "loss": 0.0251, "step": 30060 }, { "epoch": 7.86777886817141, "grad_norm": 0.32692670822143555, "learning_rate": 7.96331387986238e-06, "loss": 0.0262, "step": 30070 }, { "epoch": 7.870395812888453, "grad_norm": 0.34559890627861023, "learning_rate": 7.948193587705993e-06, "loss": 0.031, "step": 30080 }, { "epoch": 7.873012757605496, "grad_norm": 0.29954639077186584, "learning_rate": 7.933084950298495e-06, "loss": 0.0229, "step": 30090 }, { "epoch": 7.875629702322539, "grad_norm": 0.3873235881328583, "learning_rate": 7.917987977966501e-06, "loss": 0.0333, "step": 30100 }, { "epoch": 7.878246647039582, "grad_norm": 0.2134721726179123, "learning_rate": 7.902902681028648e-06, "loss": 0.0267, "step": 30110 }, { "epoch": 7.880863591756624, "grad_norm": 0.3731750249862671, "learning_rate": 7.8878290697956e-06, "loss": 0.0231, "step": 30120 }, { "epoch": 7.883480536473667, "grad_norm": 0.33244821429252625, "learning_rate": 7.87276715457003e-06, "loss": 0.0291, "step": 30130 }, { "epoch": 7.88609748119071, "grad_norm": 0.3392798602581024, "learning_rate": 7.857716945646603e-06, "loss": 0.0292, "step": 30140 }, { "epoch": 7.888714425907753, "grad_norm": 0.291120707988739, "learning_rate": 7.842678453312008e-06, "loss": 0.0269, "step": 30150 }, { "epoch": 7.891331370624796, "grad_norm": 0.28149276971817017, "learning_rate": 7.8276516878449e-06, "loss": 0.029, "step": 30160 }, { "epoch": 7.893948315341838, "grad_norm": 0.2638510763645172, "learning_rate": 7.812636659515937e-06, "loss": 0.0235, "step": 30170 }, { "epoch": 7.896565260058881, "grad_norm": 0.26685631275177, "learning_rate": 7.797633378587759e-06, "loss": 0.0266, "step": 30180 }, { "epoch": 7.899182204775924, "grad_norm": 0.27392229437828064, "learning_rate": 7.78264185531495e-06, "loss": 0.023, "step": 30190 }, { "epoch": 7.901799149492967, "grad_norm": 0.33641985058784485, "learning_rate": 7.767662099944082e-06, "loss": 0.0242, "step": 30200 }, { "epoch": 7.90441609421001, "grad_norm": 0.32816699147224426, "learning_rate": 7.752694122713678e-06, "loss": 0.0249, "step": 30210 }, { "epoch": 7.907033038927053, "grad_norm": 0.31403347849845886, "learning_rate": 7.737737933854209e-06, "loss": 0.0265, "step": 30220 }, { "epoch": 7.909649983644096, "grad_norm": 0.24018634855747223, "learning_rate": 7.722793543588097e-06, "loss": 0.0218, "step": 30230 }, { "epoch": 7.912266928361138, "grad_norm": 0.21781381964683533, "learning_rate": 7.707860962129673e-06, "loss": 0.0306, "step": 30240 }, { "epoch": 7.914883873078181, "grad_norm": 0.33943313360214233, "learning_rate": 7.692940199685236e-06, "loss": 0.0306, "step": 30250 }, { "epoch": 7.917500817795224, "grad_norm": 0.3407123386859894, "learning_rate": 7.67803126645297e-06, "loss": 0.0259, "step": 30260 }, { "epoch": 7.920117762512267, "grad_norm": 0.32126396894454956, "learning_rate": 7.663134172622996e-06, "loss": 0.0282, "step": 30270 }, { "epoch": 7.92273470722931, "grad_norm": 0.42551764845848083, "learning_rate": 7.648248928377347e-06, "loss": 0.0269, "step": 30280 }, { "epoch": 7.925351651946353, "grad_norm": 0.2464340627193451, "learning_rate": 7.633375543889929e-06, "loss": 0.0246, "step": 30290 }, { "epoch": 7.927968596663396, "grad_norm": 0.2966938316822052, "learning_rate": 7.618514029326571e-06, "loss": 0.0246, "step": 30300 }, { "epoch": 7.930585541380438, "grad_norm": 0.27248331904411316, "learning_rate": 7.603664394844973e-06, "loss": 0.0232, "step": 30310 }, { "epoch": 7.933202486097481, "grad_norm": 0.322000116109848, "learning_rate": 7.588826650594727e-06, "loss": 0.0238, "step": 30320 }, { "epoch": 7.935819430814524, "grad_norm": 0.33744585514068604, "learning_rate": 7.574000806717293e-06, "loss": 0.0265, "step": 30330 }, { "epoch": 7.938436375531567, "grad_norm": 0.17370611429214478, "learning_rate": 7.5591868733459915e-06, "loss": 0.0284, "step": 30340 }, { "epoch": 7.94105332024861, "grad_norm": 0.2931711673736572, "learning_rate": 7.544384860605996e-06, "loss": 0.0291, "step": 30350 }, { "epoch": 7.943670264965653, "grad_norm": 0.25150954723358154, "learning_rate": 7.529594778614355e-06, "loss": 0.0227, "step": 30360 }, { "epoch": 7.946287209682695, "grad_norm": 0.6533074975013733, "learning_rate": 7.514816637479943e-06, "loss": 0.0247, "step": 30370 }, { "epoch": 7.948904154399738, "grad_norm": 0.2975616455078125, "learning_rate": 7.500050447303494e-06, "loss": 0.0246, "step": 30380 }, { "epoch": 7.951521099116781, "grad_norm": 0.37849241495132446, "learning_rate": 7.4852962181775396e-06, "loss": 0.026, "step": 30390 }, { "epoch": 7.954138043833824, "grad_norm": 0.33247023820877075, "learning_rate": 7.470553960186469e-06, "loss": 0.0265, "step": 30400 }, { "epoch": 7.956754988550867, "grad_norm": 0.2878034710884094, "learning_rate": 7.455823683406474e-06, "loss": 0.0318, "step": 30410 }, { "epoch": 7.95937193326791, "grad_norm": 0.2719588577747345, "learning_rate": 7.441105397905557e-06, "loss": 0.0252, "step": 30420 }, { "epoch": 7.961988877984952, "grad_norm": 0.33773666620254517, "learning_rate": 7.42639911374354e-06, "loss": 0.0257, "step": 30430 }, { "epoch": 7.964605822701995, "grad_norm": 0.34900739789009094, "learning_rate": 7.411704840972017e-06, "loss": 0.0228, "step": 30440 }, { "epoch": 7.967222767419038, "grad_norm": 0.17360925674438477, "learning_rate": 7.397022589634381e-06, "loss": 0.022, "step": 30450 }, { "epoch": 7.969839712136081, "grad_norm": 0.29911497235298157, "learning_rate": 7.382352369765821e-06, "loss": 0.0271, "step": 30460 }, { "epoch": 7.972456656853124, "grad_norm": 0.2711620032787323, "learning_rate": 7.36769419139329e-06, "loss": 0.0233, "step": 30470 }, { "epoch": 7.975073601570167, "grad_norm": 0.18015000224113464, "learning_rate": 7.353048064535523e-06, "loss": 0.0226, "step": 30480 }, { "epoch": 7.97769054628721, "grad_norm": 0.21266216039657593, "learning_rate": 7.338413999202998e-06, "loss": 0.0254, "step": 30490 }, { "epoch": 7.980307491004252, "grad_norm": 0.3124334514141083, "learning_rate": 7.323792005397964e-06, "loss": 0.0226, "step": 30500 }, { "epoch": 7.982924435721295, "grad_norm": 0.1995360255241394, "learning_rate": 7.309182093114417e-06, "loss": 0.0203, "step": 30510 }, { "epoch": 7.985541380438338, "grad_norm": 0.2655145227909088, "learning_rate": 7.2945842723381035e-06, "loss": 0.0248, "step": 30520 }, { "epoch": 7.988158325155381, "grad_norm": 0.34035325050354004, "learning_rate": 7.27999855304648e-06, "loss": 0.0223, "step": 30530 }, { "epoch": 7.990775269872424, "grad_norm": 0.21241000294685364, "learning_rate": 7.265424945208765e-06, "loss": 0.0251, "step": 30540 }, { "epoch": 7.993392214589467, "grad_norm": 0.29762494564056396, "learning_rate": 7.250863458785864e-06, "loss": 0.0269, "step": 30550 }, { "epoch": 7.99600915930651, "grad_norm": 0.28159022331237793, "learning_rate": 7.236314103730424e-06, "loss": 0.0247, "step": 30560 }, { "epoch": 7.998626104023552, "grad_norm": 0.39448240399360657, "learning_rate": 7.221776889986792e-06, "loss": 0.0255, "step": 30570 }, { "epoch": 8.001046777886817, "grad_norm": 0.22735320031642914, "learning_rate": 7.2072518274910185e-06, "loss": 0.0209, "step": 30580 }, { "epoch": 8.00366372260386, "grad_norm": 0.2990175187587738, "learning_rate": 7.192738926170853e-06, "loss": 0.0237, "step": 30590 }, { "epoch": 8.006280667320903, "grad_norm": 0.25007107853889465, "learning_rate": 7.1782381959457105e-06, "loss": 0.0262, "step": 30600 }, { "epoch": 8.008897612037945, "grad_norm": 0.2612306475639343, "learning_rate": 7.1637496467267115e-06, "loss": 0.0296, "step": 30610 }, { "epoch": 8.011514556754989, "grad_norm": 0.327019065618515, "learning_rate": 7.149273288416652e-06, "loss": 0.0241, "step": 30620 }, { "epoch": 8.014131501472031, "grad_norm": 0.3024859130382538, "learning_rate": 7.13480913090997e-06, "loss": 0.0263, "step": 30630 }, { "epoch": 8.016748446189075, "grad_norm": 0.302470862865448, "learning_rate": 7.120357184092796e-06, "loss": 0.0219, "step": 30640 }, { "epoch": 8.019365390906117, "grad_norm": 0.19182458519935608, "learning_rate": 7.1059174578428835e-06, "loss": 0.0227, "step": 30650 }, { "epoch": 8.02198233562316, "grad_norm": 0.2500637173652649, "learning_rate": 7.091489962029657e-06, "loss": 0.0235, "step": 30660 }, { "epoch": 8.024599280340203, "grad_norm": 0.173533633351326, "learning_rate": 7.077074706514175e-06, "loss": 0.0238, "step": 30670 }, { "epoch": 8.027216225057245, "grad_norm": 0.3211864233016968, "learning_rate": 7.0626717011491285e-06, "loss": 0.0269, "step": 30680 }, { "epoch": 8.029833169774289, "grad_norm": 0.2305508255958557, "learning_rate": 7.048280955778844e-06, "loss": 0.0272, "step": 30690 }, { "epoch": 8.032450114491331, "grad_norm": 0.25919488072395325, "learning_rate": 7.0339024802392404e-06, "loss": 0.0252, "step": 30700 }, { "epoch": 8.035067059208375, "grad_norm": 0.2955307960510254, "learning_rate": 7.019536284357892e-06, "loss": 0.0257, "step": 30710 }, { "epoch": 8.037684003925417, "grad_norm": 0.24567954242229462, "learning_rate": 7.00518237795394e-06, "loss": 0.0276, "step": 30720 }, { "epoch": 8.04030094864246, "grad_norm": 0.2553439438343048, "learning_rate": 6.9908407708381505e-06, "loss": 0.0246, "step": 30730 }, { "epoch": 8.042917893359503, "grad_norm": 0.223582461476326, "learning_rate": 6.976511472812886e-06, "loss": 0.0258, "step": 30740 }, { "epoch": 8.045534838076545, "grad_norm": 0.40044087171554565, "learning_rate": 6.962194493672069e-06, "loss": 0.0297, "step": 30750 }, { "epoch": 8.048151782793589, "grad_norm": 0.23338930308818817, "learning_rate": 6.947889843201233e-06, "loss": 0.0241, "step": 30760 }, { "epoch": 8.050768727510631, "grad_norm": 0.23881715536117554, "learning_rate": 6.933597531177466e-06, "loss": 0.0251, "step": 30770 }, { "epoch": 8.053385672227675, "grad_norm": 0.3370324969291687, "learning_rate": 6.919317567369429e-06, "loss": 0.027, "step": 30780 }, { "epoch": 8.056002616944717, "grad_norm": 0.24091914296150208, "learning_rate": 6.905049961537352e-06, "loss": 0.0227, "step": 30790 }, { "epoch": 8.058619561661759, "grad_norm": 0.24746572971343994, "learning_rate": 6.890794723433003e-06, "loss": 0.0237, "step": 30800 }, { "epoch": 8.061236506378803, "grad_norm": 0.2858780324459076, "learning_rate": 6.8765518627996936e-06, "loss": 0.027, "step": 30810 }, { "epoch": 8.063853451095845, "grad_norm": 0.2548196315765381, "learning_rate": 6.8623213893722895e-06, "loss": 0.027, "step": 30820 }, { "epoch": 8.066470395812889, "grad_norm": 0.24900494515895844, "learning_rate": 6.848103312877188e-06, "loss": 0.0227, "step": 30830 }, { "epoch": 8.069087340529931, "grad_norm": 0.2937477231025696, "learning_rate": 6.833897643032319e-06, "loss": 0.0259, "step": 30840 }, { "epoch": 8.071704285246975, "grad_norm": 0.25163620710372925, "learning_rate": 6.819704389547108e-06, "loss": 0.0222, "step": 30850 }, { "epoch": 8.074321229964017, "grad_norm": 0.1904536336660385, "learning_rate": 6.805523562122515e-06, "loss": 0.0237, "step": 30860 }, { "epoch": 8.07693817468106, "grad_norm": 0.24911633133888245, "learning_rate": 6.791355170451005e-06, "loss": 0.0226, "step": 30870 }, { "epoch": 8.079555119398103, "grad_norm": 0.27910029888153076, "learning_rate": 6.777199224216538e-06, "loss": 0.023, "step": 30880 }, { "epoch": 8.082172064115145, "grad_norm": 0.31502899527549744, "learning_rate": 6.763055733094578e-06, "loss": 0.0228, "step": 30890 }, { "epoch": 8.084789008832189, "grad_norm": 0.3428595960140228, "learning_rate": 6.7489247067520606e-06, "loss": 0.0306, "step": 30900 }, { "epoch": 8.087405953549231, "grad_norm": 0.20517821609973907, "learning_rate": 6.734806154847401e-06, "loss": 0.0261, "step": 30910 }, { "epoch": 8.090022898266275, "grad_norm": 0.28139781951904297, "learning_rate": 6.720700087030504e-06, "loss": 0.0224, "step": 30920 }, { "epoch": 8.092639842983317, "grad_norm": 0.34407392144203186, "learning_rate": 6.706606512942734e-06, "loss": 0.0231, "step": 30930 }, { "epoch": 8.09525678770036, "grad_norm": 0.29121291637420654, "learning_rate": 6.6925254422169265e-06, "loss": 0.024, "step": 30940 }, { "epoch": 8.097873732417403, "grad_norm": 0.23466309905052185, "learning_rate": 6.678456884477338e-06, "loss": 0.0258, "step": 30950 }, { "epoch": 8.100490677134445, "grad_norm": 0.3875909447669983, "learning_rate": 6.664400849339708e-06, "loss": 0.0281, "step": 30960 }, { "epoch": 8.103107621851489, "grad_norm": 0.2323644906282425, "learning_rate": 6.6503573464112065e-06, "loss": 0.0239, "step": 30970 }, { "epoch": 8.105724566568531, "grad_norm": 0.41235247254371643, "learning_rate": 6.636326385290429e-06, "loss": 0.0228, "step": 30980 }, { "epoch": 8.108341511285573, "grad_norm": 0.3516373038291931, "learning_rate": 6.6223079755674154e-06, "loss": 0.0254, "step": 30990 }, { "epoch": 8.110958456002617, "grad_norm": 0.4276266098022461, "learning_rate": 6.608302126823609e-06, "loss": 0.0244, "step": 31000 }, { "epoch": 8.110958456002617, "eval_loss": 0.029148803018330102, "eval_runtime": 9.3161, "eval_samples_per_second": 109.917, "eval_steps_per_second": 1.717, "step": 31000 }, { "epoch": 8.11357540071966, "grad_norm": 0.275944322347641, "learning_rate": 6.594308848631869e-06, "loss": 0.0228, "step": 31010 }, { "epoch": 8.116192345436703, "grad_norm": 0.3111572265625, "learning_rate": 6.580328150556478e-06, "loss": 0.0261, "step": 31020 }, { "epoch": 8.118809290153745, "grad_norm": 0.22269070148468018, "learning_rate": 6.5663600421531055e-06, "loss": 0.0225, "step": 31030 }, { "epoch": 8.121426234870789, "grad_norm": 0.21635177731513977, "learning_rate": 6.552404532968834e-06, "loss": 0.0256, "step": 31040 }, { "epoch": 8.124043179587831, "grad_norm": 0.24473895132541656, "learning_rate": 6.538461632542106e-06, "loss": 0.0292, "step": 31050 }, { "epoch": 8.126660124304873, "grad_norm": 0.25067785382270813, "learning_rate": 6.524531350402771e-06, "loss": 0.0233, "step": 31060 }, { "epoch": 8.129277069021917, "grad_norm": 0.3121054470539093, "learning_rate": 6.510613696072046e-06, "loss": 0.0255, "step": 31070 }, { "epoch": 8.13189401373896, "grad_norm": 0.4683177173137665, "learning_rate": 6.4967086790625185e-06, "loss": 0.0256, "step": 31080 }, { "epoch": 8.134510958456003, "grad_norm": 0.25463202595710754, "learning_rate": 6.482816308878129e-06, "loss": 0.0262, "step": 31090 }, { "epoch": 8.137127903173045, "grad_norm": 0.254255086183548, "learning_rate": 6.468936595014194e-06, "loss": 0.0237, "step": 31100 }, { "epoch": 8.13974484789009, "grad_norm": 0.3515304923057556, "learning_rate": 6.4550695469573485e-06, "loss": 0.0272, "step": 31110 }, { "epoch": 8.142361792607131, "grad_norm": 0.19320529699325562, "learning_rate": 6.441215174185602e-06, "loss": 0.0249, "step": 31120 }, { "epoch": 8.144978737324173, "grad_norm": 0.27671656012535095, "learning_rate": 6.427373486168284e-06, "loss": 0.0262, "step": 31130 }, { "epoch": 8.147595682041217, "grad_norm": 0.23765401542186737, "learning_rate": 6.413544492366066e-06, "loss": 0.0254, "step": 31140 }, { "epoch": 8.15021262675826, "grad_norm": 0.3123226761817932, "learning_rate": 6.39972820223092e-06, "loss": 0.0241, "step": 31150 }, { "epoch": 8.152829571475303, "grad_norm": 0.3324793577194214, "learning_rate": 6.385924625206158e-06, "loss": 0.0224, "step": 31160 }, { "epoch": 8.155446516192345, "grad_norm": 0.35988208651542664, "learning_rate": 6.372133770726396e-06, "loss": 0.0244, "step": 31170 }, { "epoch": 8.15806346090939, "grad_norm": 0.4930897057056427, "learning_rate": 6.358355648217556e-06, "loss": 0.0266, "step": 31180 }, { "epoch": 8.160680405626431, "grad_norm": 0.4564793109893799, "learning_rate": 6.344590267096845e-06, "loss": 0.0256, "step": 31190 }, { "epoch": 8.163297350343473, "grad_norm": 0.4039897322654724, "learning_rate": 6.330837636772782e-06, "loss": 0.0293, "step": 31200 }, { "epoch": 8.165914295060517, "grad_norm": 0.40973928570747375, "learning_rate": 6.31709776664515e-06, "loss": 0.0244, "step": 31210 }, { "epoch": 8.16853123977756, "grad_norm": 0.5403375625610352, "learning_rate": 6.303370666105024e-06, "loss": 0.0239, "step": 31220 }, { "epoch": 8.171148184494603, "grad_norm": 0.30449724197387695, "learning_rate": 6.289656344534747e-06, "loss": 0.0251, "step": 31230 }, { "epoch": 8.173765129211645, "grad_norm": 0.23199830949306488, "learning_rate": 6.275954811307941e-06, "loss": 0.022, "step": 31240 }, { "epoch": 8.176382073928687, "grad_norm": 0.12945488095283508, "learning_rate": 6.262266075789455e-06, "loss": 0.0204, "step": 31250 }, { "epoch": 8.178999018645731, "grad_norm": 0.3655667006969452, "learning_rate": 6.2485901473354205e-06, "loss": 0.029, "step": 31260 }, { "epoch": 8.181615963362773, "grad_norm": 0.3365057110786438, "learning_rate": 6.234927035293212e-06, "loss": 0.0222, "step": 31270 }, { "epoch": 8.184232908079817, "grad_norm": 0.2938461899757385, "learning_rate": 6.2212767490014225e-06, "loss": 0.0269, "step": 31280 }, { "epoch": 8.18684985279686, "grad_norm": 0.22843977808952332, "learning_rate": 6.207639297789905e-06, "loss": 0.0264, "step": 31290 }, { "epoch": 8.189466797513903, "grad_norm": 0.21421143412590027, "learning_rate": 6.19401469097973e-06, "loss": 0.0233, "step": 31300 }, { "epoch": 8.192083742230945, "grad_norm": 0.28643175959587097, "learning_rate": 6.1804029378831785e-06, "loss": 0.0214, "step": 31310 }, { "epoch": 8.194700686947987, "grad_norm": 0.3194423317909241, "learning_rate": 6.166804047803762e-06, "loss": 0.0246, "step": 31320 }, { "epoch": 8.197317631665031, "grad_norm": 0.23330186307430267, "learning_rate": 6.15321803003619e-06, "loss": 0.0258, "step": 31330 }, { "epoch": 8.199934576382073, "grad_norm": 0.2894728183746338, "learning_rate": 6.139644893866389e-06, "loss": 0.0266, "step": 31340 }, { "epoch": 8.202551521099117, "grad_norm": 0.2600817084312439, "learning_rate": 6.126084648571453e-06, "loss": 0.0252, "step": 31350 }, { "epoch": 8.20516846581616, "grad_norm": 0.31294360756874084, "learning_rate": 6.112537303419696e-06, "loss": 0.0244, "step": 31360 }, { "epoch": 8.207785410533203, "grad_norm": 0.39260825514793396, "learning_rate": 6.0990028676705866e-06, "loss": 0.0266, "step": 31370 }, { "epoch": 8.210402355250245, "grad_norm": 0.21804678440093994, "learning_rate": 6.085481350574792e-06, "loss": 0.024, "step": 31380 }, { "epoch": 8.213019299967288, "grad_norm": 0.4497775435447693, "learning_rate": 6.071972761374142e-06, "loss": 0.0245, "step": 31390 }, { "epoch": 8.215636244684331, "grad_norm": 0.3159434199333191, "learning_rate": 6.058477109301633e-06, "loss": 0.0246, "step": 31400 }, { "epoch": 8.218253189401374, "grad_norm": 0.4656051695346832, "learning_rate": 6.044994403581408e-06, "loss": 0.0257, "step": 31410 }, { "epoch": 8.220870134118417, "grad_norm": 0.31082114577293396, "learning_rate": 6.031524653428772e-06, "loss": 0.025, "step": 31420 }, { "epoch": 8.22348707883546, "grad_norm": 0.35635101795196533, "learning_rate": 6.018067868050173e-06, "loss": 0.0233, "step": 31430 }, { "epoch": 8.226104023552502, "grad_norm": 0.36490118503570557, "learning_rate": 6.004624056643205e-06, "loss": 0.026, "step": 31440 }, { "epoch": 8.228720968269545, "grad_norm": 0.37780648469924927, "learning_rate": 5.991193228396571e-06, "loss": 0.0237, "step": 31450 }, { "epoch": 8.231337912986588, "grad_norm": 0.23720017075538635, "learning_rate": 5.977775392490128e-06, "loss": 0.0215, "step": 31460 }, { "epoch": 8.233954857703631, "grad_norm": 0.22219017148017883, "learning_rate": 5.964370558094831e-06, "loss": 0.0218, "step": 31470 }, { "epoch": 8.236571802420674, "grad_norm": 0.19312626123428345, "learning_rate": 5.950978734372764e-06, "loss": 0.0251, "step": 31480 }, { "epoch": 8.239188747137717, "grad_norm": 0.2890303432941437, "learning_rate": 5.937599930477108e-06, "loss": 0.0274, "step": 31490 }, { "epoch": 8.24180569185476, "grad_norm": 0.19071075320243835, "learning_rate": 5.924234155552158e-06, "loss": 0.0226, "step": 31500 }, { "epoch": 8.244422636571802, "grad_norm": 0.2616288363933563, "learning_rate": 5.910881418733283e-06, "loss": 0.023, "step": 31510 }, { "epoch": 8.247039581288846, "grad_norm": 0.22435951232910156, "learning_rate": 5.89754172914696e-06, "loss": 0.023, "step": 31520 }, { "epoch": 8.249656526005888, "grad_norm": 0.2875134348869324, "learning_rate": 5.884215095910739e-06, "loss": 0.0244, "step": 31530 }, { "epoch": 8.252273470722931, "grad_norm": 0.2992618680000305, "learning_rate": 5.870901528133255e-06, "loss": 0.0219, "step": 31540 }, { "epoch": 8.254890415439974, "grad_norm": 0.5597621202468872, "learning_rate": 5.857601034914201e-06, "loss": 0.0256, "step": 31550 }, { "epoch": 8.257507360157017, "grad_norm": 0.3123728632926941, "learning_rate": 5.844313625344331e-06, "loss": 0.0224, "step": 31560 }, { "epoch": 8.26012430487406, "grad_norm": 0.1956491321325302, "learning_rate": 5.831039308505467e-06, "loss": 0.0237, "step": 31570 }, { "epoch": 8.262741249591102, "grad_norm": 0.31507089734077454, "learning_rate": 5.817778093470486e-06, "loss": 0.0242, "step": 31580 }, { "epoch": 8.265358194308146, "grad_norm": 0.4155109226703644, "learning_rate": 5.804529989303301e-06, "loss": 0.0223, "step": 31590 }, { "epoch": 8.267975139025188, "grad_norm": 0.23790472745895386, "learning_rate": 5.7912950050588725e-06, "loss": 0.0217, "step": 31600 }, { "epoch": 8.270592083742232, "grad_norm": 0.20042049884796143, "learning_rate": 5.778073149783172e-06, "loss": 0.02, "step": 31610 }, { "epoch": 8.273209028459274, "grad_norm": 0.272510290145874, "learning_rate": 5.764864432513226e-06, "loss": 0.028, "step": 31620 }, { "epoch": 8.275825973176318, "grad_norm": 0.3318398892879486, "learning_rate": 5.75166886227706e-06, "loss": 0.0256, "step": 31630 }, { "epoch": 8.27844291789336, "grad_norm": 0.26033854484558105, "learning_rate": 5.738486448093733e-06, "loss": 0.025, "step": 31640 }, { "epoch": 8.281059862610402, "grad_norm": 0.21934951841831207, "learning_rate": 5.725317198973296e-06, "loss": 0.0251, "step": 31650 }, { "epoch": 8.283676807327446, "grad_norm": 0.2376284897327423, "learning_rate": 5.712161123916795e-06, "loss": 0.0245, "step": 31660 }, { "epoch": 8.286293752044488, "grad_norm": 0.3624178469181061, "learning_rate": 5.699018231916292e-06, "loss": 0.0201, "step": 31670 }, { "epoch": 8.288910696761532, "grad_norm": 0.34957781434059143, "learning_rate": 5.685888531954831e-06, "loss": 0.0244, "step": 31680 }, { "epoch": 8.291527641478574, "grad_norm": 0.42351824045181274, "learning_rate": 5.672772033006437e-06, "loss": 0.0256, "step": 31690 }, { "epoch": 8.294144586195616, "grad_norm": 0.25669652223587036, "learning_rate": 5.65966874403612e-06, "loss": 0.0222, "step": 31700 }, { "epoch": 8.29676153091266, "grad_norm": 0.30895277857780457, "learning_rate": 5.646578673999841e-06, "loss": 0.0246, "step": 31710 }, { "epoch": 8.299378475629702, "grad_norm": 0.1788632720708847, "learning_rate": 5.6335018318445485e-06, "loss": 0.0238, "step": 31720 }, { "epoch": 8.301995420346746, "grad_norm": 0.33750054240226746, "learning_rate": 5.620438226508138e-06, "loss": 0.0243, "step": 31730 }, { "epoch": 8.304612365063788, "grad_norm": 0.2295209765434265, "learning_rate": 5.607387866919467e-06, "loss": 0.0208, "step": 31740 }, { "epoch": 8.307229309780832, "grad_norm": 0.1997835338115692, "learning_rate": 5.59435076199833e-06, "loss": 0.0255, "step": 31750 }, { "epoch": 8.309846254497874, "grad_norm": 0.17285549640655518, "learning_rate": 5.581326920655452e-06, "loss": 0.0236, "step": 31760 }, { "epoch": 8.312463199214916, "grad_norm": 0.22822895646095276, "learning_rate": 5.5683163517925215e-06, "loss": 0.0254, "step": 31770 }, { "epoch": 8.31508014393196, "grad_norm": 0.27281373739242554, "learning_rate": 5.55531906430213e-06, "loss": 0.0253, "step": 31780 }, { "epoch": 8.317697088649002, "grad_norm": 0.24460025131702423, "learning_rate": 5.542335067067808e-06, "loss": 0.0282, "step": 31790 }, { "epoch": 8.320314033366046, "grad_norm": 0.1998259276151657, "learning_rate": 5.529364368963999e-06, "loss": 0.026, "step": 31800 }, { "epoch": 8.322930978083088, "grad_norm": 0.3485002815723419, "learning_rate": 5.516406978856043e-06, "loss": 0.0197, "step": 31810 }, { "epoch": 8.325547922800132, "grad_norm": 0.24614469707012177, "learning_rate": 5.503462905600193e-06, "loss": 0.0263, "step": 31820 }, { "epoch": 8.328164867517174, "grad_norm": 0.3442598283290863, "learning_rate": 5.490532158043616e-06, "loss": 0.0294, "step": 31830 }, { "epoch": 8.330781812234216, "grad_norm": 0.20003820955753326, "learning_rate": 5.477614745024337e-06, "loss": 0.0206, "step": 31840 }, { "epoch": 8.33339875695126, "grad_norm": 0.28994110226631165, "learning_rate": 5.464710675371301e-06, "loss": 0.0254, "step": 31850 }, { "epoch": 8.336015701668302, "grad_norm": 0.2949778735637665, "learning_rate": 5.451819957904305e-06, "loss": 0.0289, "step": 31860 }, { "epoch": 8.338632646385346, "grad_norm": 0.1833481788635254, "learning_rate": 5.438942601434041e-06, "loss": 0.0233, "step": 31870 }, { "epoch": 8.341249591102388, "grad_norm": 0.2757635712623596, "learning_rate": 5.426078614762059e-06, "loss": 0.0236, "step": 31880 }, { "epoch": 8.34386653581943, "grad_norm": 0.49641239643096924, "learning_rate": 5.413228006680771e-06, "loss": 0.0279, "step": 31890 }, { "epoch": 8.346483480536474, "grad_norm": 0.159059077501297, "learning_rate": 5.400390785973455e-06, "loss": 0.0257, "step": 31900 }, { "epoch": 8.349100425253516, "grad_norm": 0.1922084391117096, "learning_rate": 5.38756696141422e-06, "loss": 0.0258, "step": 31910 }, { "epoch": 8.35171736997056, "grad_norm": 0.14421360194683075, "learning_rate": 5.3747565417680365e-06, "loss": 0.0226, "step": 31920 }, { "epoch": 8.354334314687602, "grad_norm": 0.24148830771446228, "learning_rate": 5.361959535790695e-06, "loss": 0.0197, "step": 31930 }, { "epoch": 8.356951259404646, "grad_norm": 0.3264545202255249, "learning_rate": 5.349175952228838e-06, "loss": 0.023, "step": 31940 }, { "epoch": 8.359568204121688, "grad_norm": 0.23968908190727234, "learning_rate": 5.336405799819924e-06, "loss": 0.0257, "step": 31950 }, { "epoch": 8.36218514883873, "grad_norm": 0.30460986495018005, "learning_rate": 5.323649087292226e-06, "loss": 0.0299, "step": 31960 }, { "epoch": 8.364802093555774, "grad_norm": 0.548783004283905, "learning_rate": 5.3109058233648365e-06, "loss": 0.022, "step": 31970 }, { "epoch": 8.367419038272816, "grad_norm": 0.3376639485359192, "learning_rate": 5.298176016747664e-06, "loss": 0.0277, "step": 31980 }, { "epoch": 8.37003598298986, "grad_norm": 0.2433539777994156, "learning_rate": 5.285459676141405e-06, "loss": 0.0226, "step": 31990 }, { "epoch": 8.372652927706902, "grad_norm": 0.30453231930732727, "learning_rate": 5.272756810237567e-06, "loss": 0.0214, "step": 32000 }, { "epoch": 8.372652927706902, "eval_loss": 0.02760984484014828, "eval_runtime": 9.3035, "eval_samples_per_second": 110.066, "eval_steps_per_second": 1.72, "step": 32000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }