diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,77252 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 11030, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00045330915684496827, + "grad_norm": 7.529475230749862, + "learning_rate": 7.252946509519493e-08, + "loss": 1.4215, + "step": 1 + }, + { + "epoch": 0.0009066183136899365, + "grad_norm": 7.593120325509581, + "learning_rate": 1.4505893019038986e-07, + "loss": 1.4242, + "step": 2 + }, + { + "epoch": 0.0013599274705349048, + "grad_norm": 7.532775709452766, + "learning_rate": 2.175883952855848e-07, + "loss": 1.4193, + "step": 3 + }, + { + "epoch": 0.001813236627379873, + "grad_norm": 7.480629522246691, + "learning_rate": 2.901178603807797e-07, + "loss": 1.4377, + "step": 4 + }, + { + "epoch": 0.0022665457842248413, + "grad_norm": 7.533367455654741, + "learning_rate": 3.626473254759746e-07, + "loss": 1.424, + "step": 5 + }, + { + "epoch": 0.0027198549410698096, + "grad_norm": 7.503278620384197, + "learning_rate": 4.351767905711696e-07, + "loss": 1.4218, + "step": 6 + }, + { + "epoch": 0.003173164097914778, + "grad_norm": 7.414722368465223, + "learning_rate": 5.077062556663645e-07, + "loss": 1.4318, + "step": 7 + }, + { + "epoch": 0.003626473254759746, + "grad_norm": 7.392160161716042, + "learning_rate": 5.802357207615594e-07, + "loss": 1.4235, + "step": 8 + }, + { + "epoch": 0.004079782411604714, + "grad_norm": 7.091971499091691, + "learning_rate": 6.527651858567544e-07, + "loss": 1.4262, + "step": 9 + }, + { + "epoch": 0.004533091568449683, + "grad_norm": 6.927167730241256, + "learning_rate": 7.252946509519492e-07, + "loss": 1.4008, + "step": 10 + }, + { + "epoch": 0.004986400725294651, + "grad_norm": 6.872660690947453, + "learning_rate": 7.978241160471442e-07, + "loss": 1.4105, + "step": 11 + }, + { + "epoch": 0.005439709882139619, + "grad_norm": 5.825265694236906, + "learning_rate": 8.703535811423392e-07, + "loss": 1.3582, + "step": 12 + }, + { + "epoch": 0.0058930190389845875, + "grad_norm": 5.69185883661227, + "learning_rate": 9.428830462375341e-07, + "loss": 1.3446, + "step": 13 + }, + { + "epoch": 0.006346328195829556, + "grad_norm": 5.686582467137656, + "learning_rate": 1.015412511332729e-06, + "loss": 1.3501, + "step": 14 + }, + { + "epoch": 0.006799637352674524, + "grad_norm": 5.397470900512968, + "learning_rate": 1.087941976427924e-06, + "loss": 1.3717, + "step": 15 + }, + { + "epoch": 0.007252946509519492, + "grad_norm": 4.265133635491454, + "learning_rate": 1.1604714415231189e-06, + "loss": 1.3252, + "step": 16 + }, + { + "epoch": 0.0077062556663644605, + "grad_norm": 3.0568764658023695, + "learning_rate": 1.2330009066183137e-06, + "loss": 1.3057, + "step": 17 + }, + { + "epoch": 0.008159564823209429, + "grad_norm": 2.982365453971247, + "learning_rate": 1.3055303717135088e-06, + "loss": 1.3353, + "step": 18 + }, + { + "epoch": 0.008612873980054397, + "grad_norm": 2.876089762919186, + "learning_rate": 1.3780598368087036e-06, + "loss": 1.3148, + "step": 19 + }, + { + "epoch": 0.009066183136899365, + "grad_norm": 2.479210068210551, + "learning_rate": 1.4505893019038985e-06, + "loss": 1.3243, + "step": 20 + }, + { + "epoch": 0.009519492293744334, + "grad_norm": 2.370547996501153, + "learning_rate": 1.5231187669990935e-06, + "loss": 1.296, + "step": 21 + }, + { + "epoch": 0.009972801450589302, + "grad_norm": 2.4189834603812606, + "learning_rate": 1.5956482320942884e-06, + "loss": 1.283, + "step": 22 + }, + { + "epoch": 0.01042611060743427, + "grad_norm": 4.275657305949915, + "learning_rate": 1.6681776971894835e-06, + "loss": 1.2747, + "step": 23 + }, + { + "epoch": 0.010879419764279238, + "grad_norm": 4.732041812425561, + "learning_rate": 1.7407071622846783e-06, + "loss": 1.2956, + "step": 24 + }, + { + "epoch": 0.011332728921124207, + "grad_norm": 4.927270309461219, + "learning_rate": 1.8132366273798732e-06, + "loss": 1.3214, + "step": 25 + }, + { + "epoch": 0.011786038077969175, + "grad_norm": 4.56380071180813, + "learning_rate": 1.8857660924750682e-06, + "loss": 1.2854, + "step": 26 + }, + { + "epoch": 0.012239347234814143, + "grad_norm": 4.299328907403423, + "learning_rate": 1.958295557570263e-06, + "loss": 1.2559, + "step": 27 + }, + { + "epoch": 0.012692656391659111, + "grad_norm": 3.8168000380553355, + "learning_rate": 2.030825022665458e-06, + "loss": 1.2572, + "step": 28 + }, + { + "epoch": 0.01314596554850408, + "grad_norm": 3.213128102048786, + "learning_rate": 2.1033544877606528e-06, + "loss": 1.2756, + "step": 29 + }, + { + "epoch": 0.013599274705349048, + "grad_norm": 2.219329310202926, + "learning_rate": 2.175883952855848e-06, + "loss": 1.2569, + "step": 30 + }, + { + "epoch": 0.014052583862194016, + "grad_norm": 1.7568293989130694, + "learning_rate": 2.248413417951043e-06, + "loss": 1.2699, + "step": 31 + }, + { + "epoch": 0.014505893019038985, + "grad_norm": 1.6642781528914266, + "learning_rate": 2.3209428830462377e-06, + "loss": 1.2148, + "step": 32 + }, + { + "epoch": 0.014959202175883953, + "grad_norm": 1.6444882858479255, + "learning_rate": 2.3934723481414326e-06, + "loss": 1.2067, + "step": 33 + }, + { + "epoch": 0.015412511332728921, + "grad_norm": 1.444218972205397, + "learning_rate": 2.4660018132366274e-06, + "loss": 1.1976, + "step": 34 + }, + { + "epoch": 0.01586582048957389, + "grad_norm": 1.2167499792974639, + "learning_rate": 2.5385312783318227e-06, + "loss": 1.2053, + "step": 35 + }, + { + "epoch": 0.016319129646418858, + "grad_norm": 1.063596741114456, + "learning_rate": 2.6110607434270176e-06, + "loss": 1.205, + "step": 36 + }, + { + "epoch": 0.016772438803263828, + "grad_norm": 1.0019747182103695, + "learning_rate": 2.683590208522213e-06, + "loss": 1.185, + "step": 37 + }, + { + "epoch": 0.017225747960108794, + "grad_norm": 0.9229724935151243, + "learning_rate": 2.7561196736174073e-06, + "loss": 1.1722, + "step": 38 + }, + { + "epoch": 0.017679057116953764, + "grad_norm": 0.8714312092546272, + "learning_rate": 2.8286491387126025e-06, + "loss": 1.1883, + "step": 39 + }, + { + "epoch": 0.01813236627379873, + "grad_norm": 0.7943330216747643, + "learning_rate": 2.901178603807797e-06, + "loss": 1.1936, + "step": 40 + }, + { + "epoch": 0.0185856754306437, + "grad_norm": 0.7253701008160783, + "learning_rate": 2.9737080689029922e-06, + "loss": 1.1729, + "step": 41 + }, + { + "epoch": 0.019038984587488667, + "grad_norm": 0.69653499597173, + "learning_rate": 3.046237533998187e-06, + "loss": 1.1528, + "step": 42 + }, + { + "epoch": 0.019492293744333637, + "grad_norm": 0.6689308321427192, + "learning_rate": 3.1187669990933824e-06, + "loss": 1.1364, + "step": 43 + }, + { + "epoch": 0.019945602901178604, + "grad_norm": 0.6639943306036193, + "learning_rate": 3.1912964641885768e-06, + "loss": 1.1176, + "step": 44 + }, + { + "epoch": 0.020398912058023574, + "grad_norm": 0.6367525434075686, + "learning_rate": 3.263825929283772e-06, + "loss": 1.1492, + "step": 45 + }, + { + "epoch": 0.02085222121486854, + "grad_norm": 0.635669356253204, + "learning_rate": 3.336355394378967e-06, + "loss": 1.1288, + "step": 46 + }, + { + "epoch": 0.02130553037171351, + "grad_norm": 0.6442838488693255, + "learning_rate": 3.4088848594741618e-06, + "loss": 1.1523, + "step": 47 + }, + { + "epoch": 0.021758839528558477, + "grad_norm": 0.6285851181409213, + "learning_rate": 3.4814143245693566e-06, + "loss": 1.1134, + "step": 48 + }, + { + "epoch": 0.022212148685403447, + "grad_norm": 0.5319666055071556, + "learning_rate": 3.553943789664552e-06, + "loss": 1.1196, + "step": 49 + }, + { + "epoch": 0.022665457842248413, + "grad_norm": 0.44072232117257715, + "learning_rate": 3.6264732547597463e-06, + "loss": 1.1076, + "step": 50 + }, + { + "epoch": 0.023118766999093383, + "grad_norm": 0.5115195003085903, + "learning_rate": 3.6990027198549416e-06, + "loss": 1.1108, + "step": 51 + }, + { + "epoch": 0.02357207615593835, + "grad_norm": 0.5768766956470096, + "learning_rate": 3.7715321849501364e-06, + "loss": 1.1074, + "step": 52 + }, + { + "epoch": 0.02402538531278332, + "grad_norm": 0.4752090887531169, + "learning_rate": 3.844061650045331e-06, + "loss": 1.0964, + "step": 53 + }, + { + "epoch": 0.024478694469628286, + "grad_norm": 0.3990349919601281, + "learning_rate": 3.916591115140526e-06, + "loss": 1.1129, + "step": 54 + }, + { + "epoch": 0.024932003626473256, + "grad_norm": 0.44473257086688445, + "learning_rate": 3.989120580235721e-06, + "loss": 1.1191, + "step": 55 + }, + { + "epoch": 0.025385312783318223, + "grad_norm": 0.46856393244209626, + "learning_rate": 4.061650045330916e-06, + "loss": 1.112, + "step": 56 + }, + { + "epoch": 0.025838621940163193, + "grad_norm": 0.4009684218791682, + "learning_rate": 4.1341795104261115e-06, + "loss": 1.0957, + "step": 57 + }, + { + "epoch": 0.02629193109700816, + "grad_norm": 0.4385633000094962, + "learning_rate": 4.2067089755213055e-06, + "loss": 1.0889, + "step": 58 + }, + { + "epoch": 0.02674524025385313, + "grad_norm": 0.42631185708881886, + "learning_rate": 4.279238440616501e-06, + "loss": 1.0952, + "step": 59 + }, + { + "epoch": 0.027198549410698096, + "grad_norm": 0.3346718217262581, + "learning_rate": 4.351767905711696e-06, + "loss": 1.0702, + "step": 60 + }, + { + "epoch": 0.027651858567543066, + "grad_norm": 0.40613340660380465, + "learning_rate": 4.424297370806891e-06, + "loss": 1.1127, + "step": 61 + }, + { + "epoch": 0.028105167724388033, + "grad_norm": 0.3614821600947752, + "learning_rate": 4.496826835902086e-06, + "loss": 1.0921, + "step": 62 + }, + { + "epoch": 0.028558476881233003, + "grad_norm": 0.25465498353508886, + "learning_rate": 4.569356300997281e-06, + "loss": 1.0749, + "step": 63 + }, + { + "epoch": 0.02901178603807797, + "grad_norm": 0.3073199891997316, + "learning_rate": 4.6418857660924755e-06, + "loss": 1.0895, + "step": 64 + }, + { + "epoch": 0.02946509519492294, + "grad_norm": 0.3006958992766178, + "learning_rate": 4.71441523118767e-06, + "loss": 1.0852, + "step": 65 + }, + { + "epoch": 0.029918404351767906, + "grad_norm": 0.2837198992495677, + "learning_rate": 4.786944696282865e-06, + "loss": 1.061, + "step": 66 + }, + { + "epoch": 0.030371713508612876, + "grad_norm": 0.2857505959419577, + "learning_rate": 4.859474161378061e-06, + "loss": 1.0745, + "step": 67 + }, + { + "epoch": 0.030825022665457842, + "grad_norm": 0.2653991335834452, + "learning_rate": 4.932003626473255e-06, + "loss": 1.055, + "step": 68 + }, + { + "epoch": 0.03127833182230281, + "grad_norm": 0.25194464043191933, + "learning_rate": 5.00453309156845e-06, + "loss": 1.0748, + "step": 69 + }, + { + "epoch": 0.03173164097914778, + "grad_norm": 0.272630914249413, + "learning_rate": 5.077062556663645e-06, + "loss": 1.0763, + "step": 70 + }, + { + "epoch": 0.03218495013599275, + "grad_norm": 0.2527844916171606, + "learning_rate": 5.14959202175884e-06, + "loss": 1.0583, + "step": 71 + }, + { + "epoch": 0.032638259292837715, + "grad_norm": 0.24485480400333573, + "learning_rate": 5.222121486854035e-06, + "loss": 1.0374, + "step": 72 + }, + { + "epoch": 0.03309156844968268, + "grad_norm": 0.22405491451290266, + "learning_rate": 5.294650951949229e-06, + "loss": 1.0705, + "step": 73 + }, + { + "epoch": 0.033544877606527655, + "grad_norm": 0.23431603942798793, + "learning_rate": 5.367180417044426e-06, + "loss": 1.0725, + "step": 74 + }, + { + "epoch": 0.03399818676337262, + "grad_norm": 0.21663120224720253, + "learning_rate": 5.43970988213962e-06, + "loss": 1.0711, + "step": 75 + }, + { + "epoch": 0.03445149592021759, + "grad_norm": 0.22246624907104942, + "learning_rate": 5.5122393472348145e-06, + "loss": 1.0612, + "step": 76 + }, + { + "epoch": 0.034904805077062555, + "grad_norm": 0.21851808101971187, + "learning_rate": 5.584768812330009e-06, + "loss": 1.0664, + "step": 77 + }, + { + "epoch": 0.03535811423390753, + "grad_norm": 0.20227337165615802, + "learning_rate": 5.657298277425205e-06, + "loss": 1.0556, + "step": 78 + }, + { + "epoch": 0.035811423390752495, + "grad_norm": 0.25861105320060773, + "learning_rate": 5.7298277425204e-06, + "loss": 1.0437, + "step": 79 + }, + { + "epoch": 0.03626473254759746, + "grad_norm": 0.26878735990491354, + "learning_rate": 5.802357207615594e-06, + "loss": 1.0554, + "step": 80 + }, + { + "epoch": 0.03671804170444243, + "grad_norm": 0.2601458095321201, + "learning_rate": 5.874886672710789e-06, + "loss": 1.0476, + "step": 81 + }, + { + "epoch": 0.0371713508612874, + "grad_norm": 0.23420150673811044, + "learning_rate": 5.9474161378059845e-06, + "loss": 1.039, + "step": 82 + }, + { + "epoch": 0.03762466001813237, + "grad_norm": 0.23480027015431468, + "learning_rate": 6.019945602901179e-06, + "loss": 1.0429, + "step": 83 + }, + { + "epoch": 0.038077969174977334, + "grad_norm": 0.24891656391471473, + "learning_rate": 6.092475067996374e-06, + "loss": 1.0432, + "step": 84 + }, + { + "epoch": 0.0385312783318223, + "grad_norm": 0.23861585885709863, + "learning_rate": 6.165004533091569e-06, + "loss": 1.0473, + "step": 85 + }, + { + "epoch": 0.038984587488667274, + "grad_norm": 0.2322146070647115, + "learning_rate": 6.237533998186765e-06, + "loss": 1.0433, + "step": 86 + }, + { + "epoch": 0.03943789664551224, + "grad_norm": 0.21109490343434156, + "learning_rate": 6.310063463281959e-06, + "loss": 1.0463, + "step": 87 + }, + { + "epoch": 0.03989120580235721, + "grad_norm": 0.216219946579301, + "learning_rate": 6.3825929283771536e-06, + "loss": 1.0273, + "step": 88 + }, + { + "epoch": 0.040344514959202174, + "grad_norm": 0.21641766636978838, + "learning_rate": 6.455122393472348e-06, + "loss": 1.0541, + "step": 89 + }, + { + "epoch": 0.04079782411604715, + "grad_norm": 0.21008896150899378, + "learning_rate": 6.527651858567544e-06, + "loss": 1.0455, + "step": 90 + }, + { + "epoch": 0.041251133272892114, + "grad_norm": 0.23417944081803094, + "learning_rate": 6.600181323662739e-06, + "loss": 1.0303, + "step": 91 + }, + { + "epoch": 0.04170444242973708, + "grad_norm": 0.22395410726447243, + "learning_rate": 6.672710788757934e-06, + "loss": 1.0503, + "step": 92 + }, + { + "epoch": 0.04215775158658205, + "grad_norm": 0.26822216050774617, + "learning_rate": 6.745240253853128e-06, + "loss": 0.9992, + "step": 93 + }, + { + "epoch": 0.04261106074342702, + "grad_norm": 0.21999717180540845, + "learning_rate": 6.8177697189483235e-06, + "loss": 1.0432, + "step": 94 + }, + { + "epoch": 0.04306436990027199, + "grad_norm": 0.3236383930535883, + "learning_rate": 6.890299184043518e-06, + "loss": 1.0317, + "step": 95 + }, + { + "epoch": 0.043517679057116954, + "grad_norm": 0.25512225750318396, + "learning_rate": 6.962828649138713e-06, + "loss": 1.0215, + "step": 96 + }, + { + "epoch": 0.04397098821396192, + "grad_norm": 0.26450325288157267, + "learning_rate": 7.035358114233908e-06, + "loss": 1.0393, + "step": 97 + }, + { + "epoch": 0.044424297370806894, + "grad_norm": 0.24804464453064065, + "learning_rate": 7.107887579329104e-06, + "loss": 1.029, + "step": 98 + }, + { + "epoch": 0.04487760652765186, + "grad_norm": 0.2919095912577784, + "learning_rate": 7.180417044424299e-06, + "loss": 1.0368, + "step": 99 + }, + { + "epoch": 0.04533091568449683, + "grad_norm": 0.24562955224846647, + "learning_rate": 7.252946509519493e-06, + "loss": 1.0218, + "step": 100 + }, + { + "epoch": 0.04578422484134179, + "grad_norm": 0.33344637601345345, + "learning_rate": 7.3254759746146875e-06, + "loss": 1.0547, + "step": 101 + }, + { + "epoch": 0.04623753399818677, + "grad_norm": 0.321768239448474, + "learning_rate": 7.398005439709883e-06, + "loss": 1.0237, + "step": 102 + }, + { + "epoch": 0.04669084315503173, + "grad_norm": 0.24083502679003194, + "learning_rate": 7.470534904805078e-06, + "loss": 1.0195, + "step": 103 + }, + { + "epoch": 0.0471441523118767, + "grad_norm": 0.28060603553154856, + "learning_rate": 7.543064369900273e-06, + "loss": 1.0351, + "step": 104 + }, + { + "epoch": 0.047597461468721666, + "grad_norm": 0.2738387759702386, + "learning_rate": 7.615593834995467e-06, + "loss": 1.0212, + "step": 105 + }, + { + "epoch": 0.04805077062556664, + "grad_norm": 0.3116966323450892, + "learning_rate": 7.688123300090663e-06, + "loss": 1.0472, + "step": 106 + }, + { + "epoch": 0.048504079782411606, + "grad_norm": 0.27690805315664624, + "learning_rate": 7.760652765185858e-06, + "loss": 1.0135, + "step": 107 + }, + { + "epoch": 0.04895738893925657, + "grad_norm": 0.35925442687867143, + "learning_rate": 7.833182230281052e-06, + "loss": 1.024, + "step": 108 + }, + { + "epoch": 0.04941069809610154, + "grad_norm": 0.4725508242687737, + "learning_rate": 7.905711695376246e-06, + "loss": 1.0296, + "step": 109 + }, + { + "epoch": 0.04986400725294651, + "grad_norm": 0.5860799696821, + "learning_rate": 7.978241160471442e-06, + "loss": 1.0325, + "step": 110 + }, + { + "epoch": 0.05031731640979148, + "grad_norm": 0.6164419910581893, + "learning_rate": 8.050770625566638e-06, + "loss": 1.0345, + "step": 111 + }, + { + "epoch": 0.050770625566636446, + "grad_norm": 0.5174761037366001, + "learning_rate": 8.123300090661832e-06, + "loss": 1.0114, + "step": 112 + }, + { + "epoch": 0.05122393472348141, + "grad_norm": 0.3168303533663923, + "learning_rate": 8.195829555757027e-06, + "loss": 1.0336, + "step": 113 + }, + { + "epoch": 0.051677243880326386, + "grad_norm": 0.3589573933522987, + "learning_rate": 8.268359020852223e-06, + "loss": 1.0227, + "step": 114 + }, + { + "epoch": 0.05213055303717135, + "grad_norm": 0.5555446423775562, + "learning_rate": 8.340888485947417e-06, + "loss": 1.0336, + "step": 115 + }, + { + "epoch": 0.05258386219401632, + "grad_norm": 0.5298312726366445, + "learning_rate": 8.413417951042611e-06, + "loss": 1.0297, + "step": 116 + }, + { + "epoch": 0.053037171350861285, + "grad_norm": 0.45833430552891236, + "learning_rate": 8.485947416137807e-06, + "loss": 1.0175, + "step": 117 + }, + { + "epoch": 0.05349048050770626, + "grad_norm": 0.3404266340027579, + "learning_rate": 8.558476881233002e-06, + "loss": 1.0138, + "step": 118 + }, + { + "epoch": 0.053943789664551225, + "grad_norm": 0.2716326187250938, + "learning_rate": 8.631006346328196e-06, + "loss": 1.0209, + "step": 119 + }, + { + "epoch": 0.05439709882139619, + "grad_norm": 0.2169059375583515, + "learning_rate": 8.703535811423392e-06, + "loss": 1.0212, + "step": 120 + }, + { + "epoch": 0.05485040797824116, + "grad_norm": 0.2684269012519948, + "learning_rate": 8.776065276518586e-06, + "loss": 1.0082, + "step": 121 + }, + { + "epoch": 0.05530371713508613, + "grad_norm": 0.30791696302426647, + "learning_rate": 8.848594741613782e-06, + "loss": 1.0324, + "step": 122 + }, + { + "epoch": 0.0557570262919311, + "grad_norm": 0.33670968089785597, + "learning_rate": 8.921124206708976e-06, + "loss": 1.0181, + "step": 123 + }, + { + "epoch": 0.056210335448776065, + "grad_norm": 0.30814426757289026, + "learning_rate": 8.993653671804172e-06, + "loss": 1.0349, + "step": 124 + }, + { + "epoch": 0.05666364460562103, + "grad_norm": 0.2710148391522126, + "learning_rate": 9.066183136899366e-06, + "loss": 1.0403, + "step": 125 + }, + { + "epoch": 0.057116953762466005, + "grad_norm": 0.30956819531611085, + "learning_rate": 9.138712601994561e-06, + "loss": 1.0221, + "step": 126 + }, + { + "epoch": 0.05757026291931097, + "grad_norm": 0.28018797445897453, + "learning_rate": 9.211242067089757e-06, + "loss": 0.9988, + "step": 127 + }, + { + "epoch": 0.05802357207615594, + "grad_norm": 0.29807694967288845, + "learning_rate": 9.283771532184951e-06, + "loss": 1.0079, + "step": 128 + }, + { + "epoch": 0.058476881233000905, + "grad_norm": 0.3581679458803219, + "learning_rate": 9.356300997280145e-06, + "loss": 1.0088, + "step": 129 + }, + { + "epoch": 0.05893019038984588, + "grad_norm": 0.4378519762890707, + "learning_rate": 9.42883046237534e-06, + "loss": 1.0326, + "step": 130 + }, + { + "epoch": 0.059383499546690845, + "grad_norm": 0.4608327122340969, + "learning_rate": 9.501359927470536e-06, + "loss": 1.0248, + "step": 131 + }, + { + "epoch": 0.05983680870353581, + "grad_norm": 0.6226650794548556, + "learning_rate": 9.57388939256573e-06, + "loss": 1.0072, + "step": 132 + }, + { + "epoch": 0.06029011786038078, + "grad_norm": 0.8050827947500385, + "learning_rate": 9.646418857660926e-06, + "loss": 1.013, + "step": 133 + }, + { + "epoch": 0.06074342701722575, + "grad_norm": 0.7301147675463479, + "learning_rate": 9.718948322756122e-06, + "loss": 1.014, + "step": 134 + }, + { + "epoch": 0.06119673617407072, + "grad_norm": 0.6438107049965188, + "learning_rate": 9.791477787851316e-06, + "loss": 1.0017, + "step": 135 + }, + { + "epoch": 0.061650045330915684, + "grad_norm": 0.5337604175814439, + "learning_rate": 9.86400725294651e-06, + "loss": 1.0164, + "step": 136 + }, + { + "epoch": 0.06210335448776065, + "grad_norm": 0.6247160761796741, + "learning_rate": 9.936536718041705e-06, + "loss": 1.02, + "step": 137 + }, + { + "epoch": 0.06255666364460562, + "grad_norm": 0.9056770429202234, + "learning_rate": 1.00090661831369e-05, + "loss": 1.0117, + "step": 138 + }, + { + "epoch": 0.06300997280145058, + "grad_norm": 0.9037833745324745, + "learning_rate": 1.0081595648232093e-05, + "loss": 1.0253, + "step": 139 + }, + { + "epoch": 0.06346328195829556, + "grad_norm": 0.7331943366534031, + "learning_rate": 1.015412511332729e-05, + "loss": 1.0037, + "step": 140 + }, + { + "epoch": 0.06391659111514053, + "grad_norm": 0.6175146976953618, + "learning_rate": 1.0226654578422487e-05, + "loss": 1.0018, + "step": 141 + }, + { + "epoch": 0.0643699002719855, + "grad_norm": 0.7702300635686421, + "learning_rate": 1.029918404351768e-05, + "loss": 1.0247, + "step": 142 + }, + { + "epoch": 0.06482320942883046, + "grad_norm": 0.9590076569081369, + "learning_rate": 1.0371713508612875e-05, + "loss": 1.0145, + "step": 143 + }, + { + "epoch": 0.06527651858567543, + "grad_norm": 0.7708905008564033, + "learning_rate": 1.044424297370807e-05, + "loss": 1.0148, + "step": 144 + }, + { + "epoch": 0.0657298277425204, + "grad_norm": 0.42277990735301435, + "learning_rate": 1.0516772438803264e-05, + "loss": 0.99, + "step": 145 + }, + { + "epoch": 0.06618313689936536, + "grad_norm": 0.4997673663935069, + "learning_rate": 1.0589301903898458e-05, + "loss": 1.0151, + "step": 146 + }, + { + "epoch": 0.06663644605621033, + "grad_norm": 0.5779083721673532, + "learning_rate": 1.0661831368993654e-05, + "loss": 1.0145, + "step": 147 + }, + { + "epoch": 0.06708975521305531, + "grad_norm": 0.5388433807596953, + "learning_rate": 1.0734360834088851e-05, + "loss": 1.0177, + "step": 148 + }, + { + "epoch": 0.06754306436990028, + "grad_norm": 0.5567911105247827, + "learning_rate": 1.0806890299184045e-05, + "loss": 1.0088, + "step": 149 + }, + { + "epoch": 0.06799637352674524, + "grad_norm": 0.506075219557083, + "learning_rate": 1.087941976427924e-05, + "loss": 1.0219, + "step": 150 + }, + { + "epoch": 0.06844968268359021, + "grad_norm": 0.5889903383048785, + "learning_rate": 1.0951949229374435e-05, + "loss": 0.9782, + "step": 151 + }, + { + "epoch": 0.06890299184043518, + "grad_norm": 0.5350192314416039, + "learning_rate": 1.1024478694469629e-05, + "loss": 1.0064, + "step": 152 + }, + { + "epoch": 0.06935630099728014, + "grad_norm": 0.5303676810012745, + "learning_rate": 1.1097008159564823e-05, + "loss": 1.0204, + "step": 153 + }, + { + "epoch": 0.06980961015412511, + "grad_norm": 0.5079071512985294, + "learning_rate": 1.1169537624660019e-05, + "loss": 1.005, + "step": 154 + }, + { + "epoch": 0.07026291931097008, + "grad_norm": 0.47473225387067336, + "learning_rate": 1.1242067089755213e-05, + "loss": 1.0377, + "step": 155 + }, + { + "epoch": 0.07071622846781506, + "grad_norm": 0.42462480410500053, + "learning_rate": 1.131459655485041e-05, + "loss": 0.9941, + "step": 156 + }, + { + "epoch": 0.07116953762466002, + "grad_norm": 0.35964292572208717, + "learning_rate": 1.1387126019945604e-05, + "loss": 1.0001, + "step": 157 + }, + { + "epoch": 0.07162284678150499, + "grad_norm": 0.39773833837524003, + "learning_rate": 1.14596554850408e-05, + "loss": 0.992, + "step": 158 + }, + { + "epoch": 0.07207615593834996, + "grad_norm": 0.479362738779813, + "learning_rate": 1.1532184950135994e-05, + "loss": 0.987, + "step": 159 + }, + { + "epoch": 0.07252946509519492, + "grad_norm": 0.6632019070593157, + "learning_rate": 1.1604714415231188e-05, + "loss": 0.9943, + "step": 160 + }, + { + "epoch": 0.07298277425203989, + "grad_norm": 0.9541332299472309, + "learning_rate": 1.1677243880326384e-05, + "loss": 0.9834, + "step": 161 + }, + { + "epoch": 0.07343608340888486, + "grad_norm": 1.1416313556087756, + "learning_rate": 1.1749773345421578e-05, + "loss": 1.0205, + "step": 162 + }, + { + "epoch": 0.07388939256572982, + "grad_norm": 0.7004538190727048, + "learning_rate": 1.1822302810516773e-05, + "loss": 1.0019, + "step": 163 + }, + { + "epoch": 0.0743427017225748, + "grad_norm": 0.6050101670133614, + "learning_rate": 1.1894832275611969e-05, + "loss": 0.9974, + "step": 164 + }, + { + "epoch": 0.07479601087941977, + "grad_norm": 0.8872309721836288, + "learning_rate": 1.1967361740707165e-05, + "loss": 1.0094, + "step": 165 + }, + { + "epoch": 0.07524932003626474, + "grad_norm": 1.07492902627576, + "learning_rate": 1.2039891205802359e-05, + "loss": 1.0086, + "step": 166 + }, + { + "epoch": 0.0757026291931097, + "grad_norm": 0.85153474556294, + "learning_rate": 1.2112420670897553e-05, + "loss": 0.9942, + "step": 167 + }, + { + "epoch": 0.07615593834995467, + "grad_norm": 0.8083358228009153, + "learning_rate": 1.2184950135992748e-05, + "loss": 1.0054, + "step": 168 + }, + { + "epoch": 0.07660924750679964, + "grad_norm": 0.8246194549076243, + "learning_rate": 1.2257479601087942e-05, + "loss": 1.0078, + "step": 169 + }, + { + "epoch": 0.0770625566636446, + "grad_norm": 0.6179745477622758, + "learning_rate": 1.2330009066183138e-05, + "loss": 0.9781, + "step": 170 + }, + { + "epoch": 0.07751586582048957, + "grad_norm": 0.8043698107366198, + "learning_rate": 1.2402538531278332e-05, + "loss": 1.0099, + "step": 171 + }, + { + "epoch": 0.07796917497733455, + "grad_norm": 0.9066933771301231, + "learning_rate": 1.247506799637353e-05, + "loss": 0.9886, + "step": 172 + }, + { + "epoch": 0.07842248413417952, + "grad_norm": 0.6347094122637169, + "learning_rate": 1.2547597461468723e-05, + "loss": 0.9943, + "step": 173 + }, + { + "epoch": 0.07887579329102448, + "grad_norm": 0.6365785592681293, + "learning_rate": 1.2620126926563917e-05, + "loss": 0.9873, + "step": 174 + }, + { + "epoch": 0.07932910244786945, + "grad_norm": 0.7711016299374871, + "learning_rate": 1.2692656391659113e-05, + "loss": 0.9857, + "step": 175 + }, + { + "epoch": 0.07978241160471441, + "grad_norm": 0.7236955138393364, + "learning_rate": 1.2765185856754307e-05, + "loss": 1.0181, + "step": 176 + }, + { + "epoch": 0.08023572076155938, + "grad_norm": 0.9003477864097517, + "learning_rate": 1.2837715321849503e-05, + "loss": 1.0146, + "step": 177 + }, + { + "epoch": 0.08068902991840435, + "grad_norm": 1.1278349840855353, + "learning_rate": 1.2910244786944697e-05, + "loss": 0.9764, + "step": 178 + }, + { + "epoch": 0.08114233907524931, + "grad_norm": 0.6811372412745094, + "learning_rate": 1.298277425203989e-05, + "loss": 0.9956, + "step": 179 + }, + { + "epoch": 0.0815956482320943, + "grad_norm": 0.5658465562911695, + "learning_rate": 1.3055303717135088e-05, + "loss": 0.9769, + "step": 180 + }, + { + "epoch": 0.08204895738893926, + "grad_norm": 0.7869886779910564, + "learning_rate": 1.3127833182230282e-05, + "loss": 1.0278, + "step": 181 + }, + { + "epoch": 0.08250226654578423, + "grad_norm": 0.8547891846663555, + "learning_rate": 1.3200362647325478e-05, + "loss": 0.999, + "step": 182 + }, + { + "epoch": 0.0829555757026292, + "grad_norm": 1.066852201180377, + "learning_rate": 1.3272892112420672e-05, + "loss": 1.007, + "step": 183 + }, + { + "epoch": 0.08340888485947416, + "grad_norm": 0.8352376405215245, + "learning_rate": 1.3345421577515868e-05, + "loss": 0.9839, + "step": 184 + }, + { + "epoch": 0.08386219401631913, + "grad_norm": 0.5699671212503709, + "learning_rate": 1.3417951042611062e-05, + "loss": 0.996, + "step": 185 + }, + { + "epoch": 0.0843155031731641, + "grad_norm": 0.9273513907060452, + "learning_rate": 1.3490480507706256e-05, + "loss": 0.9951, + "step": 186 + }, + { + "epoch": 0.08476881233000906, + "grad_norm": 1.0662364604979808, + "learning_rate": 1.3563009972801451e-05, + "loss": 1.0038, + "step": 187 + }, + { + "epoch": 0.08522212148685404, + "grad_norm": 0.8284705529549362, + "learning_rate": 1.3635539437896647e-05, + "loss": 1.0117, + "step": 188 + }, + { + "epoch": 0.08567543064369901, + "grad_norm": 0.5995211711330223, + "learning_rate": 1.3708068902991843e-05, + "loss": 0.9843, + "step": 189 + }, + { + "epoch": 0.08612873980054397, + "grad_norm": 0.5064728784890009, + "learning_rate": 1.3780598368087037e-05, + "loss": 0.9806, + "step": 190 + }, + { + "epoch": 0.08658204895738894, + "grad_norm": 0.9197902804616978, + "learning_rate": 1.3853127833182232e-05, + "loss": 0.9975, + "step": 191 + }, + { + "epoch": 0.08703535811423391, + "grad_norm": 0.9641847114468183, + "learning_rate": 1.3925657298277426e-05, + "loss": 0.9976, + "step": 192 + }, + { + "epoch": 0.08748866727107887, + "grad_norm": 0.799915175155661, + "learning_rate": 1.399818676337262e-05, + "loss": 0.9962, + "step": 193 + }, + { + "epoch": 0.08794197642792384, + "grad_norm": 0.648345510976243, + "learning_rate": 1.4070716228467816e-05, + "loss": 0.9752, + "step": 194 + }, + { + "epoch": 0.0883952855847688, + "grad_norm": 0.6423807015901696, + "learning_rate": 1.414324569356301e-05, + "loss": 0.9731, + "step": 195 + }, + { + "epoch": 0.08884859474161379, + "grad_norm": 1.1101079338102626, + "learning_rate": 1.4215775158658207e-05, + "loss": 0.9768, + "step": 196 + }, + { + "epoch": 0.08930190389845875, + "grad_norm": 1.5007306788581445, + "learning_rate": 1.4288304623753401e-05, + "loss": 0.9954, + "step": 197 + }, + { + "epoch": 0.08975521305530372, + "grad_norm": 0.42375501441243135, + "learning_rate": 1.4360834088848597e-05, + "loss": 0.996, + "step": 198 + }, + { + "epoch": 0.09020852221214869, + "grad_norm": 1.4560313524567314, + "learning_rate": 1.4433363553943791e-05, + "loss": 1.0033, + "step": 199 + }, + { + "epoch": 0.09066183136899365, + "grad_norm": 1.1422298759333185, + "learning_rate": 1.4505893019038985e-05, + "loss": 0.9744, + "step": 200 + }, + { + "epoch": 0.09111514052583862, + "grad_norm": 0.7038514687452473, + "learning_rate": 1.4578422484134181e-05, + "loss": 0.997, + "step": 201 + }, + { + "epoch": 0.09156844968268359, + "grad_norm": 1.1676043384874575, + "learning_rate": 1.4650951949229375e-05, + "loss": 0.9795, + "step": 202 + }, + { + "epoch": 0.09202175883952855, + "grad_norm": 1.1845334269071475, + "learning_rate": 1.4723481414324569e-05, + "loss": 0.9782, + "step": 203 + }, + { + "epoch": 0.09247506799637353, + "grad_norm": 0.8555969826316577, + "learning_rate": 1.4796010879419766e-05, + "loss": 0.9908, + "step": 204 + }, + { + "epoch": 0.0929283771532185, + "grad_norm": 0.930955148540943, + "learning_rate": 1.4868540344514962e-05, + "loss": 0.9693, + "step": 205 + }, + { + "epoch": 0.09338168631006347, + "grad_norm": 0.824673597329167, + "learning_rate": 1.4941069809610156e-05, + "loss": 1.0046, + "step": 206 + }, + { + "epoch": 0.09383499546690843, + "grad_norm": 0.9628763437161365, + "learning_rate": 1.501359927470535e-05, + "loss": 0.9728, + "step": 207 + }, + { + "epoch": 0.0942883046237534, + "grad_norm": 1.124075851760181, + "learning_rate": 1.5086128739800546e-05, + "loss": 0.9887, + "step": 208 + }, + { + "epoch": 0.09474161378059837, + "grad_norm": 1.0020541511811916, + "learning_rate": 1.515865820489574e-05, + "loss": 0.9742, + "step": 209 + }, + { + "epoch": 0.09519492293744333, + "grad_norm": 1.428557914150642, + "learning_rate": 1.5231187669990934e-05, + "loss": 0.9868, + "step": 210 + }, + { + "epoch": 0.0956482320942883, + "grad_norm": 0.4816310505349378, + "learning_rate": 1.530371713508613e-05, + "loss": 0.9689, + "step": 211 + }, + { + "epoch": 0.09610154125113328, + "grad_norm": 1.350677712085829, + "learning_rate": 1.5376246600181325e-05, + "loss": 0.9773, + "step": 212 + }, + { + "epoch": 0.09655485040797825, + "grad_norm": 0.8961976530458231, + "learning_rate": 1.544877606527652e-05, + "loss": 0.9582, + "step": 213 + }, + { + "epoch": 0.09700815956482321, + "grad_norm": 0.8280523544487235, + "learning_rate": 1.5521305530371716e-05, + "loss": 1.0079, + "step": 214 + }, + { + "epoch": 0.09746146872166818, + "grad_norm": 1.1733044914183084, + "learning_rate": 1.559383499546691e-05, + "loss": 0.9795, + "step": 215 + }, + { + "epoch": 0.09791477787851315, + "grad_norm": 1.014229491267023, + "learning_rate": 1.5666364460562104e-05, + "loss": 0.9773, + "step": 216 + }, + { + "epoch": 0.09836808703535811, + "grad_norm": 1.1072318673235464, + "learning_rate": 1.57388939256573e-05, + "loss": 0.9792, + "step": 217 + }, + { + "epoch": 0.09882139619220308, + "grad_norm": 0.60564072292483, + "learning_rate": 1.5811423390752492e-05, + "loss": 0.993, + "step": 218 + }, + { + "epoch": 0.09927470534904805, + "grad_norm": 0.920081494826898, + "learning_rate": 1.5883952855847688e-05, + "loss": 0.9778, + "step": 219 + }, + { + "epoch": 0.09972801450589303, + "grad_norm": 1.2326616239964998, + "learning_rate": 1.5956482320942884e-05, + "loss": 0.979, + "step": 220 + }, + { + "epoch": 0.10018132366273799, + "grad_norm": 0.8094055421302522, + "learning_rate": 1.602901178603808e-05, + "loss": 0.9739, + "step": 221 + }, + { + "epoch": 0.10063463281958296, + "grad_norm": 1.0843736866199611, + "learning_rate": 1.6101541251133275e-05, + "loss": 0.9846, + "step": 222 + }, + { + "epoch": 0.10108794197642793, + "grad_norm": 0.8290507828181226, + "learning_rate": 1.617407071622847e-05, + "loss": 0.9699, + "step": 223 + }, + { + "epoch": 0.10154125113327289, + "grad_norm": 0.8769177830467694, + "learning_rate": 1.6246600181323663e-05, + "loss": 1.0043, + "step": 224 + }, + { + "epoch": 0.10199456029011786, + "grad_norm": 0.9536416098834379, + "learning_rate": 1.631912964641886e-05, + "loss": 1.0005, + "step": 225 + }, + { + "epoch": 0.10244786944696282, + "grad_norm": 0.7300608573017586, + "learning_rate": 1.6391659111514055e-05, + "loss": 0.9705, + "step": 226 + }, + { + "epoch": 0.10290117860380779, + "grad_norm": 0.763092364228021, + "learning_rate": 1.6464188576609247e-05, + "loss": 0.9785, + "step": 227 + }, + { + "epoch": 0.10335448776065277, + "grad_norm": 0.8079383183643218, + "learning_rate": 1.6536718041704446e-05, + "loss": 0.9923, + "step": 228 + }, + { + "epoch": 0.10380779691749774, + "grad_norm": 0.8517826259405005, + "learning_rate": 1.660924750679964e-05, + "loss": 0.9818, + "step": 229 + }, + { + "epoch": 0.1042611060743427, + "grad_norm": 0.8998822996749852, + "learning_rate": 1.6681776971894834e-05, + "loss": 0.9849, + "step": 230 + }, + { + "epoch": 0.10471441523118767, + "grad_norm": 1.2408709545670018, + "learning_rate": 1.675430643699003e-05, + "loss": 0.9742, + "step": 231 + }, + { + "epoch": 0.10516772438803264, + "grad_norm": 0.9556970521991514, + "learning_rate": 1.6826835902085222e-05, + "loss": 0.9904, + "step": 232 + }, + { + "epoch": 0.1056210335448776, + "grad_norm": 0.812130685400505, + "learning_rate": 1.6899365367180418e-05, + "loss": 0.9818, + "step": 233 + }, + { + "epoch": 0.10607434270172257, + "grad_norm": 0.4449645505521567, + "learning_rate": 1.6971894832275613e-05, + "loss": 0.9902, + "step": 234 + }, + { + "epoch": 0.10652765185856754, + "grad_norm": 0.7252411381260219, + "learning_rate": 1.7044424297370806e-05, + "loss": 0.9667, + "step": 235 + }, + { + "epoch": 0.10698096101541252, + "grad_norm": 0.9295651078163392, + "learning_rate": 1.7116953762466005e-05, + "loss": 0.9821, + "step": 236 + }, + { + "epoch": 0.10743427017225748, + "grad_norm": 0.9572789366170193, + "learning_rate": 1.71894832275612e-05, + "loss": 0.9636, + "step": 237 + }, + { + "epoch": 0.10788757932910245, + "grad_norm": 0.9985681552390233, + "learning_rate": 1.7262012692656393e-05, + "loss": 0.9577, + "step": 238 + }, + { + "epoch": 0.10834088848594742, + "grad_norm": 1.2711800486169227, + "learning_rate": 1.733454215775159e-05, + "loss": 0.9807, + "step": 239 + }, + { + "epoch": 0.10879419764279238, + "grad_norm": 0.6670602415088438, + "learning_rate": 1.7407071622846784e-05, + "loss": 0.9862, + "step": 240 + }, + { + "epoch": 0.10924750679963735, + "grad_norm": 0.8376552771623001, + "learning_rate": 1.7479601087941977e-05, + "loss": 0.9601, + "step": 241 + }, + { + "epoch": 0.10970081595648232, + "grad_norm": 1.343570555732653, + "learning_rate": 1.7552130553037172e-05, + "loss": 0.9864, + "step": 242 + }, + { + "epoch": 0.11015412511332728, + "grad_norm": 0.735222859876767, + "learning_rate": 1.7624660018132368e-05, + "loss": 0.9788, + "step": 243 + }, + { + "epoch": 0.11060743427017226, + "grad_norm": 1.1559965308172442, + "learning_rate": 1.7697189483227564e-05, + "loss": 0.9786, + "step": 244 + }, + { + "epoch": 0.11106074342701723, + "grad_norm": 1.2826780864038205, + "learning_rate": 1.776971894832276e-05, + "loss": 0.9881, + "step": 245 + }, + { + "epoch": 0.1115140525838622, + "grad_norm": 0.7814106001336353, + "learning_rate": 1.784224841341795e-05, + "loss": 0.9544, + "step": 246 + }, + { + "epoch": 0.11196736174070716, + "grad_norm": 1.6537276432939085, + "learning_rate": 1.7914777878513147e-05, + "loss": 0.9917, + "step": 247 + }, + { + "epoch": 0.11242067089755213, + "grad_norm": 0.6893437570159018, + "learning_rate": 1.7987307343608343e-05, + "loss": 1.003, + "step": 248 + }, + { + "epoch": 0.1128739800543971, + "grad_norm": 1.6849725670625908, + "learning_rate": 1.8059836808703535e-05, + "loss": 0.9828, + "step": 249 + }, + { + "epoch": 0.11332728921124206, + "grad_norm": 1.0060674291542064, + "learning_rate": 1.813236627379873e-05, + "loss": 0.9615, + "step": 250 + }, + { + "epoch": 0.11378059836808703, + "grad_norm": 2.0804817610239903, + "learning_rate": 1.8204895738893927e-05, + "loss": 0.9939, + "step": 251 + }, + { + "epoch": 0.11423390752493201, + "grad_norm": 1.9634789743524923, + "learning_rate": 1.8277425203989122e-05, + "loss": 0.9896, + "step": 252 + }, + { + "epoch": 0.11468721668177698, + "grad_norm": 1.1649779335436878, + "learning_rate": 1.8349954669084318e-05, + "loss": 1.0184, + "step": 253 + }, + { + "epoch": 0.11514052583862194, + "grad_norm": 1.5843507590428396, + "learning_rate": 1.8422484134179514e-05, + "loss": 0.9832, + "step": 254 + }, + { + "epoch": 0.11559383499546691, + "grad_norm": 1.0529136255633502, + "learning_rate": 1.8495013599274706e-05, + "loss": 0.9868, + "step": 255 + }, + { + "epoch": 0.11604714415231188, + "grad_norm": 1.887089900140882, + "learning_rate": 1.8567543064369902e-05, + "loss": 0.9755, + "step": 256 + }, + { + "epoch": 0.11650045330915684, + "grad_norm": 1.5891049260652443, + "learning_rate": 1.8640072529465098e-05, + "loss": 0.9647, + "step": 257 + }, + { + "epoch": 0.11695376246600181, + "grad_norm": 1.2323878574116947, + "learning_rate": 1.871260199456029e-05, + "loss": 0.988, + "step": 258 + }, + { + "epoch": 0.11740707162284678, + "grad_norm": 1.4017061386902538, + "learning_rate": 1.8785131459655486e-05, + "loss": 0.959, + "step": 259 + }, + { + "epoch": 0.11786038077969176, + "grad_norm": 0.9841075490698694, + "learning_rate": 1.885766092475068e-05, + "loss": 0.975, + "step": 260 + }, + { + "epoch": 0.11831368993653672, + "grad_norm": 1.425459904030462, + "learning_rate": 1.8930190389845877e-05, + "loss": 0.9987, + "step": 261 + }, + { + "epoch": 0.11876699909338169, + "grad_norm": 1.0006046414278666, + "learning_rate": 1.9002719854941073e-05, + "loss": 0.9618, + "step": 262 + }, + { + "epoch": 0.11922030825022666, + "grad_norm": 1.463377849964184, + "learning_rate": 1.9075249320036265e-05, + "loss": 0.9887, + "step": 263 + }, + { + "epoch": 0.11967361740707162, + "grad_norm": 1.2191563633199565, + "learning_rate": 1.914777878513146e-05, + "loss": 0.9855, + "step": 264 + }, + { + "epoch": 0.12012692656391659, + "grad_norm": 1.2698795528056874, + "learning_rate": 1.9220308250226656e-05, + "loss": 0.9757, + "step": 265 + }, + { + "epoch": 0.12058023572076156, + "grad_norm": 1.4800822421542457, + "learning_rate": 1.9292837715321852e-05, + "loss": 0.9955, + "step": 266 + }, + { + "epoch": 0.12103354487760652, + "grad_norm": 1.2941761703168384, + "learning_rate": 1.9365367180417044e-05, + "loss": 0.9769, + "step": 267 + }, + { + "epoch": 0.1214868540344515, + "grad_norm": 1.137685934242054, + "learning_rate": 1.9437896645512243e-05, + "loss": 0.9638, + "step": 268 + }, + { + "epoch": 0.12194016319129647, + "grad_norm": 0.9786061476311878, + "learning_rate": 1.9510426110607436e-05, + "loss": 0.9866, + "step": 269 + }, + { + "epoch": 0.12239347234814144, + "grad_norm": 1.14610021536446, + "learning_rate": 1.958295557570263e-05, + "loss": 0.9424, + "step": 270 + }, + { + "epoch": 0.1228467815049864, + "grad_norm": 1.2594427817814677, + "learning_rate": 1.9655485040797827e-05, + "loss": 0.9687, + "step": 271 + }, + { + "epoch": 0.12330009066183137, + "grad_norm": 0.8612758658610435, + "learning_rate": 1.972801450589302e-05, + "loss": 0.9848, + "step": 272 + }, + { + "epoch": 0.12375339981867634, + "grad_norm": 0.8756591780106714, + "learning_rate": 1.9800543970988215e-05, + "loss": 0.9605, + "step": 273 + }, + { + "epoch": 0.1242067089755213, + "grad_norm": 0.7538940670576584, + "learning_rate": 1.987307343608341e-05, + "loss": 0.9873, + "step": 274 + }, + { + "epoch": 0.12466001813236627, + "grad_norm": 0.9105521249427928, + "learning_rate": 1.9945602901178603e-05, + "loss": 0.9761, + "step": 275 + }, + { + "epoch": 0.12511332728921123, + "grad_norm": 1.2082970794151782, + "learning_rate": 2.00181323662738e-05, + "loss": 0.9647, + "step": 276 + }, + { + "epoch": 0.1255666364460562, + "grad_norm": 1.2276525054309486, + "learning_rate": 2.0090661831368995e-05, + "loss": 0.9652, + "step": 277 + }, + { + "epoch": 0.12601994560290117, + "grad_norm": 0.9486143367623325, + "learning_rate": 2.0163191296464187e-05, + "loss": 0.9894, + "step": 278 + }, + { + "epoch": 0.12647325475974613, + "grad_norm": 1.0961709262304196, + "learning_rate": 2.0235720761559383e-05, + "loss": 0.9903, + "step": 279 + }, + { + "epoch": 0.12692656391659113, + "grad_norm": 1.0835334523571587, + "learning_rate": 2.030825022665458e-05, + "loss": 0.9718, + "step": 280 + }, + { + "epoch": 0.1273798730734361, + "grad_norm": 1.5108390987852842, + "learning_rate": 2.0380779691749777e-05, + "loss": 0.9911, + "step": 281 + }, + { + "epoch": 0.12783318223028106, + "grad_norm": 0.6959667630768904, + "learning_rate": 2.0453309156844973e-05, + "loss": 0.9889, + "step": 282 + }, + { + "epoch": 0.12828649138712603, + "grad_norm": 1.1060822358941296, + "learning_rate": 2.0525838621940165e-05, + "loss": 0.9748, + "step": 283 + }, + { + "epoch": 0.128739800543971, + "grad_norm": 1.7081303899139821, + "learning_rate": 2.059836808703536e-05, + "loss": 0.9791, + "step": 284 + }, + { + "epoch": 0.12919310970081596, + "grad_norm": 0.6993825425452259, + "learning_rate": 2.0670897552130557e-05, + "loss": 0.9776, + "step": 285 + }, + { + "epoch": 0.12964641885766093, + "grad_norm": 2.0172542972716916, + "learning_rate": 2.074342701722575e-05, + "loss": 0.9558, + "step": 286 + }, + { + "epoch": 0.1300997280145059, + "grad_norm": 1.126456177113453, + "learning_rate": 2.0815956482320945e-05, + "loss": 0.9888, + "step": 287 + }, + { + "epoch": 0.13055303717135086, + "grad_norm": 2.127455302311884, + "learning_rate": 2.088848594741614e-05, + "loss": 0.9933, + "step": 288 + }, + { + "epoch": 0.13100634632819583, + "grad_norm": 1.693138414413347, + "learning_rate": 2.0961015412511333e-05, + "loss": 0.983, + "step": 289 + }, + { + "epoch": 0.1314596554850408, + "grad_norm": 1.6609776648326933, + "learning_rate": 2.103354487760653e-05, + "loss": 0.9613, + "step": 290 + }, + { + "epoch": 0.13191296464188576, + "grad_norm": 1.6803908880826948, + "learning_rate": 2.1106074342701724e-05, + "loss": 0.9858, + "step": 291 + }, + { + "epoch": 0.13236627379873073, + "grad_norm": 1.3168606047561777, + "learning_rate": 2.1178603807796916e-05, + "loss": 0.989, + "step": 292 + }, + { + "epoch": 0.1328195829555757, + "grad_norm": 1.4157167392194014, + "learning_rate": 2.1251133272892112e-05, + "loss": 0.9664, + "step": 293 + }, + { + "epoch": 0.13327289211242066, + "grad_norm": 1.1724638158572707, + "learning_rate": 2.1323662737987308e-05, + "loss": 0.9722, + "step": 294 + }, + { + "epoch": 0.13372620126926563, + "grad_norm": 1.506810988004093, + "learning_rate": 2.1396192203082504e-05, + "loss": 0.9827, + "step": 295 + }, + { + "epoch": 0.13417951042611062, + "grad_norm": 1.1259386386552064, + "learning_rate": 2.1468721668177703e-05, + "loss": 0.9659, + "step": 296 + }, + { + "epoch": 0.1346328195829556, + "grad_norm": 1.6238681862761164, + "learning_rate": 2.1541251133272895e-05, + "loss": 0.983, + "step": 297 + }, + { + "epoch": 0.13508612873980055, + "grad_norm": 1.5296050415433289, + "learning_rate": 2.161378059836809e-05, + "loss": 0.9803, + "step": 298 + }, + { + "epoch": 0.13553943789664552, + "grad_norm": 1.2091591954175966, + "learning_rate": 2.1686310063463286e-05, + "loss": 0.9696, + "step": 299 + }, + { + "epoch": 0.1359927470534905, + "grad_norm": 1.4703297260596262, + "learning_rate": 2.175883952855848e-05, + "loss": 0.9707, + "step": 300 + }, + { + "epoch": 0.13644605621033545, + "grad_norm": 1.088299707082397, + "learning_rate": 2.1831368993653674e-05, + "loss": 0.9799, + "step": 301 + }, + { + "epoch": 0.13689936536718042, + "grad_norm": 1.6130249012013766, + "learning_rate": 2.190389845874887e-05, + "loss": 0.985, + "step": 302 + }, + { + "epoch": 0.1373526745240254, + "grad_norm": 1.299180705339519, + "learning_rate": 2.1976427923844062e-05, + "loss": 0.9602, + "step": 303 + }, + { + "epoch": 0.13780598368087035, + "grad_norm": 1.3590180237916514, + "learning_rate": 2.2048957388939258e-05, + "loss": 0.9836, + "step": 304 + }, + { + "epoch": 0.13825929283771532, + "grad_norm": 1.274439141028146, + "learning_rate": 2.2121486854034454e-05, + "loss": 0.9868, + "step": 305 + }, + { + "epoch": 0.1387126019945603, + "grad_norm": 1.214938895567437, + "learning_rate": 2.2194016319129646e-05, + "loss": 0.9945, + "step": 306 + }, + { + "epoch": 0.13916591115140525, + "grad_norm": 1.0867048046166174, + "learning_rate": 2.2266545784224842e-05, + "loss": 0.9891, + "step": 307 + }, + { + "epoch": 0.13961922030825022, + "grad_norm": 1.152651322304279, + "learning_rate": 2.2339075249320037e-05, + "loss": 0.9784, + "step": 308 + }, + { + "epoch": 0.14007252946509519, + "grad_norm": 0.9335177965692247, + "learning_rate": 2.2411604714415233e-05, + "loss": 0.9822, + "step": 309 + }, + { + "epoch": 0.14052583862194015, + "grad_norm": 1.2073636881570384, + "learning_rate": 2.2484134179510425e-05, + "loss": 0.9672, + "step": 310 + }, + { + "epoch": 0.14097914777878512, + "grad_norm": 1.0938314217297167, + "learning_rate": 2.255666364460562e-05, + "loss": 0.9764, + "step": 311 + }, + { + "epoch": 0.1414324569356301, + "grad_norm": 1.0785867621298086, + "learning_rate": 2.262919310970082e-05, + "loss": 0.9597, + "step": 312 + }, + { + "epoch": 0.14188576609247508, + "grad_norm": 0.9920753641718812, + "learning_rate": 2.2701722574796016e-05, + "loss": 0.9663, + "step": 313 + }, + { + "epoch": 0.14233907524932005, + "grad_norm": 0.7983365951041685, + "learning_rate": 2.2774252039891208e-05, + "loss": 0.9667, + "step": 314 + }, + { + "epoch": 0.142792384406165, + "grad_norm": 1.146961954359942, + "learning_rate": 2.2846781504986404e-05, + "loss": 0.9607, + "step": 315 + }, + { + "epoch": 0.14324569356300998, + "grad_norm": 0.7097464715104542, + "learning_rate": 2.29193109700816e-05, + "loss": 0.9664, + "step": 316 + }, + { + "epoch": 0.14369900271985495, + "grad_norm": 1.0022962311934114, + "learning_rate": 2.2991840435176792e-05, + "loss": 0.9649, + "step": 317 + }, + { + "epoch": 0.1441523118766999, + "grad_norm": 0.9211817305255298, + "learning_rate": 2.3064369900271988e-05, + "loss": 0.9681, + "step": 318 + }, + { + "epoch": 0.14460562103354488, + "grad_norm": 1.4108296547781745, + "learning_rate": 2.3136899365367183e-05, + "loss": 0.9641, + "step": 319 + }, + { + "epoch": 0.14505893019038985, + "grad_norm": 1.1814512581491157, + "learning_rate": 2.3209428830462376e-05, + "loss": 0.9798, + "step": 320 + }, + { + "epoch": 0.1455122393472348, + "grad_norm": 1.2438981011225425, + "learning_rate": 2.328195829555757e-05, + "loss": 0.986, + "step": 321 + }, + { + "epoch": 0.14596554850407978, + "grad_norm": 0.924826557267048, + "learning_rate": 2.3354487760652767e-05, + "loss": 0.947, + "step": 322 + }, + { + "epoch": 0.14641885766092474, + "grad_norm": 1.0860833968842445, + "learning_rate": 2.3427017225747963e-05, + "loss": 0.9624, + "step": 323 + }, + { + "epoch": 0.1468721668177697, + "grad_norm": 1.0217514014564444, + "learning_rate": 2.3499546690843155e-05, + "loss": 0.977, + "step": 324 + }, + { + "epoch": 0.14732547597461468, + "grad_norm": 1.1674151382952693, + "learning_rate": 2.357207615593835e-05, + "loss": 0.9866, + "step": 325 + }, + { + "epoch": 0.14777878513145964, + "grad_norm": 1.222662801119326, + "learning_rate": 2.3644605621033546e-05, + "loss": 0.9663, + "step": 326 + }, + { + "epoch": 0.1482320942883046, + "grad_norm": 1.1876386584855878, + "learning_rate": 2.371713508612874e-05, + "loss": 0.9822, + "step": 327 + }, + { + "epoch": 0.1486854034451496, + "grad_norm": 1.7198416796464207, + "learning_rate": 2.3789664551223938e-05, + "loss": 0.9473, + "step": 328 + }, + { + "epoch": 0.14913871260199457, + "grad_norm": 0.7915454311641326, + "learning_rate": 2.3862194016319134e-05, + "loss": 0.9725, + "step": 329 + }, + { + "epoch": 0.14959202175883954, + "grad_norm": 2.3746230624113895, + "learning_rate": 2.393472348141433e-05, + "loss": 0.9798, + "step": 330 + }, + { + "epoch": 0.1500453309156845, + "grad_norm": 1.5656498130662204, + "learning_rate": 2.400725294650952e-05, + "loss": 0.9755, + "step": 331 + }, + { + "epoch": 0.15049864007252947, + "grad_norm": 2.30620570887288, + "learning_rate": 2.4079782411604717e-05, + "loss": 0.9673, + "step": 332 + }, + { + "epoch": 0.15095194922937444, + "grad_norm": 2.2084039737648116, + "learning_rate": 2.4152311876699913e-05, + "loss": 0.9932, + "step": 333 + }, + { + "epoch": 0.1514052583862194, + "grad_norm": 1.35532113659019, + "learning_rate": 2.4224841341795105e-05, + "loss": 0.9762, + "step": 334 + }, + { + "epoch": 0.15185856754306437, + "grad_norm": 1.8516249415137251, + "learning_rate": 2.42973708068903e-05, + "loss": 0.9902, + "step": 335 + }, + { + "epoch": 0.15231187669990934, + "grad_norm": 0.9894275372304367, + "learning_rate": 2.4369900271985497e-05, + "loss": 0.9772, + "step": 336 + }, + { + "epoch": 0.1527651858567543, + "grad_norm": 1.9629261005028331, + "learning_rate": 2.4442429737080692e-05, + "loss": 0.9646, + "step": 337 + }, + { + "epoch": 0.15321849501359927, + "grad_norm": 1.630421617469693, + "learning_rate": 2.4514959202175885e-05, + "loss": 0.9771, + "step": 338 + }, + { + "epoch": 0.15367180417044424, + "grad_norm": 1.8564933288855712, + "learning_rate": 2.458748866727108e-05, + "loss": 0.9571, + "step": 339 + }, + { + "epoch": 0.1541251133272892, + "grad_norm": 1.5492286378045599, + "learning_rate": 2.4660018132366276e-05, + "loss": 0.9696, + "step": 340 + }, + { + "epoch": 0.15457842248413417, + "grad_norm": 1.486217931479215, + "learning_rate": 2.473254759746147e-05, + "loss": 0.9867, + "step": 341 + }, + { + "epoch": 0.15503173164097914, + "grad_norm": 1.388439102854886, + "learning_rate": 2.4805077062556664e-05, + "loss": 0.9458, + "step": 342 + }, + { + "epoch": 0.1554850407978241, + "grad_norm": 1.2965776663451756, + "learning_rate": 2.487760652765186e-05, + "loss": 0.9591, + "step": 343 + }, + { + "epoch": 0.1559383499546691, + "grad_norm": 1.2478183763765311, + "learning_rate": 2.495013599274706e-05, + "loss": 0.9543, + "step": 344 + }, + { + "epoch": 0.15639165911151406, + "grad_norm": 0.85743320943999, + "learning_rate": 2.502266545784225e-05, + "loss": 0.982, + "step": 345 + }, + { + "epoch": 0.15684496826835903, + "grad_norm": 1.6825091253771363, + "learning_rate": 2.5095194922937447e-05, + "loss": 0.9658, + "step": 346 + }, + { + "epoch": 0.157298277425204, + "grad_norm": 1.1383786372012867, + "learning_rate": 2.5167724388032643e-05, + "loss": 0.9673, + "step": 347 + }, + { + "epoch": 0.15775158658204896, + "grad_norm": 1.2181311015413976, + "learning_rate": 2.5240253853127835e-05, + "loss": 0.975, + "step": 348 + }, + { + "epoch": 0.15820489573889393, + "grad_norm": 1.0559719089348336, + "learning_rate": 2.531278331822303e-05, + "loss": 0.9801, + "step": 349 + }, + { + "epoch": 0.1586582048957389, + "grad_norm": 1.090044494260907, + "learning_rate": 2.5385312783318226e-05, + "loss": 0.9279, + "step": 350 + }, + { + "epoch": 0.15911151405258386, + "grad_norm": 1.6691200224256781, + "learning_rate": 2.5457842248413422e-05, + "loss": 0.9802, + "step": 351 + }, + { + "epoch": 0.15956482320942883, + "grad_norm": 0.9696518951664035, + "learning_rate": 2.5530371713508614e-05, + "loss": 0.9394, + "step": 352 + }, + { + "epoch": 0.1600181323662738, + "grad_norm": 1.410040240273291, + "learning_rate": 2.560290117860381e-05, + "loss": 0.9638, + "step": 353 + }, + { + "epoch": 0.16047144152311876, + "grad_norm": 1.1460934676665542, + "learning_rate": 2.5675430643699006e-05, + "loss": 0.9613, + "step": 354 + }, + { + "epoch": 0.16092475067996373, + "grad_norm": 1.4017681591240083, + "learning_rate": 2.5747960108794198e-05, + "loss": 0.9737, + "step": 355 + }, + { + "epoch": 0.1613780598368087, + "grad_norm": 1.0099084176329762, + "learning_rate": 2.5820489573889394e-05, + "loss": 0.9812, + "step": 356 + }, + { + "epoch": 0.16183136899365366, + "grad_norm": 1.1334909486260827, + "learning_rate": 2.589301903898459e-05, + "loss": 0.9628, + "step": 357 + }, + { + "epoch": 0.16228467815049863, + "grad_norm": 1.0710329320227614, + "learning_rate": 2.596554850407978e-05, + "loss": 0.9621, + "step": 358 + }, + { + "epoch": 0.1627379873073436, + "grad_norm": 0.875927793017003, + "learning_rate": 2.6038077969174977e-05, + "loss": 0.9619, + "step": 359 + }, + { + "epoch": 0.1631912964641886, + "grad_norm": 1.0576806114027948, + "learning_rate": 2.6110607434270176e-05, + "loss": 0.978, + "step": 360 + }, + { + "epoch": 0.16364460562103356, + "grad_norm": 0.9739823039410348, + "learning_rate": 2.6183136899365372e-05, + "loss": 0.9654, + "step": 361 + }, + { + "epoch": 0.16409791477787852, + "grad_norm": 1.0171786127394473, + "learning_rate": 2.6255666364460564e-05, + "loss": 0.9782, + "step": 362 + }, + { + "epoch": 0.1645512239347235, + "grad_norm": 1.2685989662829482, + "learning_rate": 2.632819582955576e-05, + "loss": 0.9664, + "step": 363 + }, + { + "epoch": 0.16500453309156846, + "grad_norm": 1.3795232726736537, + "learning_rate": 2.6400725294650956e-05, + "loss": 0.9466, + "step": 364 + }, + { + "epoch": 0.16545784224841342, + "grad_norm": 1.143035224843098, + "learning_rate": 2.6473254759746148e-05, + "loss": 0.9764, + "step": 365 + }, + { + "epoch": 0.1659111514052584, + "grad_norm": 1.035017254464268, + "learning_rate": 2.6545784224841344e-05, + "loss": 0.9614, + "step": 366 + }, + { + "epoch": 0.16636446056210336, + "grad_norm": 1.2512499706125675, + "learning_rate": 2.661831368993654e-05, + "loss": 0.9669, + "step": 367 + }, + { + "epoch": 0.16681776971894832, + "grad_norm": 1.434821260736265, + "learning_rate": 2.6690843155031735e-05, + "loss": 0.9628, + "step": 368 + }, + { + "epoch": 0.1672710788757933, + "grad_norm": 0.9861234853952395, + "learning_rate": 2.6763372620126928e-05, + "loss": 0.9353, + "step": 369 + }, + { + "epoch": 0.16772438803263826, + "grad_norm": 1.657770515919325, + "learning_rate": 2.6835902085222123e-05, + "loss": 0.9765, + "step": 370 + }, + { + "epoch": 0.16817769718948322, + "grad_norm": 0.784806763311947, + "learning_rate": 2.690843155031732e-05, + "loss": 0.9748, + "step": 371 + }, + { + "epoch": 0.1686310063463282, + "grad_norm": 1.8103571432207946, + "learning_rate": 2.698096101541251e-05, + "loss": 0.9728, + "step": 372 + }, + { + "epoch": 0.16908431550317315, + "grad_norm": 1.2109196259265576, + "learning_rate": 2.7053490480507707e-05, + "loss": 0.9528, + "step": 373 + }, + { + "epoch": 0.16953762466001812, + "grad_norm": 1.755987961675713, + "learning_rate": 2.7126019945602903e-05, + "loss": 0.955, + "step": 374 + }, + { + "epoch": 0.1699909338168631, + "grad_norm": 1.47446086150359, + "learning_rate": 2.7198549410698095e-05, + "loss": 0.9719, + "step": 375 + }, + { + "epoch": 0.17044424297370808, + "grad_norm": 1.4309535484584595, + "learning_rate": 2.7271078875793294e-05, + "loss": 0.9636, + "step": 376 + }, + { + "epoch": 0.17089755213055305, + "grad_norm": 1.3375420224856274, + "learning_rate": 2.734360834088849e-05, + "loss": 0.9496, + "step": 377 + }, + { + "epoch": 0.17135086128739802, + "grad_norm": 1.4029269470347197, + "learning_rate": 2.7416137805983685e-05, + "loss": 0.9686, + "step": 378 + }, + { + "epoch": 0.17180417044424298, + "grad_norm": 1.1303986834868789, + "learning_rate": 2.7488667271078878e-05, + "loss": 0.9509, + "step": 379 + }, + { + "epoch": 0.17225747960108795, + "grad_norm": 1.2601602176914741, + "learning_rate": 2.7561196736174073e-05, + "loss": 0.94, + "step": 380 + }, + { + "epoch": 0.17271078875793291, + "grad_norm": 1.0864166540196005, + "learning_rate": 2.763372620126927e-05, + "loss": 0.9515, + "step": 381 + }, + { + "epoch": 0.17316409791477788, + "grad_norm": 1.4109073582275067, + "learning_rate": 2.7706255666364465e-05, + "loss": 0.9492, + "step": 382 + }, + { + "epoch": 0.17361740707162285, + "grad_norm": 1.019247148616662, + "learning_rate": 2.7778785131459657e-05, + "loss": 0.9542, + "step": 383 + }, + { + "epoch": 0.17407071622846781, + "grad_norm": 1.4792739894601246, + "learning_rate": 2.7851314596554853e-05, + "loss": 0.9812, + "step": 384 + }, + { + "epoch": 0.17452402538531278, + "grad_norm": 1.1364851097800064, + "learning_rate": 2.792384406165005e-05, + "loss": 0.9637, + "step": 385 + }, + { + "epoch": 0.17497733454215775, + "grad_norm": 1.1947650453129484, + "learning_rate": 2.799637352674524e-05, + "loss": 0.9531, + "step": 386 + }, + { + "epoch": 0.17543064369900271, + "grad_norm": 1.192453878730054, + "learning_rate": 2.8068902991840437e-05, + "loss": 0.975, + "step": 387 + }, + { + "epoch": 0.17588395285584768, + "grad_norm": 1.3248162957588472, + "learning_rate": 2.8141432456935632e-05, + "loss": 0.9646, + "step": 388 + }, + { + "epoch": 0.17633726201269265, + "grad_norm": 1.2805823370885974, + "learning_rate": 2.8213961922030825e-05, + "loss": 0.9403, + "step": 389 + }, + { + "epoch": 0.1767905711695376, + "grad_norm": 1.266465542509188, + "learning_rate": 2.828649138712602e-05, + "loss": 0.9736, + "step": 390 + }, + { + "epoch": 0.17724388032638258, + "grad_norm": 2.0371687395076705, + "learning_rate": 2.8359020852221216e-05, + "loss": 0.958, + "step": 391 + }, + { + "epoch": 0.17769718948322757, + "grad_norm": 0.9297561826921558, + "learning_rate": 2.8431550317316415e-05, + "loss": 0.9381, + "step": 392 + }, + { + "epoch": 0.17815049864007254, + "grad_norm": 2.7848167599641656, + "learning_rate": 2.8504079782411607e-05, + "loss": 0.9506, + "step": 393 + }, + { + "epoch": 0.1786038077969175, + "grad_norm": 2.1763201975486255, + "learning_rate": 2.8576609247506803e-05, + "loss": 0.9662, + "step": 394 + }, + { + "epoch": 0.17905711695376247, + "grad_norm": 2.4169897403665606, + "learning_rate": 2.8649138712602e-05, + "loss": 0.95, + "step": 395 + }, + { + "epoch": 0.17951042611060744, + "grad_norm": 1.674867088697692, + "learning_rate": 2.8721668177697194e-05, + "loss": 0.9526, + "step": 396 + }, + { + "epoch": 0.1799637352674524, + "grad_norm": 2.651554107237099, + "learning_rate": 2.8794197642792387e-05, + "loss": 0.9594, + "step": 397 + }, + { + "epoch": 0.18041704442429737, + "grad_norm": 1.6943939649347142, + "learning_rate": 2.8866727107887582e-05, + "loss": 0.9418, + "step": 398 + }, + { + "epoch": 0.18087035358114234, + "grad_norm": 3.24733464439908, + "learning_rate": 2.8939256572982778e-05, + "loss": 0.9807, + "step": 399 + }, + { + "epoch": 0.1813236627379873, + "grad_norm": 2.848802228919831, + "learning_rate": 2.901178603807797e-05, + "loss": 0.96, + "step": 400 + }, + { + "epoch": 0.18177697189483227, + "grad_norm": 2.3903543908947307, + "learning_rate": 2.9084315503173166e-05, + "loss": 0.9622, + "step": 401 + }, + { + "epoch": 0.18223028105167724, + "grad_norm": 2.570811576203462, + "learning_rate": 2.9156844968268362e-05, + "loss": 0.9618, + "step": 402 + }, + { + "epoch": 0.1826835902085222, + "grad_norm": 1.7840259015514057, + "learning_rate": 2.9229374433363554e-05, + "loss": 0.9641, + "step": 403 + }, + { + "epoch": 0.18313689936536717, + "grad_norm": 1.9099188763348687, + "learning_rate": 2.930190389845875e-05, + "loss": 0.9565, + "step": 404 + }, + { + "epoch": 0.18359020852221214, + "grad_norm": 1.9098174286301464, + "learning_rate": 2.9374433363553945e-05, + "loss": 0.9709, + "step": 405 + }, + { + "epoch": 0.1840435176790571, + "grad_norm": 1.3590025674105495, + "learning_rate": 2.9446962828649138e-05, + "loss": 0.9554, + "step": 406 + }, + { + "epoch": 0.18449682683590207, + "grad_norm": 1.7368624137802966, + "learning_rate": 2.9519492293744334e-05, + "loss": 0.9706, + "step": 407 + }, + { + "epoch": 0.18495013599274707, + "grad_norm": 1.3745574589171, + "learning_rate": 2.9592021758839533e-05, + "loss": 0.9597, + "step": 408 + }, + { + "epoch": 0.18540344514959203, + "grad_norm": 1.4778064141083451, + "learning_rate": 2.9664551223934728e-05, + "loss": 0.9602, + "step": 409 + }, + { + "epoch": 0.185856754306437, + "grad_norm": 1.0653730846884035, + "learning_rate": 2.9737080689029924e-05, + "loss": 0.9636, + "step": 410 + }, + { + "epoch": 0.18631006346328197, + "grad_norm": 2.1234201480340276, + "learning_rate": 2.9809610154125116e-05, + "loss": 0.9707, + "step": 411 + }, + { + "epoch": 0.18676337262012693, + "grad_norm": 1.0590017972746357, + "learning_rate": 2.9882139619220312e-05, + "loss": 0.9585, + "step": 412 + }, + { + "epoch": 0.1872166817769719, + "grad_norm": 2.7210232300497235, + "learning_rate": 2.9954669084315508e-05, + "loss": 0.9452, + "step": 413 + }, + { + "epoch": 0.18766999093381687, + "grad_norm": 2.1877292119928664, + "learning_rate": 3.00271985494107e-05, + "loss": 0.9723, + "step": 414 + }, + { + "epoch": 0.18812330009066183, + "grad_norm": 2.0841889264754125, + "learning_rate": 3.0099728014505896e-05, + "loss": 0.989, + "step": 415 + }, + { + "epoch": 0.1885766092475068, + "grad_norm": 1.8469443919409643, + "learning_rate": 3.017225747960109e-05, + "loss": 0.9734, + "step": 416 + }, + { + "epoch": 0.18902991840435177, + "grad_norm": 1.6506258498520425, + "learning_rate": 3.0244786944696284e-05, + "loss": 0.9404, + "step": 417 + }, + { + "epoch": 0.18948322756119673, + "grad_norm": 1.923147191356826, + "learning_rate": 3.031731640979148e-05, + "loss": 0.9625, + "step": 418 + }, + { + "epoch": 0.1899365367180417, + "grad_norm": 1.2048827962157127, + "learning_rate": 3.0389845874886675e-05, + "loss": 0.9528, + "step": 419 + }, + { + "epoch": 0.19038984587488667, + "grad_norm": 2.4279715097804995, + "learning_rate": 3.0462375339981867e-05, + "loss": 0.974, + "step": 420 + }, + { + "epoch": 0.19084315503173163, + "grad_norm": 1.6050707457366387, + "learning_rate": 3.053490480507706e-05, + "loss": 0.9527, + "step": 421 + }, + { + "epoch": 0.1912964641885766, + "grad_norm": 2.541110518933097, + "learning_rate": 3.060743427017226e-05, + "loss": 0.962, + "step": 422 + }, + { + "epoch": 0.19174977334542156, + "grad_norm": 1.836933897500519, + "learning_rate": 3.0679963735267454e-05, + "loss": 0.9794, + "step": 423 + }, + { + "epoch": 0.19220308250226656, + "grad_norm": 2.4305183812740028, + "learning_rate": 3.075249320036265e-05, + "loss": 0.984, + "step": 424 + }, + { + "epoch": 0.19265639165911153, + "grad_norm": 1.5581896763457288, + "learning_rate": 3.0825022665457846e-05, + "loss": 0.9655, + "step": 425 + }, + { + "epoch": 0.1931097008159565, + "grad_norm": 2.58055892095061, + "learning_rate": 3.089755213055304e-05, + "loss": 0.9613, + "step": 426 + }, + { + "epoch": 0.19356300997280146, + "grad_norm": 1.9075019676816203, + "learning_rate": 3.097008159564824e-05, + "loss": 0.968, + "step": 427 + }, + { + "epoch": 0.19401631912964643, + "grad_norm": 2.752592179918321, + "learning_rate": 3.104261106074343e-05, + "loss": 0.9619, + "step": 428 + }, + { + "epoch": 0.1944696282864914, + "grad_norm": 2.1888998537634015, + "learning_rate": 3.111514052583862e-05, + "loss": 0.9445, + "step": 429 + }, + { + "epoch": 0.19492293744333636, + "grad_norm": 2.4776533717422744, + "learning_rate": 3.118766999093382e-05, + "loss": 0.9449, + "step": 430 + }, + { + "epoch": 0.19537624660018132, + "grad_norm": 2.3166200470254923, + "learning_rate": 3.126019945602901e-05, + "loss": 0.9533, + "step": 431 + }, + { + "epoch": 0.1958295557570263, + "grad_norm": 2.0543035818440196, + "learning_rate": 3.133272892112421e-05, + "loss": 0.9733, + "step": 432 + }, + { + "epoch": 0.19628286491387126, + "grad_norm": 1.725738023355161, + "learning_rate": 3.1405258386219405e-05, + "loss": 0.9399, + "step": 433 + }, + { + "epoch": 0.19673617407071622, + "grad_norm": 2.190044694058987, + "learning_rate": 3.14777878513146e-05, + "loss": 0.9537, + "step": 434 + }, + { + "epoch": 0.1971894832275612, + "grad_norm": 1.681817873763034, + "learning_rate": 3.1550317316409796e-05, + "loss": 0.9815, + "step": 435 + }, + { + "epoch": 0.19764279238440616, + "grad_norm": 2.4082255773002337, + "learning_rate": 3.1622846781504985e-05, + "loss": 0.9555, + "step": 436 + }, + { + "epoch": 0.19809610154125112, + "grad_norm": 1.6480505623629258, + "learning_rate": 3.169537624660018e-05, + "loss": 0.9342, + "step": 437 + }, + { + "epoch": 0.1985494106980961, + "grad_norm": 2.5776498102159335, + "learning_rate": 3.1767905711695376e-05, + "loss": 0.957, + "step": 438 + }, + { + "epoch": 0.19900271985494106, + "grad_norm": 1.9078277283950271, + "learning_rate": 3.184043517679057e-05, + "loss": 0.9373, + "step": 439 + }, + { + "epoch": 0.19945602901178605, + "grad_norm": 2.568161324239812, + "learning_rate": 3.191296464188577e-05, + "loss": 0.9794, + "step": 440 + }, + { + "epoch": 0.19990933816863102, + "grad_norm": 2.2894333232044555, + "learning_rate": 3.1985494106980963e-05, + "loss": 0.9586, + "step": 441 + }, + { + "epoch": 0.20036264732547598, + "grad_norm": 2.123231739178061, + "learning_rate": 3.205802357207616e-05, + "loss": 0.9494, + "step": 442 + }, + { + "epoch": 0.20081595648232095, + "grad_norm": 2.0163618298621904, + "learning_rate": 3.2130553037171355e-05, + "loss": 0.946, + "step": 443 + }, + { + "epoch": 0.20126926563916592, + "grad_norm": 2.1520317891360015, + "learning_rate": 3.220308250226655e-05, + "loss": 0.9458, + "step": 444 + }, + { + "epoch": 0.20172257479601088, + "grad_norm": 1.8116655801577795, + "learning_rate": 3.2275611967361746e-05, + "loss": 0.9492, + "step": 445 + }, + { + "epoch": 0.20217588395285585, + "grad_norm": 2.009897325477093, + "learning_rate": 3.234814143245694e-05, + "loss": 0.9541, + "step": 446 + }, + { + "epoch": 0.20262919310970082, + "grad_norm": 1.5442159378463132, + "learning_rate": 3.242067089755213e-05, + "loss": 0.9457, + "step": 447 + }, + { + "epoch": 0.20308250226654578, + "grad_norm": 2.173506011999066, + "learning_rate": 3.2493200362647327e-05, + "loss": 0.9721, + "step": 448 + }, + { + "epoch": 0.20353581142339075, + "grad_norm": 1.643863843059418, + "learning_rate": 3.256572982774252e-05, + "loss": 0.9632, + "step": 449 + }, + { + "epoch": 0.20398912058023572, + "grad_norm": 2.209913866183974, + "learning_rate": 3.263825929283772e-05, + "loss": 0.9707, + "step": 450 + }, + { + "epoch": 0.20444242973708068, + "grad_norm": 1.661173082303144, + "learning_rate": 3.2710788757932914e-05, + "loss": 0.9391, + "step": 451 + }, + { + "epoch": 0.20489573889392565, + "grad_norm": 2.253982197167722, + "learning_rate": 3.278331822302811e-05, + "loss": 0.9684, + "step": 452 + }, + { + "epoch": 0.20534904805077062, + "grad_norm": 2.0170203884876687, + "learning_rate": 3.28558476881233e-05, + "loss": 0.9648, + "step": 453 + }, + { + "epoch": 0.20580235720761558, + "grad_norm": 1.7536772328596786, + "learning_rate": 3.2928377153218494e-05, + "loss": 0.9788, + "step": 454 + }, + { + "epoch": 0.20625566636446055, + "grad_norm": 1.4875251459649068, + "learning_rate": 3.300090661831369e-05, + "loss": 0.9608, + "step": 455 + }, + { + "epoch": 0.20670897552130554, + "grad_norm": 2.0338352212820685, + "learning_rate": 3.307343608340889e-05, + "loss": 0.9463, + "step": 456 + }, + { + "epoch": 0.2071622846781505, + "grad_norm": 1.626294211075029, + "learning_rate": 3.314596554850408e-05, + "loss": 0.9316, + "step": 457 + }, + { + "epoch": 0.20761559383499548, + "grad_norm": 1.91432661999181, + "learning_rate": 3.321849501359928e-05, + "loss": 0.9559, + "step": 458 + }, + { + "epoch": 0.20806890299184044, + "grad_norm": 1.73936737312804, + "learning_rate": 3.329102447869447e-05, + "loss": 0.9504, + "step": 459 + }, + { + "epoch": 0.2085222121486854, + "grad_norm": 1.8456621164350187, + "learning_rate": 3.336355394378967e-05, + "loss": 0.9613, + "step": 460 + }, + { + "epoch": 0.20897552130553038, + "grad_norm": 1.5793257081467367, + "learning_rate": 3.3436083408884864e-05, + "loss": 0.9824, + "step": 461 + }, + { + "epoch": 0.20942883046237534, + "grad_norm": 1.716702803775347, + "learning_rate": 3.350861287398006e-05, + "loss": 0.955, + "step": 462 + }, + { + "epoch": 0.2098821396192203, + "grad_norm": 1.5963732824544927, + "learning_rate": 3.3581142339075255e-05, + "loss": 0.9643, + "step": 463 + }, + { + "epoch": 0.21033544877606528, + "grad_norm": 1.576318357655231, + "learning_rate": 3.3653671804170444e-05, + "loss": 0.939, + "step": 464 + }, + { + "epoch": 0.21078875793291024, + "grad_norm": 1.3341066719418122, + "learning_rate": 3.372620126926564e-05, + "loss": 0.966, + "step": 465 + }, + { + "epoch": 0.2112420670897552, + "grad_norm": 1.577007332868618, + "learning_rate": 3.3798730734360836e-05, + "loss": 0.9605, + "step": 466 + }, + { + "epoch": 0.21169537624660018, + "grad_norm": 1.2438279883518881, + "learning_rate": 3.387126019945603e-05, + "loss": 0.95, + "step": 467 + }, + { + "epoch": 0.21214868540344514, + "grad_norm": 1.7311023717383647, + "learning_rate": 3.394378966455123e-05, + "loss": 0.9691, + "step": 468 + }, + { + "epoch": 0.2126019945602901, + "grad_norm": 1.3532327479345592, + "learning_rate": 3.401631912964642e-05, + "loss": 0.9519, + "step": 469 + }, + { + "epoch": 0.21305530371713507, + "grad_norm": 1.4283240442390355, + "learning_rate": 3.408884859474161e-05, + "loss": 0.9568, + "step": 470 + }, + { + "epoch": 0.21350861287398004, + "grad_norm": 1.1996727896280002, + "learning_rate": 3.416137805983681e-05, + "loss": 0.9535, + "step": 471 + }, + { + "epoch": 0.21396192203082504, + "grad_norm": 1.454659709827934, + "learning_rate": 3.423390752493201e-05, + "loss": 0.96, + "step": 472 + }, + { + "epoch": 0.21441523118767, + "grad_norm": 1.4237580965066585, + "learning_rate": 3.4306436990027205e-05, + "loss": 0.9482, + "step": 473 + }, + { + "epoch": 0.21486854034451497, + "grad_norm": 1.2459222998644557, + "learning_rate": 3.43789664551224e-05, + "loss": 0.9298, + "step": 474 + }, + { + "epoch": 0.21532184950135994, + "grad_norm": 1.2253481611987795, + "learning_rate": 3.445149592021759e-05, + "loss": 0.9601, + "step": 475 + }, + { + "epoch": 0.2157751586582049, + "grad_norm": 1.459057401423948, + "learning_rate": 3.4524025385312786e-05, + "loss": 0.9606, + "step": 476 + }, + { + "epoch": 0.21622846781504987, + "grad_norm": 1.1233323514766091, + "learning_rate": 3.459655485040798e-05, + "loss": 0.9721, + "step": 477 + }, + { + "epoch": 0.21668177697189483, + "grad_norm": 1.3043576100009113, + "learning_rate": 3.466908431550318e-05, + "loss": 0.9525, + "step": 478 + }, + { + "epoch": 0.2171350861287398, + "grad_norm": 1.3832254596026332, + "learning_rate": 3.474161378059837e-05, + "loss": 0.9594, + "step": 479 + }, + { + "epoch": 0.21758839528558477, + "grad_norm": 1.2584601603856145, + "learning_rate": 3.481414324569357e-05, + "loss": 0.9455, + "step": 480 + }, + { + "epoch": 0.21804170444242973, + "grad_norm": 1.3025648768868698, + "learning_rate": 3.488667271078876e-05, + "loss": 0.9502, + "step": 481 + }, + { + "epoch": 0.2184950135992747, + "grad_norm": 1.275222027612667, + "learning_rate": 3.495920217588395e-05, + "loss": 0.9436, + "step": 482 + }, + { + "epoch": 0.21894832275611967, + "grad_norm": 1.7548243760034752, + "learning_rate": 3.503173164097915e-05, + "loss": 0.9569, + "step": 483 + }, + { + "epoch": 0.21940163191296463, + "grad_norm": 0.8399736419025778, + "learning_rate": 3.5104261106074345e-05, + "loss": 0.9551, + "step": 484 + }, + { + "epoch": 0.2198549410698096, + "grad_norm": 1.129363645823298, + "learning_rate": 3.517679057116954e-05, + "loss": 0.9859, + "step": 485 + }, + { + "epoch": 0.22030825022665457, + "grad_norm": 1.1626536011306632, + "learning_rate": 3.5249320036264736e-05, + "loss": 0.9453, + "step": 486 + }, + { + "epoch": 0.22076155938349953, + "grad_norm": 1.864673231136819, + "learning_rate": 3.532184950135993e-05, + "loss": 0.9649, + "step": 487 + }, + { + "epoch": 0.22121486854034453, + "grad_norm": 1.1042580111971072, + "learning_rate": 3.539437896645513e-05, + "loss": 0.9576, + "step": 488 + }, + { + "epoch": 0.2216681776971895, + "grad_norm": 1.1836427905143154, + "learning_rate": 3.546690843155032e-05, + "loss": 0.9517, + "step": 489 + }, + { + "epoch": 0.22212148685403446, + "grad_norm": 2.342247134267181, + "learning_rate": 3.553943789664552e-05, + "loss": 0.9655, + "step": 490 + }, + { + "epoch": 0.22257479601087943, + "grad_norm": 1.2170209970010542, + "learning_rate": 3.5611967361740714e-05, + "loss": 0.9519, + "step": 491 + }, + { + "epoch": 0.2230281051677244, + "grad_norm": 3.6069422122775756, + "learning_rate": 3.56844968268359e-05, + "loss": 0.966, + "step": 492 + }, + { + "epoch": 0.22348141432456936, + "grad_norm": 3.4020997299570754, + "learning_rate": 3.57570262919311e-05, + "loss": 0.9622, + "step": 493 + }, + { + "epoch": 0.22393472348141433, + "grad_norm": 2.0703202379124086, + "learning_rate": 3.5829555757026295e-05, + "loss": 0.9673, + "step": 494 + }, + { + "epoch": 0.2243880326382593, + "grad_norm": 1.8798428035499974, + "learning_rate": 3.590208522212149e-05, + "loss": 0.9524, + "step": 495 + }, + { + "epoch": 0.22484134179510426, + "grad_norm": 2.0689412249061245, + "learning_rate": 3.5974614687216686e-05, + "loss": 0.9547, + "step": 496 + }, + { + "epoch": 0.22529465095194923, + "grad_norm": 1.8412218637991171, + "learning_rate": 3.604714415231188e-05, + "loss": 0.9648, + "step": 497 + }, + { + "epoch": 0.2257479601087942, + "grad_norm": 1.7173059139337934, + "learning_rate": 3.611967361740707e-05, + "loss": 0.9845, + "step": 498 + }, + { + "epoch": 0.22620126926563916, + "grad_norm": 1.622864022129563, + "learning_rate": 3.6192203082502266e-05, + "loss": 0.9494, + "step": 499 + }, + { + "epoch": 0.22665457842248413, + "grad_norm": 1.2183858503471552, + "learning_rate": 3.626473254759746e-05, + "loss": 0.949, + "step": 500 + }, + { + "epoch": 0.2271078875793291, + "grad_norm": 2.020147289846656, + "learning_rate": 3.633726201269266e-05, + "loss": 0.9955, + "step": 501 + }, + { + "epoch": 0.22756119673617406, + "grad_norm": 1.4447638200900164, + "learning_rate": 3.6409791477787854e-05, + "loss": 0.9498, + "step": 502 + }, + { + "epoch": 0.22801450589301903, + "grad_norm": 1.7123218331671592, + "learning_rate": 3.648232094288305e-05, + "loss": 0.9547, + "step": 503 + }, + { + "epoch": 0.22846781504986402, + "grad_norm": 1.1189017955401865, + "learning_rate": 3.6554850407978245e-05, + "loss": 0.958, + "step": 504 + }, + { + "epoch": 0.228921124206709, + "grad_norm": 1.5710798014022602, + "learning_rate": 3.662737987307344e-05, + "loss": 0.9481, + "step": 505 + }, + { + "epoch": 0.22937443336355395, + "grad_norm": 1.0928368581759351, + "learning_rate": 3.6699909338168636e-05, + "loss": 0.9696, + "step": 506 + }, + { + "epoch": 0.22982774252039892, + "grad_norm": 1.4562855441557099, + "learning_rate": 3.677243880326383e-05, + "loss": 0.9558, + "step": 507 + }, + { + "epoch": 0.2302810516772439, + "grad_norm": 1.27946568955478, + "learning_rate": 3.684496826835903e-05, + "loss": 0.9729, + "step": 508 + }, + { + "epoch": 0.23073436083408885, + "grad_norm": 1.3648165411198019, + "learning_rate": 3.691749773345422e-05, + "loss": 0.9641, + "step": 509 + }, + { + "epoch": 0.23118766999093382, + "grad_norm": 1.2389923769002713, + "learning_rate": 3.699002719854941e-05, + "loss": 0.9485, + "step": 510 + }, + { + "epoch": 0.23164097914777879, + "grad_norm": 1.2884338009450194, + "learning_rate": 3.706255666364461e-05, + "loss": 0.9554, + "step": 511 + }, + { + "epoch": 0.23209428830462375, + "grad_norm": 1.2635214843844798, + "learning_rate": 3.7135086128739804e-05, + "loss": 0.96, + "step": 512 + }, + { + "epoch": 0.23254759746146872, + "grad_norm": 1.3449930784708084, + "learning_rate": 3.7207615593835e-05, + "loss": 0.9783, + "step": 513 + }, + { + "epoch": 0.23300090661831369, + "grad_norm": 1.2017070929523874, + "learning_rate": 3.7280145058930195e-05, + "loss": 0.9523, + "step": 514 + }, + { + "epoch": 0.23345421577515865, + "grad_norm": 1.182331614691288, + "learning_rate": 3.7352674524025384e-05, + "loss": 0.9534, + "step": 515 + }, + { + "epoch": 0.23390752493200362, + "grad_norm": 2.144501459609031, + "learning_rate": 3.742520398912058e-05, + "loss": 0.9432, + "step": 516 + }, + { + "epoch": 0.23436083408884859, + "grad_norm": 1.2747142778267662, + "learning_rate": 3.7497733454215775e-05, + "loss": 0.9566, + "step": 517 + }, + { + "epoch": 0.23481414324569355, + "grad_norm": 2.4620074969854047, + "learning_rate": 3.757026291931097e-05, + "loss": 0.9471, + "step": 518 + }, + { + "epoch": 0.23526745240253852, + "grad_norm": 1.8801220243051684, + "learning_rate": 3.764279238440617e-05, + "loss": 0.9402, + "step": 519 + }, + { + "epoch": 0.2357207615593835, + "grad_norm": 2.119286911603709, + "learning_rate": 3.771532184950136e-05, + "loss": 0.9531, + "step": 520 + }, + { + "epoch": 0.23617407071622848, + "grad_norm": 1.1300278682233014, + "learning_rate": 3.778785131459656e-05, + "loss": 0.9375, + "step": 521 + }, + { + "epoch": 0.23662737987307345, + "grad_norm": 1.6033175128383725, + "learning_rate": 3.7860380779691754e-05, + "loss": 0.9422, + "step": 522 + }, + { + "epoch": 0.2370806890299184, + "grad_norm": 1.4232237030481933, + "learning_rate": 3.793291024478695e-05, + "loss": 0.957, + "step": 523 + }, + { + "epoch": 0.23753399818676338, + "grad_norm": 1.3808325787943572, + "learning_rate": 3.8005439709882145e-05, + "loss": 0.9476, + "step": 524 + }, + { + "epoch": 0.23798730734360835, + "grad_norm": 1.6691142411930429, + "learning_rate": 3.807796917497734e-05, + "loss": 0.9373, + "step": 525 + }, + { + "epoch": 0.2384406165004533, + "grad_norm": 1.133448296100496, + "learning_rate": 3.815049864007253e-05, + "loss": 0.9161, + "step": 526 + }, + { + "epoch": 0.23889392565729828, + "grad_norm": 1.926215865516284, + "learning_rate": 3.8223028105167726e-05, + "loss": 0.9573, + "step": 527 + }, + { + "epoch": 0.23934723481414324, + "grad_norm": 1.340633701403204, + "learning_rate": 3.829555757026292e-05, + "loss": 0.9557, + "step": 528 + }, + { + "epoch": 0.2398005439709882, + "grad_norm": 2.1109409055127695, + "learning_rate": 3.836808703535812e-05, + "loss": 0.9671, + "step": 529 + }, + { + "epoch": 0.24025385312783318, + "grad_norm": 1.5979412222712766, + "learning_rate": 3.844061650045331e-05, + "loss": 0.9672, + "step": 530 + }, + { + "epoch": 0.24070716228467814, + "grad_norm": 1.8683967877175744, + "learning_rate": 3.851314596554851e-05, + "loss": 0.939, + "step": 531 + }, + { + "epoch": 0.2411604714415231, + "grad_norm": 1.9360932158405308, + "learning_rate": 3.8585675430643704e-05, + "loss": 0.9697, + "step": 532 + }, + { + "epoch": 0.24161378059836808, + "grad_norm": 1.5259129054713283, + "learning_rate": 3.865820489573889e-05, + "loss": 0.9466, + "step": 533 + }, + { + "epoch": 0.24206708975521304, + "grad_norm": 1.6509895065249705, + "learning_rate": 3.873073436083409e-05, + "loss": 0.9299, + "step": 534 + }, + { + "epoch": 0.242520398912058, + "grad_norm": 1.269177380338361, + "learning_rate": 3.8803263825929284e-05, + "loss": 0.9565, + "step": 535 + }, + { + "epoch": 0.242973708068903, + "grad_norm": 1.0338895898209688, + "learning_rate": 3.887579329102449e-05, + "loss": 0.959, + "step": 536 + }, + { + "epoch": 0.24342701722574797, + "grad_norm": 1.4818339572638375, + "learning_rate": 3.8948322756119676e-05, + "loss": 0.9588, + "step": 537 + }, + { + "epoch": 0.24388032638259294, + "grad_norm": 1.2722675390262228, + "learning_rate": 3.902085222121487e-05, + "loss": 0.9638, + "step": 538 + }, + { + "epoch": 0.2443336355394379, + "grad_norm": 1.3612266051887767, + "learning_rate": 3.909338168631007e-05, + "loss": 0.9274, + "step": 539 + }, + { + "epoch": 0.24478694469628287, + "grad_norm": 1.480495981829154, + "learning_rate": 3.916591115140526e-05, + "loss": 0.9382, + "step": 540 + }, + { + "epoch": 0.24524025385312784, + "grad_norm": 1.2829854988717788, + "learning_rate": 3.923844061650046e-05, + "loss": 0.959, + "step": 541 + }, + { + "epoch": 0.2456935630099728, + "grad_norm": 1.3780139592094205, + "learning_rate": 3.9310970081595654e-05, + "loss": 0.9221, + "step": 542 + }, + { + "epoch": 0.24614687216681777, + "grad_norm": 1.239582924070583, + "learning_rate": 3.938349954669084e-05, + "loss": 0.9503, + "step": 543 + }, + { + "epoch": 0.24660018132366274, + "grad_norm": 1.2936459163175882, + "learning_rate": 3.945602901178604e-05, + "loss": 0.9498, + "step": 544 + }, + { + "epoch": 0.2470534904805077, + "grad_norm": 1.3775933047312408, + "learning_rate": 3.9528558476881235e-05, + "loss": 0.9483, + "step": 545 + }, + { + "epoch": 0.24750679963735267, + "grad_norm": 1.5598119027454163, + "learning_rate": 3.960108794197643e-05, + "loss": 0.9447, + "step": 546 + }, + { + "epoch": 0.24796010879419764, + "grad_norm": 1.1998876824176332, + "learning_rate": 3.9673617407071626e-05, + "loss": 0.969, + "step": 547 + }, + { + "epoch": 0.2484134179510426, + "grad_norm": 1.0134801174318884, + "learning_rate": 3.974614687216682e-05, + "loss": 0.9386, + "step": 548 + }, + { + "epoch": 0.24886672710788757, + "grad_norm": 1.4381390276751846, + "learning_rate": 3.981867633726202e-05, + "loss": 0.9718, + "step": 549 + }, + { + "epoch": 0.24932003626473254, + "grad_norm": 1.4605196802219678, + "learning_rate": 3.9891205802357206e-05, + "loss": 0.9513, + "step": 550 + }, + { + "epoch": 0.2497733454215775, + "grad_norm": 1.4001566651018313, + "learning_rate": 3.99637352674524e-05, + "loss": 0.9295, + "step": 551 + }, + { + "epoch": 0.25022665457842247, + "grad_norm": 1.291957654122112, + "learning_rate": 4.00362647325476e-05, + "loss": 0.9462, + "step": 552 + }, + { + "epoch": 0.25067996373526746, + "grad_norm": 1.1999730788625291, + "learning_rate": 4.01087941976428e-05, + "loss": 0.9489, + "step": 553 + }, + { + "epoch": 0.2511332728921124, + "grad_norm": 1.5790423358996064, + "learning_rate": 4.018132366273799e-05, + "loss": 0.9635, + "step": 554 + }, + { + "epoch": 0.2515865820489574, + "grad_norm": 1.2381244912841012, + "learning_rate": 4.0253853127833185e-05, + "loss": 0.9403, + "step": 555 + }, + { + "epoch": 0.25203989120580234, + "grad_norm": 1.307376724226094, + "learning_rate": 4.0326382592928374e-05, + "loss": 0.9663, + "step": 556 + }, + { + "epoch": 0.25249320036264733, + "grad_norm": 1.3002750678175563, + "learning_rate": 4.0398912058023576e-05, + "loss": 0.9529, + "step": 557 + }, + { + "epoch": 0.25294650951949227, + "grad_norm": 1.6072989036837477, + "learning_rate": 4.0471441523118765e-05, + "loss": 0.9564, + "step": 558 + }, + { + "epoch": 0.25339981867633726, + "grad_norm": 1.0194923838105554, + "learning_rate": 4.054397098821397e-05, + "loss": 0.9326, + "step": 559 + }, + { + "epoch": 0.25385312783318226, + "grad_norm": 1.3024416403067682, + "learning_rate": 4.061650045330916e-05, + "loss": 0.9556, + "step": 560 + }, + { + "epoch": 0.2543064369900272, + "grad_norm": 1.7419771580738306, + "learning_rate": 4.068902991840435e-05, + "loss": 0.9594, + "step": 561 + }, + { + "epoch": 0.2547597461468722, + "grad_norm": 0.8971046707659623, + "learning_rate": 4.0761559383499555e-05, + "loss": 0.9467, + "step": 562 + }, + { + "epoch": 0.25521305530371713, + "grad_norm": 1.645313176607941, + "learning_rate": 4.0834088848594744e-05, + "loss": 0.9312, + "step": 563 + }, + { + "epoch": 0.2556663644605621, + "grad_norm": 1.195448452751475, + "learning_rate": 4.0906618313689946e-05, + "loss": 0.9589, + "step": 564 + }, + { + "epoch": 0.25611967361740706, + "grad_norm": 1.6457482280515048, + "learning_rate": 4.0979147778785135e-05, + "loss": 0.9454, + "step": 565 + }, + { + "epoch": 0.25657298277425206, + "grad_norm": 0.7854798695255296, + "learning_rate": 4.105167724388033e-05, + "loss": 0.9635, + "step": 566 + }, + { + "epoch": 0.257026291931097, + "grad_norm": 1.488885666451011, + "learning_rate": 4.112420670897552e-05, + "loss": 0.9485, + "step": 567 + }, + { + "epoch": 0.257479601087942, + "grad_norm": 1.1775176610973483, + "learning_rate": 4.119673617407072e-05, + "loss": 0.9373, + "step": 568 + }, + { + "epoch": 0.25793291024478693, + "grad_norm": 1.6447196547334462, + "learning_rate": 4.126926563916591e-05, + "loss": 0.9698, + "step": 569 + }, + { + "epoch": 0.2583862194016319, + "grad_norm": 1.064849081769065, + "learning_rate": 4.1341795104261113e-05, + "loss": 0.9679, + "step": 570 + }, + { + "epoch": 0.25883952855847686, + "grad_norm": 1.9217854084732204, + "learning_rate": 4.14143245693563e-05, + "loss": 0.957, + "step": 571 + }, + { + "epoch": 0.25929283771532186, + "grad_norm": 1.2819150792240919, + "learning_rate": 4.14868540344515e-05, + "loss": 0.9487, + "step": 572 + }, + { + "epoch": 0.2597461468721668, + "grad_norm": 1.540776515928187, + "learning_rate": 4.1559383499546694e-05, + "loss": 0.9262, + "step": 573 + }, + { + "epoch": 0.2601994560290118, + "grad_norm": 1.7253705410680575, + "learning_rate": 4.163191296464189e-05, + "loss": 0.96, + "step": 574 + }, + { + "epoch": 0.2606527651858567, + "grad_norm": 1.1089352134676513, + "learning_rate": 4.170444242973708e-05, + "loss": 0.9744, + "step": 575 + }, + { + "epoch": 0.2611060743427017, + "grad_norm": 1.777148657032061, + "learning_rate": 4.177697189483228e-05, + "loss": 0.949, + "step": 576 + }, + { + "epoch": 0.2615593834995467, + "grad_norm": 1.4040459375685417, + "learning_rate": 4.1849501359927477e-05, + "loss": 0.9524, + "step": 577 + }, + { + "epoch": 0.26201269265639165, + "grad_norm": 1.363661384446174, + "learning_rate": 4.1922030825022666e-05, + "loss": 0.9248, + "step": 578 + }, + { + "epoch": 0.26246600181323665, + "grad_norm": 1.6117646709108437, + "learning_rate": 4.199456029011787e-05, + "loss": 0.9726, + "step": 579 + }, + { + "epoch": 0.2629193109700816, + "grad_norm": 1.2476370201670928, + "learning_rate": 4.206708975521306e-05, + "loss": 0.9327, + "step": 580 + }, + { + "epoch": 0.2633726201269266, + "grad_norm": 1.4314195517941304, + "learning_rate": 4.213961922030826e-05, + "loss": 0.9463, + "step": 581 + }, + { + "epoch": 0.2638259292837715, + "grad_norm": 1.5430061204680638, + "learning_rate": 4.221214868540345e-05, + "loss": 0.9456, + "step": 582 + }, + { + "epoch": 0.2642792384406165, + "grad_norm": 1.0488828633476652, + "learning_rate": 4.2284678150498644e-05, + "loss": 0.9476, + "step": 583 + }, + { + "epoch": 0.26473254759746145, + "grad_norm": 2.1197775593819923, + "learning_rate": 4.235720761559383e-05, + "loss": 0.9314, + "step": 584 + }, + { + "epoch": 0.26518585675430645, + "grad_norm": 1.8056138737176077, + "learning_rate": 4.2429737080689035e-05, + "loss": 0.9614, + "step": 585 + }, + { + "epoch": 0.2656391659111514, + "grad_norm": 1.5801111277044997, + "learning_rate": 4.2502266545784224e-05, + "loss": 0.966, + "step": 586 + }, + { + "epoch": 0.2660924750679964, + "grad_norm": 1.5349632629898555, + "learning_rate": 4.257479601087943e-05, + "loss": 0.9573, + "step": 587 + }, + { + "epoch": 0.2665457842248413, + "grad_norm": 1.556743587544592, + "learning_rate": 4.2647325475974616e-05, + "loss": 0.9504, + "step": 588 + }, + { + "epoch": 0.2669990933816863, + "grad_norm": 1.1596358445369075, + "learning_rate": 4.271985494106981e-05, + "loss": 0.9672, + "step": 589 + }, + { + "epoch": 0.26745240253853125, + "grad_norm": 1.195398568484075, + "learning_rate": 4.279238440616501e-05, + "loss": 0.9626, + "step": 590 + }, + { + "epoch": 0.26790571169537625, + "grad_norm": 1.1033787807812672, + "learning_rate": 4.28649138712602e-05, + "loss": 0.9523, + "step": 591 + }, + { + "epoch": 0.26835902085222124, + "grad_norm": 1.0042108309685107, + "learning_rate": 4.2937443336355405e-05, + "loss": 0.9548, + "step": 592 + }, + { + "epoch": 0.2688123300090662, + "grad_norm": 1.9032736640558985, + "learning_rate": 4.3009972801450594e-05, + "loss": 0.9399, + "step": 593 + }, + { + "epoch": 0.2692656391659112, + "grad_norm": 1.136651589882562, + "learning_rate": 4.308250226654579e-05, + "loss": 0.956, + "step": 594 + }, + { + "epoch": 0.2697189483227561, + "grad_norm": 1.7103361967878274, + "learning_rate": 4.315503173164098e-05, + "loss": 0.9482, + "step": 595 + }, + { + "epoch": 0.2701722574796011, + "grad_norm": 1.541116995730545, + "learning_rate": 4.322756119673618e-05, + "loss": 0.9682, + "step": 596 + }, + { + "epoch": 0.27062556663644605, + "grad_norm": 1.1267202170601, + "learning_rate": 4.330009066183137e-05, + "loss": 0.942, + "step": 597 + }, + { + "epoch": 0.27107887579329104, + "grad_norm": 1.565888393703851, + "learning_rate": 4.337262012692657e-05, + "loss": 0.9377, + "step": 598 + }, + { + "epoch": 0.271532184950136, + "grad_norm": 1.4273425430379525, + "learning_rate": 4.344514959202176e-05, + "loss": 0.9437, + "step": 599 + }, + { + "epoch": 0.271985494106981, + "grad_norm": 1.5600849488326662, + "learning_rate": 4.351767905711696e-05, + "loss": 0.9476, + "step": 600 + }, + { + "epoch": 0.2724388032638259, + "grad_norm": 1.135573122780415, + "learning_rate": 4.3590208522212146e-05, + "loss": 0.9428, + "step": 601 + }, + { + "epoch": 0.2728921124206709, + "grad_norm": 1.3490206870238195, + "learning_rate": 4.366273798730735e-05, + "loss": 0.9691, + "step": 602 + }, + { + "epoch": 0.27334542157751585, + "grad_norm": 1.128610946917998, + "learning_rate": 4.373526745240254e-05, + "loss": 0.9489, + "step": 603 + }, + { + "epoch": 0.27379873073436084, + "grad_norm": 1.6060508246099943, + "learning_rate": 4.380779691749774e-05, + "loss": 0.9397, + "step": 604 + }, + { + "epoch": 0.2742520398912058, + "grad_norm": 1.6422481783015415, + "learning_rate": 4.388032638259293e-05, + "loss": 0.9442, + "step": 605 + }, + { + "epoch": 0.2747053490480508, + "grad_norm": 1.1786944961955894, + "learning_rate": 4.3952855847688125e-05, + "loss": 0.9471, + "step": 606 + }, + { + "epoch": 0.2751586582048957, + "grad_norm": 1.4822634265150343, + "learning_rate": 4.402538531278332e-05, + "loss": 0.9806, + "step": 607 + }, + { + "epoch": 0.2756119673617407, + "grad_norm": 1.1157046712485972, + "learning_rate": 4.4097914777878516e-05, + "loss": 0.9593, + "step": 608 + }, + { + "epoch": 0.2760652765185857, + "grad_norm": 1.2696351292897434, + "learning_rate": 4.417044424297372e-05, + "loss": 0.9485, + "step": 609 + }, + { + "epoch": 0.27651858567543064, + "grad_norm": 1.5004429262170311, + "learning_rate": 4.424297370806891e-05, + "loss": 0.9605, + "step": 610 + }, + { + "epoch": 0.27697189483227563, + "grad_norm": 1.3166953584045524, + "learning_rate": 4.43155031731641e-05, + "loss": 0.9308, + "step": 611 + }, + { + "epoch": 0.2774252039891206, + "grad_norm": 1.5383699431969209, + "learning_rate": 4.438803263825929e-05, + "loss": 0.955, + "step": 612 + }, + { + "epoch": 0.27787851314596557, + "grad_norm": 1.3181004348883552, + "learning_rate": 4.4460562103354495e-05, + "loss": 0.9264, + "step": 613 + }, + { + "epoch": 0.2783318223028105, + "grad_norm": 1.4599614054002643, + "learning_rate": 4.4533091568449684e-05, + "loss": 0.9313, + "step": 614 + }, + { + "epoch": 0.2787851314596555, + "grad_norm": 1.23469334382025, + "learning_rate": 4.4605621033544886e-05, + "loss": 0.9534, + "step": 615 + }, + { + "epoch": 0.27923844061650044, + "grad_norm": 1.45190394219971, + "learning_rate": 4.4678150498640075e-05, + "loss": 0.9223, + "step": 616 + }, + { + "epoch": 0.27969174977334543, + "grad_norm": 1.9880340072203195, + "learning_rate": 4.475067996373527e-05, + "loss": 0.9432, + "step": 617 + }, + { + "epoch": 0.28014505893019037, + "grad_norm": 0.7192033033206872, + "learning_rate": 4.4823209428830466e-05, + "loss": 0.9431, + "step": 618 + }, + { + "epoch": 0.28059836808703537, + "grad_norm": 1.4351317849040863, + "learning_rate": 4.489573889392566e-05, + "loss": 0.942, + "step": 619 + }, + { + "epoch": 0.2810516772438803, + "grad_norm": 1.8447619120650107, + "learning_rate": 4.496826835902085e-05, + "loss": 0.9388, + "step": 620 + }, + { + "epoch": 0.2815049864007253, + "grad_norm": 1.2907046235938708, + "learning_rate": 4.504079782411605e-05, + "loss": 0.9585, + "step": 621 + }, + { + "epoch": 0.28195829555757024, + "grad_norm": 1.2251979616107977, + "learning_rate": 4.511332728921124e-05, + "loss": 0.9501, + "step": 622 + }, + { + "epoch": 0.28241160471441523, + "grad_norm": 1.551927285602187, + "learning_rate": 4.518585675430644e-05, + "loss": 0.946, + "step": 623 + }, + { + "epoch": 0.2828649138712602, + "grad_norm": 2.061551751088097, + "learning_rate": 4.525838621940164e-05, + "loss": 0.9285, + "step": 624 + }, + { + "epoch": 0.28331822302810517, + "grad_norm": 0.7512781294983745, + "learning_rate": 4.533091568449683e-05, + "loss": 0.9371, + "step": 625 + }, + { + "epoch": 0.28377153218495016, + "grad_norm": 2.0405974876420636, + "learning_rate": 4.540344514959203e-05, + "loss": 0.9567, + "step": 626 + }, + { + "epoch": 0.2842248413417951, + "grad_norm": 1.4742751324848076, + "learning_rate": 4.547597461468722e-05, + "loss": 0.9422, + "step": 627 + }, + { + "epoch": 0.2846781504986401, + "grad_norm": 1.5328943781014133, + "learning_rate": 4.5548504079782416e-05, + "loss": 0.9357, + "step": 628 + }, + { + "epoch": 0.28513145965548503, + "grad_norm": 1.3232319267999346, + "learning_rate": 4.5621033544877605e-05, + "loss": 0.9566, + "step": 629 + }, + { + "epoch": 0.28558476881233, + "grad_norm": 1.1144389268150887, + "learning_rate": 4.569356300997281e-05, + "loss": 0.9292, + "step": 630 + }, + { + "epoch": 0.28603807796917496, + "grad_norm": 1.2136622195056797, + "learning_rate": 4.5766092475068e-05, + "loss": 0.9519, + "step": 631 + }, + { + "epoch": 0.28649138712601996, + "grad_norm": 1.306717656271579, + "learning_rate": 4.58386219401632e-05, + "loss": 0.9279, + "step": 632 + }, + { + "epoch": 0.2869446962828649, + "grad_norm": 1.4432515955110041, + "learning_rate": 4.591115140525839e-05, + "loss": 0.9349, + "step": 633 + }, + { + "epoch": 0.2873980054397099, + "grad_norm": 1.4164479779407202, + "learning_rate": 4.5983680870353584e-05, + "loss": 0.9474, + "step": 634 + }, + { + "epoch": 0.28785131459655483, + "grad_norm": 1.5294999958974749, + "learning_rate": 4.605621033544878e-05, + "loss": 0.9445, + "step": 635 + }, + { + "epoch": 0.2883046237533998, + "grad_norm": 1.5895802553551033, + "learning_rate": 4.6128739800543975e-05, + "loss": 0.9322, + "step": 636 + }, + { + "epoch": 0.28875793291024476, + "grad_norm": 1.498006153484362, + "learning_rate": 4.6201269265639164e-05, + "loss": 0.9553, + "step": 637 + }, + { + "epoch": 0.28921124206708976, + "grad_norm": 1.1755218537203898, + "learning_rate": 4.627379873073437e-05, + "loss": 0.9383, + "step": 638 + }, + { + "epoch": 0.2896645512239347, + "grad_norm": 1.4137079765174556, + "learning_rate": 4.6346328195829556e-05, + "loss": 0.9408, + "step": 639 + }, + { + "epoch": 0.2901178603807797, + "grad_norm": 1.0248597069673635, + "learning_rate": 4.641885766092475e-05, + "loss": 0.9334, + "step": 640 + }, + { + "epoch": 0.2905711695376247, + "grad_norm": 2.2747519060711303, + "learning_rate": 4.6491387126019954e-05, + "loss": 0.9202, + "step": 641 + }, + { + "epoch": 0.2910244786944696, + "grad_norm": 1.3427678665040443, + "learning_rate": 4.656391659111514e-05, + "loss": 0.9299, + "step": 642 + }, + { + "epoch": 0.2914777878513146, + "grad_norm": 1.6888287052454223, + "learning_rate": 4.6636446056210345e-05, + "loss": 0.9339, + "step": 643 + }, + { + "epoch": 0.29193109700815956, + "grad_norm": 2.0069576737966837, + "learning_rate": 4.6708975521305534e-05, + "loss": 0.9366, + "step": 644 + }, + { + "epoch": 0.29238440616500455, + "grad_norm": 1.0789915284508653, + "learning_rate": 4.678150498640073e-05, + "loss": 0.9531, + "step": 645 + }, + { + "epoch": 0.2928377153218495, + "grad_norm": 2.023101410388226, + "learning_rate": 4.6854034451495925e-05, + "loss": 0.9714, + "step": 646 + }, + { + "epoch": 0.2932910244786945, + "grad_norm": 1.1164339111917163, + "learning_rate": 4.692656391659112e-05, + "loss": 0.9527, + "step": 647 + }, + { + "epoch": 0.2937443336355394, + "grad_norm": 1.9092266421533028, + "learning_rate": 4.699909338168631e-05, + "loss": 0.9579, + "step": 648 + }, + { + "epoch": 0.2941976427923844, + "grad_norm": 1.4251562678565066, + "learning_rate": 4.707162284678151e-05, + "loss": 0.943, + "step": 649 + }, + { + "epoch": 0.29465095194922936, + "grad_norm": 1.643575347582023, + "learning_rate": 4.71441523118767e-05, + "loss": 0.9401, + "step": 650 + }, + { + "epoch": 0.29510426110607435, + "grad_norm": 1.427653154543193, + "learning_rate": 4.72166817769719e-05, + "loss": 0.9246, + "step": 651 + }, + { + "epoch": 0.2955575702629193, + "grad_norm": 1.6420314732057328, + "learning_rate": 4.728921124206709e-05, + "loss": 0.9473, + "step": 652 + }, + { + "epoch": 0.2960108794197643, + "grad_norm": 1.5372403835920674, + "learning_rate": 4.736174070716229e-05, + "loss": 0.9394, + "step": 653 + }, + { + "epoch": 0.2964641885766092, + "grad_norm": 1.443002825729381, + "learning_rate": 4.743427017225748e-05, + "loss": 0.9411, + "step": 654 + }, + { + "epoch": 0.2969174977334542, + "grad_norm": 1.383833767613978, + "learning_rate": 4.750679963735268e-05, + "loss": 0.9527, + "step": 655 + }, + { + "epoch": 0.2973708068902992, + "grad_norm": 1.2672921924395384, + "learning_rate": 4.7579329102447876e-05, + "loss": 0.9497, + "step": 656 + }, + { + "epoch": 0.29782411604714415, + "grad_norm": 1.322260122751291, + "learning_rate": 4.7651858567543065e-05, + "loss": 0.9572, + "step": 657 + }, + { + "epoch": 0.29827742520398914, + "grad_norm": 1.4198954924053768, + "learning_rate": 4.772438803263827e-05, + "loss": 0.9245, + "step": 658 + }, + { + "epoch": 0.2987307343608341, + "grad_norm": 1.6074043812265746, + "learning_rate": 4.7796917497733456e-05, + "loss": 0.9393, + "step": 659 + }, + { + "epoch": 0.2991840435176791, + "grad_norm": 1.3889537307043998, + "learning_rate": 4.786944696282866e-05, + "loss": 0.947, + "step": 660 + }, + { + "epoch": 0.299637352674524, + "grad_norm": 1.4247655267517096, + "learning_rate": 4.794197642792385e-05, + "loss": 0.9405, + "step": 661 + }, + { + "epoch": 0.300090661831369, + "grad_norm": 1.20279970873019, + "learning_rate": 4.801450589301904e-05, + "loss": 0.9587, + "step": 662 + }, + { + "epoch": 0.30054397098821395, + "grad_norm": 1.3786019714696676, + "learning_rate": 4.808703535811424e-05, + "loss": 0.9411, + "step": 663 + }, + { + "epoch": 0.30099728014505894, + "grad_norm": 1.929617923565853, + "learning_rate": 4.8159564823209434e-05, + "loss": 0.9344, + "step": 664 + }, + { + "epoch": 0.3014505893019039, + "grad_norm": 1.0067407455516777, + "learning_rate": 4.823209428830462e-05, + "loss": 0.9178, + "step": 665 + }, + { + "epoch": 0.3019038984587489, + "grad_norm": 1.549228113666535, + "learning_rate": 4.8304623753399826e-05, + "loss": 0.9343, + "step": 666 + }, + { + "epoch": 0.3023572076155938, + "grad_norm": 0.7627259150470267, + "learning_rate": 4.8377153218495015e-05, + "loss": 0.9463, + "step": 667 + }, + { + "epoch": 0.3028105167724388, + "grad_norm": 1.118071455001717, + "learning_rate": 4.844968268359021e-05, + "loss": 0.9403, + "step": 668 + }, + { + "epoch": 0.30326382592928375, + "grad_norm": 2.085374202560522, + "learning_rate": 4.8522212148685406e-05, + "loss": 0.951, + "step": 669 + }, + { + "epoch": 0.30371713508612874, + "grad_norm": 1.4349210170197022, + "learning_rate": 4.85947416137806e-05, + "loss": 0.9274, + "step": 670 + }, + { + "epoch": 0.3041704442429737, + "grad_norm": 1.5069733806820143, + "learning_rate": 4.866727107887579e-05, + "loss": 0.9479, + "step": 671 + }, + { + "epoch": 0.3046237533998187, + "grad_norm": 1.7533379438880692, + "learning_rate": 4.873980054397099e-05, + "loss": 0.9279, + "step": 672 + }, + { + "epoch": 0.30507706255666367, + "grad_norm": 1.5943690943812319, + "learning_rate": 4.881233000906619e-05, + "loss": 0.9211, + "step": 673 + }, + { + "epoch": 0.3055303717135086, + "grad_norm": 1.8387553355893573, + "learning_rate": 4.8884859474161385e-05, + "loss": 0.9294, + "step": 674 + }, + { + "epoch": 0.3059836808703536, + "grad_norm": 1.1760719368516273, + "learning_rate": 4.895738893925658e-05, + "loss": 0.9575, + "step": 675 + }, + { + "epoch": 0.30643699002719854, + "grad_norm": 1.4345071390854536, + "learning_rate": 4.902991840435177e-05, + "loss": 0.9344, + "step": 676 + }, + { + "epoch": 0.30689029918404354, + "grad_norm": 1.2391889309053328, + "learning_rate": 4.910244786944697e-05, + "loss": 0.9552, + "step": 677 + }, + { + "epoch": 0.3073436083408885, + "grad_norm": 2.318642027700126, + "learning_rate": 4.917497733454216e-05, + "loss": 0.9248, + "step": 678 + }, + { + "epoch": 0.30779691749773347, + "grad_norm": 1.2668720948466181, + "learning_rate": 4.9247506799637356e-05, + "loss": 0.9259, + "step": 679 + }, + { + "epoch": 0.3082502266545784, + "grad_norm": 2.398702632279838, + "learning_rate": 4.932003626473255e-05, + "loss": 0.9419, + "step": 680 + }, + { + "epoch": 0.3087035358114234, + "grad_norm": 2.0310362041673673, + "learning_rate": 4.939256572982775e-05, + "loss": 0.9293, + "step": 681 + }, + { + "epoch": 0.30915684496826834, + "grad_norm": 1.4453520499587376, + "learning_rate": 4.946509519492294e-05, + "loss": 0.9602, + "step": 682 + }, + { + "epoch": 0.30961015412511333, + "grad_norm": 2.011099408055547, + "learning_rate": 4.953762466001814e-05, + "loss": 0.9286, + "step": 683 + }, + { + "epoch": 0.3100634632819583, + "grad_norm": 1.2684667838083392, + "learning_rate": 4.961015412511333e-05, + "loss": 0.9299, + "step": 684 + }, + { + "epoch": 0.31051677243880327, + "grad_norm": 1.5747713645808834, + "learning_rate": 4.9682683590208524e-05, + "loss": 0.9476, + "step": 685 + }, + { + "epoch": 0.3109700815956482, + "grad_norm": 1.123407952679056, + "learning_rate": 4.975521305530372e-05, + "loss": 0.9522, + "step": 686 + }, + { + "epoch": 0.3114233907524932, + "grad_norm": 1.597207251130449, + "learning_rate": 4.9827742520398915e-05, + "loss": 0.956, + "step": 687 + }, + { + "epoch": 0.3118766999093382, + "grad_norm": 1.6820258157634909, + "learning_rate": 4.990027198549412e-05, + "loss": 0.9559, + "step": 688 + }, + { + "epoch": 0.31233000906618313, + "grad_norm": 1.4018764858507986, + "learning_rate": 4.9972801450589307e-05, + "loss": 0.9442, + "step": 689 + }, + { + "epoch": 0.31278331822302813, + "grad_norm": 1.6131708884834055, + "learning_rate": 5.00453309156845e-05, + "loss": 0.9326, + "step": 690 + }, + { + "epoch": 0.31323662737987307, + "grad_norm": 1.2462099631286407, + "learning_rate": 5.01178603807797e-05, + "loss": 0.9487, + "step": 691 + }, + { + "epoch": 0.31368993653671806, + "grad_norm": 1.5208327473941317, + "learning_rate": 5.0190389845874894e-05, + "loss": 0.9424, + "step": 692 + }, + { + "epoch": 0.314143245693563, + "grad_norm": 1.4826036015931, + "learning_rate": 5.026291931097008e-05, + "loss": 0.9336, + "step": 693 + }, + { + "epoch": 0.314596554850408, + "grad_norm": 1.0222214380788672, + "learning_rate": 5.0335448776065285e-05, + "loss": 0.9289, + "step": 694 + }, + { + "epoch": 0.31504986400725293, + "grad_norm": 2.2856275746771444, + "learning_rate": 5.0407978241160474e-05, + "loss": 0.9511, + "step": 695 + }, + { + "epoch": 0.3155031731640979, + "grad_norm": 1.606043549452128, + "learning_rate": 5.048050770625567e-05, + "loss": 0.968, + "step": 696 + }, + { + "epoch": 0.31595648232094287, + "grad_norm": 1.8605764784677907, + "learning_rate": 5.0553037171350865e-05, + "loss": 0.9382, + "step": 697 + }, + { + "epoch": 0.31640979147778786, + "grad_norm": 1.9170253947578217, + "learning_rate": 5.062556663644606e-05, + "loss": 0.9168, + "step": 698 + }, + { + "epoch": 0.3168631006346328, + "grad_norm": 1.4747125669832035, + "learning_rate": 5.069809610154125e-05, + "loss": 0.9577, + "step": 699 + }, + { + "epoch": 0.3173164097914778, + "grad_norm": 1.1456782744336196, + "learning_rate": 5.077062556663645e-05, + "loss": 0.9277, + "step": 700 + }, + { + "epoch": 0.31776971894832273, + "grad_norm": 2.4945305919427265, + "learning_rate": 5.084315503173164e-05, + "loss": 0.9382, + "step": 701 + }, + { + "epoch": 0.3182230281051677, + "grad_norm": 1.7506456691391596, + "learning_rate": 5.0915684496826844e-05, + "loss": 0.9247, + "step": 702 + }, + { + "epoch": 0.31867633726201267, + "grad_norm": 1.9290101775128465, + "learning_rate": 5.098821396192203e-05, + "loss": 0.9278, + "step": 703 + }, + { + "epoch": 0.31912964641885766, + "grad_norm": 1.943948849995102, + "learning_rate": 5.106074342701723e-05, + "loss": 0.9409, + "step": 704 + }, + { + "epoch": 0.31958295557570265, + "grad_norm": 1.0925635074806124, + "learning_rate": 5.113327289211243e-05, + "loss": 0.9767, + "step": 705 + }, + { + "epoch": 0.3200362647325476, + "grad_norm": 2.7443835444337688, + "learning_rate": 5.120580235720762e-05, + "loss": 0.9459, + "step": 706 + }, + { + "epoch": 0.3204895738893926, + "grad_norm": 1.6399571873406757, + "learning_rate": 5.1278331822302816e-05, + "loss": 0.9416, + "step": 707 + }, + { + "epoch": 0.3209428830462375, + "grad_norm": 2.772354970587644, + "learning_rate": 5.135086128739801e-05, + "loss": 0.9653, + "step": 708 + }, + { + "epoch": 0.3213961922030825, + "grad_norm": 2.414772295833044, + "learning_rate": 5.142339075249321e-05, + "loss": 0.9551, + "step": 709 + }, + { + "epoch": 0.32184950135992746, + "grad_norm": 2.060630552139286, + "learning_rate": 5.1495920217588396e-05, + "loss": 0.9738, + "step": 710 + }, + { + "epoch": 0.32230281051677245, + "grad_norm": 2.2322406420972327, + "learning_rate": 5.15684496826836e-05, + "loss": 0.9462, + "step": 711 + }, + { + "epoch": 0.3227561196736174, + "grad_norm": 1.5836345924954258, + "learning_rate": 5.164097914777879e-05, + "loss": 0.9437, + "step": 712 + }, + { + "epoch": 0.3232094288304624, + "grad_norm": 2.456757066423452, + "learning_rate": 5.171350861287398e-05, + "loss": 0.9554, + "step": 713 + }, + { + "epoch": 0.3236627379873073, + "grad_norm": 1.694641394167381, + "learning_rate": 5.178603807796918e-05, + "loss": 0.947, + "step": 714 + }, + { + "epoch": 0.3241160471441523, + "grad_norm": 2.243767862017921, + "learning_rate": 5.1858567543064374e-05, + "loss": 0.91, + "step": 715 + }, + { + "epoch": 0.32456935630099726, + "grad_norm": 1.872888216798527, + "learning_rate": 5.193109700815956e-05, + "loss": 0.936, + "step": 716 + }, + { + "epoch": 0.32502266545784225, + "grad_norm": 1.6499139879123348, + "learning_rate": 5.2003626473254766e-05, + "loss": 0.9561, + "step": 717 + }, + { + "epoch": 0.3254759746146872, + "grad_norm": 14.27191138969359, + "learning_rate": 5.2076155938349955e-05, + "loss": 0.9815, + "step": 718 + }, + { + "epoch": 0.3259292837715322, + "grad_norm": 4.906331644511764, + "learning_rate": 5.214868540344516e-05, + "loss": 0.979, + "step": 719 + }, + { + "epoch": 0.3263825929283772, + "grad_norm": 2.70976793228864, + "learning_rate": 5.222121486854035e-05, + "loss": 0.9663, + "step": 720 + }, + { + "epoch": 0.3268359020852221, + "grad_norm": 3.5571947908840644, + "learning_rate": 5.229374433363554e-05, + "loss": 0.9513, + "step": 721 + }, + { + "epoch": 0.3272892112420671, + "grad_norm": 3.6818530706869774, + "learning_rate": 5.2366273798730744e-05, + "loss": 0.9455, + "step": 722 + }, + { + "epoch": 0.32774252039891205, + "grad_norm": 2.645555123605198, + "learning_rate": 5.243880326382593e-05, + "loss": 0.965, + "step": 723 + }, + { + "epoch": 0.32819582955575705, + "grad_norm": 2.784085621744387, + "learning_rate": 5.251133272892113e-05, + "loss": 0.9533, + "step": 724 + }, + { + "epoch": 0.328649138712602, + "grad_norm": 2.3024027886423517, + "learning_rate": 5.2583862194016325e-05, + "loss": 0.9354, + "step": 725 + }, + { + "epoch": 0.329102447869447, + "grad_norm": 2.579127658747222, + "learning_rate": 5.265639165911152e-05, + "loss": 0.9397, + "step": 726 + }, + { + "epoch": 0.3295557570262919, + "grad_norm": 1.2708395013442761, + "learning_rate": 5.272892112420671e-05, + "loss": 0.9657, + "step": 727 + }, + { + "epoch": 0.3300090661831369, + "grad_norm": 1.9629383819187782, + "learning_rate": 5.280145058930191e-05, + "loss": 0.952, + "step": 728 + }, + { + "epoch": 0.33046237533998185, + "grad_norm": 1.8655648944363101, + "learning_rate": 5.28739800543971e-05, + "loss": 0.953, + "step": 729 + }, + { + "epoch": 0.33091568449682685, + "grad_norm": 1.2396388000732834, + "learning_rate": 5.2946509519492296e-05, + "loss": 0.9355, + "step": 730 + }, + { + "epoch": 0.3313689936536718, + "grad_norm": 1.8809795764883877, + "learning_rate": 5.301903898458749e-05, + "loss": 0.9526, + "step": 731 + }, + { + "epoch": 0.3318223028105168, + "grad_norm": 1.3191286502350539, + "learning_rate": 5.309156844968269e-05, + "loss": 0.9319, + "step": 732 + }, + { + "epoch": 0.3322756119673617, + "grad_norm": 1.9798303247571511, + "learning_rate": 5.3164097914777877e-05, + "loss": 0.9374, + "step": 733 + }, + { + "epoch": 0.3327289211242067, + "grad_norm": 1.2759488480503673, + "learning_rate": 5.323662737987308e-05, + "loss": 0.965, + "step": 734 + }, + { + "epoch": 0.33318223028105165, + "grad_norm": 1.8322738341727243, + "learning_rate": 5.330915684496827e-05, + "loss": 0.963, + "step": 735 + }, + { + "epoch": 0.33363553943789664, + "grad_norm": 1.6352896069820073, + "learning_rate": 5.338168631006347e-05, + "loss": 0.9737, + "step": 736 + }, + { + "epoch": 0.33408884859474164, + "grad_norm": 1.3918609316239046, + "learning_rate": 5.3454215775158666e-05, + "loss": 0.9438, + "step": 737 + }, + { + "epoch": 0.3345421577515866, + "grad_norm": 1.3228035517395398, + "learning_rate": 5.3526745240253855e-05, + "loss": 0.9208, + "step": 738 + }, + { + "epoch": 0.33499546690843157, + "grad_norm": 1.6255087275273532, + "learning_rate": 5.359927470534906e-05, + "loss": 0.9404, + "step": 739 + }, + { + "epoch": 0.3354487760652765, + "grad_norm": 1.3172359407571101, + "learning_rate": 5.3671804170444246e-05, + "loss": 0.941, + "step": 740 + }, + { + "epoch": 0.3359020852221215, + "grad_norm": 1.6297905344371948, + "learning_rate": 5.374433363553944e-05, + "loss": 0.9437, + "step": 741 + }, + { + "epoch": 0.33635539437896644, + "grad_norm": 1.3056419101852335, + "learning_rate": 5.381686310063464e-05, + "loss": 0.9404, + "step": 742 + }, + { + "epoch": 0.33680870353581144, + "grad_norm": 1.3377445143115474, + "learning_rate": 5.3889392565729834e-05, + "loss": 0.9263, + "step": 743 + }, + { + "epoch": 0.3372620126926564, + "grad_norm": 1.001597063894214, + "learning_rate": 5.396192203082502e-05, + "loss": 0.9271, + "step": 744 + }, + { + "epoch": 0.33771532184950137, + "grad_norm": 1.871707356585933, + "learning_rate": 5.4034451495920225e-05, + "loss": 0.9371, + "step": 745 + }, + { + "epoch": 0.3381686310063463, + "grad_norm": 1.4830158478531945, + "learning_rate": 5.4106980961015414e-05, + "loss": 0.9522, + "step": 746 + }, + { + "epoch": 0.3386219401631913, + "grad_norm": 1.6566586212983272, + "learning_rate": 5.4179510426110616e-05, + "loss": 0.9524, + "step": 747 + }, + { + "epoch": 0.33907524932003624, + "grad_norm": 0.9045872877310464, + "learning_rate": 5.4252039891205805e-05, + "loss": 0.9213, + "step": 748 + }, + { + "epoch": 0.33952855847688124, + "grad_norm": 2.151902261591363, + "learning_rate": 5.4324569356301e-05, + "loss": 0.9256, + "step": 749 + }, + { + "epoch": 0.3399818676337262, + "grad_norm": 1.4616099276475119, + "learning_rate": 5.439709882139619e-05, + "loss": 0.9431, + "step": 750 + }, + { + "epoch": 0.34043517679057117, + "grad_norm": 1.5446030109830196, + "learning_rate": 5.446962828649139e-05, + "loss": 0.9301, + "step": 751 + }, + { + "epoch": 0.34088848594741616, + "grad_norm": 1.7241557764299809, + "learning_rate": 5.454215775158659e-05, + "loss": 0.9531, + "step": 752 + }, + { + "epoch": 0.3413417951042611, + "grad_norm": 1.2094845365091573, + "learning_rate": 5.4614687216681784e-05, + "loss": 0.9412, + "step": 753 + }, + { + "epoch": 0.3417951042611061, + "grad_norm": 1.5959627438521686, + "learning_rate": 5.468721668177698e-05, + "loss": 0.9491, + "step": 754 + }, + { + "epoch": 0.34224841341795104, + "grad_norm": 1.6644343552917442, + "learning_rate": 5.475974614687217e-05, + "loss": 0.933, + "step": 755 + }, + { + "epoch": 0.34270172257479603, + "grad_norm": 1.3011371533464957, + "learning_rate": 5.483227561196737e-05, + "loss": 0.9344, + "step": 756 + }, + { + "epoch": 0.34315503173164097, + "grad_norm": 1.1195681959219173, + "learning_rate": 5.490480507706256e-05, + "loss": 0.9609, + "step": 757 + }, + { + "epoch": 0.34360834088848596, + "grad_norm": 1.8164413707936073, + "learning_rate": 5.4977334542157755e-05, + "loss": 0.9417, + "step": 758 + }, + { + "epoch": 0.3440616500453309, + "grad_norm": 1.1700058888793645, + "learning_rate": 5.504986400725295e-05, + "loss": 0.9279, + "step": 759 + }, + { + "epoch": 0.3445149592021759, + "grad_norm": 1.8051358340955026, + "learning_rate": 5.512239347234815e-05, + "loss": 0.9457, + "step": 760 + }, + { + "epoch": 0.34496826835902084, + "grad_norm": 1.6711570478921167, + "learning_rate": 5.5194922937443336e-05, + "loss": 0.9456, + "step": 761 + }, + { + "epoch": 0.34542157751586583, + "grad_norm": 1.3647272668717862, + "learning_rate": 5.526745240253854e-05, + "loss": 0.9558, + "step": 762 + }, + { + "epoch": 0.34587488667271077, + "grad_norm": 1.9918458135029675, + "learning_rate": 5.533998186763373e-05, + "loss": 0.9307, + "step": 763 + }, + { + "epoch": 0.34632819582955576, + "grad_norm": 1.5588487059053593, + "learning_rate": 5.541251133272893e-05, + "loss": 0.9268, + "step": 764 + }, + { + "epoch": 0.3467815049864007, + "grad_norm": 1.703386531274259, + "learning_rate": 5.548504079782412e-05, + "loss": 0.9188, + "step": 765 + }, + { + "epoch": 0.3472348141432457, + "grad_norm": 1.2934814450755638, + "learning_rate": 5.5557570262919314e-05, + "loss": 0.9598, + "step": 766 + }, + { + "epoch": 0.34768812330009063, + "grad_norm": 1.2928405130100107, + "learning_rate": 5.56300997280145e-05, + "loss": 0.9512, + "step": 767 + }, + { + "epoch": 0.34814143245693563, + "grad_norm": 1.2435153039805282, + "learning_rate": 5.5702629193109706e-05, + "loss": 0.9217, + "step": 768 + }, + { + "epoch": 0.3485947416137806, + "grad_norm": 1.3857379328018322, + "learning_rate": 5.57751586582049e-05, + "loss": 0.9231, + "step": 769 + }, + { + "epoch": 0.34904805077062556, + "grad_norm": 1.9927241506804212, + "learning_rate": 5.58476881233001e-05, + "loss": 0.9254, + "step": 770 + }, + { + "epoch": 0.34950135992747056, + "grad_norm": 1.2773892745695101, + "learning_rate": 5.592021758839529e-05, + "loss": 0.9343, + "step": 771 + }, + { + "epoch": 0.3499546690843155, + "grad_norm": 1.1659303152428873, + "learning_rate": 5.599274705349048e-05, + "loss": 0.9351, + "step": 772 + }, + { + "epoch": 0.3504079782411605, + "grad_norm": 1.7074661165938192, + "learning_rate": 5.6065276518585684e-05, + "loss": 0.9458, + "step": 773 + }, + { + "epoch": 0.35086128739800543, + "grad_norm": 1.2843992359116825, + "learning_rate": 5.613780598368087e-05, + "loss": 0.9228, + "step": 774 + }, + { + "epoch": 0.3513145965548504, + "grad_norm": 1.7481393842284374, + "learning_rate": 5.6210335448776075e-05, + "loss": 0.9444, + "step": 775 + }, + { + "epoch": 0.35176790571169536, + "grad_norm": 1.4815623776744518, + "learning_rate": 5.6282864913871264e-05, + "loss": 0.9576, + "step": 776 + }, + { + "epoch": 0.35222121486854036, + "grad_norm": 1.2764403249645446, + "learning_rate": 5.635539437896646e-05, + "loss": 0.9382, + "step": 777 + }, + { + "epoch": 0.3526745240253853, + "grad_norm": 1.4924814269355533, + "learning_rate": 5.642792384406165e-05, + "loss": 0.9512, + "step": 778 + }, + { + "epoch": 0.3531278331822303, + "grad_norm": 1.402499178070738, + "learning_rate": 5.650045330915685e-05, + "loss": 0.9368, + "step": 779 + }, + { + "epoch": 0.3535811423390752, + "grad_norm": 1.237132969496704, + "learning_rate": 5.657298277425204e-05, + "loss": 0.9425, + "step": 780 + }, + { + "epoch": 0.3540344514959202, + "grad_norm": 1.9800239458870281, + "learning_rate": 5.664551223934724e-05, + "loss": 0.9427, + "step": 781 + }, + { + "epoch": 0.35448776065276516, + "grad_norm": 1.437096883657844, + "learning_rate": 5.671804170444243e-05, + "loss": 0.9463, + "step": 782 + }, + { + "epoch": 0.35494106980961015, + "grad_norm": 1.217399965920037, + "learning_rate": 5.679057116953763e-05, + "loss": 0.936, + "step": 783 + }, + { + "epoch": 0.35539437896645515, + "grad_norm": 1.413852336694359, + "learning_rate": 5.686310063463283e-05, + "loss": 0.9411, + "step": 784 + }, + { + "epoch": 0.3558476881233001, + "grad_norm": 1.8809740802111852, + "learning_rate": 5.693563009972802e-05, + "loss": 0.9372, + "step": 785 + }, + { + "epoch": 0.3563009972801451, + "grad_norm": 1.2321508482998587, + "learning_rate": 5.7008159564823215e-05, + "loss": 0.9542, + "step": 786 + }, + { + "epoch": 0.35675430643699, + "grad_norm": 1.7736862433223881, + "learning_rate": 5.708068902991841e-05, + "loss": 0.94, + "step": 787 + }, + { + "epoch": 0.357207615593835, + "grad_norm": 0.9134031731076103, + "learning_rate": 5.7153218495013606e-05, + "loss": 0.9347, + "step": 788 + }, + { + "epoch": 0.35766092475067995, + "grad_norm": 2.252952662307836, + "learning_rate": 5.7225747960108795e-05, + "loss": 0.952, + "step": 789 + }, + { + "epoch": 0.35811423390752495, + "grad_norm": 1.3692228101803525, + "learning_rate": 5.7298277425204e-05, + "loss": 0.9355, + "step": 790 + }, + { + "epoch": 0.3585675430643699, + "grad_norm": 2.189743158739513, + "learning_rate": 5.7370806890299186e-05, + "loss": 0.9489, + "step": 791 + }, + { + "epoch": 0.3590208522212149, + "grad_norm": 1.9551980650289926, + "learning_rate": 5.744333635539439e-05, + "loss": 0.9664, + "step": 792 + }, + { + "epoch": 0.3594741613780598, + "grad_norm": 1.7212370425969903, + "learning_rate": 5.751586582048958e-05, + "loss": 0.9241, + "step": 793 + }, + { + "epoch": 0.3599274705349048, + "grad_norm": 1.6201884833517672, + "learning_rate": 5.7588395285584773e-05, + "loss": 0.9431, + "step": 794 + }, + { + "epoch": 0.36038077969174975, + "grad_norm": 1.709171079486392, + "learning_rate": 5.766092475067996e-05, + "loss": 0.9317, + "step": 795 + }, + { + "epoch": 0.36083408884859475, + "grad_norm": 1.4776356790789862, + "learning_rate": 5.7733454215775165e-05, + "loss": 0.9312, + "step": 796 + }, + { + "epoch": 0.3612873980054397, + "grad_norm": 1.5880136309183253, + "learning_rate": 5.7805983680870354e-05, + "loss": 0.9222, + "step": 797 + }, + { + "epoch": 0.3617407071622847, + "grad_norm": 1.082206323201442, + "learning_rate": 5.7878513145965556e-05, + "loss": 0.9551, + "step": 798 + }, + { + "epoch": 0.3621940163191296, + "grad_norm": 1.864204107029482, + "learning_rate": 5.7951042611060745e-05, + "loss": 0.943, + "step": 799 + }, + { + "epoch": 0.3626473254759746, + "grad_norm": 1.4318810249564151, + "learning_rate": 5.802357207615594e-05, + "loss": 0.9451, + "step": 800 + }, + { + "epoch": 0.3631006346328196, + "grad_norm": 1.7663768843759915, + "learning_rate": 5.809610154125114e-05, + "loss": 0.9478, + "step": 801 + }, + { + "epoch": 0.36355394378966455, + "grad_norm": 1.714925029259117, + "learning_rate": 5.816863100634633e-05, + "loss": 0.9627, + "step": 802 + }, + { + "epoch": 0.36400725294650954, + "grad_norm": 1.4279510579227686, + "learning_rate": 5.8241160471441535e-05, + "loss": 0.9544, + "step": 803 + }, + { + "epoch": 0.3644605621033545, + "grad_norm": 1.2960248997622423, + "learning_rate": 5.8313689936536724e-05, + "loss": 0.9186, + "step": 804 + }, + { + "epoch": 0.3649138712601995, + "grad_norm": 1.6499717232077822, + "learning_rate": 5.838621940163192e-05, + "loss": 0.9447, + "step": 805 + }, + { + "epoch": 0.3653671804170444, + "grad_norm": 1.1813548839874675, + "learning_rate": 5.845874886672711e-05, + "loss": 0.9348, + "step": 806 + }, + { + "epoch": 0.3658204895738894, + "grad_norm": 1.7394480002465147, + "learning_rate": 5.853127833182231e-05, + "loss": 0.9454, + "step": 807 + }, + { + "epoch": 0.36627379873073435, + "grad_norm": 1.312484754311586, + "learning_rate": 5.86038077969175e-05, + "loss": 0.9315, + "step": 808 + }, + { + "epoch": 0.36672710788757934, + "grad_norm": 1.6765711002743922, + "learning_rate": 5.86763372620127e-05, + "loss": 0.9535, + "step": 809 + }, + { + "epoch": 0.3671804170444243, + "grad_norm": 1.381483561502983, + "learning_rate": 5.874886672710789e-05, + "loss": 0.9387, + "step": 810 + }, + { + "epoch": 0.3676337262012693, + "grad_norm": 1.502672669938309, + "learning_rate": 5.882139619220309e-05, + "loss": 0.9515, + "step": 811 + }, + { + "epoch": 0.3680870353581142, + "grad_norm": 1.8710689693066422, + "learning_rate": 5.8893925657298276e-05, + "loss": 0.9355, + "step": 812 + }, + { + "epoch": 0.3685403445149592, + "grad_norm": 1.1328187776829437, + "learning_rate": 5.896645512239348e-05, + "loss": 0.944, + "step": 813 + }, + { + "epoch": 0.36899365367180414, + "grad_norm": 1.3399127517108294, + "learning_rate": 5.903898458748867e-05, + "loss": 0.9397, + "step": 814 + }, + { + "epoch": 0.36944696282864914, + "grad_norm": 2.009219902389139, + "learning_rate": 5.911151405258387e-05, + "loss": 0.934, + "step": 815 + }, + { + "epoch": 0.36990027198549413, + "grad_norm": 1.4688141732459803, + "learning_rate": 5.9184043517679065e-05, + "loss": 0.9458, + "step": 816 + }, + { + "epoch": 0.37035358114233907, + "grad_norm": 1.3381098366428306, + "learning_rate": 5.9256572982774254e-05, + "loss": 0.9442, + "step": 817 + }, + { + "epoch": 0.37080689029918407, + "grad_norm": 1.3616784643936284, + "learning_rate": 5.9329102447869457e-05, + "loss": 0.944, + "step": 818 + }, + { + "epoch": 0.371260199456029, + "grad_norm": 1.4096871244479137, + "learning_rate": 5.9401631912964645e-05, + "loss": 0.94, + "step": 819 + }, + { + "epoch": 0.371713508612874, + "grad_norm": 1.7074436844594758, + "learning_rate": 5.947416137805985e-05, + "loss": 0.9417, + "step": 820 + }, + { + "epoch": 0.37216681776971894, + "grad_norm": 1.7678346549776505, + "learning_rate": 5.954669084315504e-05, + "loss": 0.9286, + "step": 821 + }, + { + "epoch": 0.37262012692656393, + "grad_norm": 1.1637863567745292, + "learning_rate": 5.961922030825023e-05, + "loss": 0.9511, + "step": 822 + }, + { + "epoch": 0.37307343608340887, + "grad_norm": 1.7514825295503125, + "learning_rate": 5.969174977334542e-05, + "loss": 0.9473, + "step": 823 + }, + { + "epoch": 0.37352674524025387, + "grad_norm": 1.2437716011129283, + "learning_rate": 5.9764279238440624e-05, + "loss": 0.9224, + "step": 824 + }, + { + "epoch": 0.3739800543970988, + "grad_norm": 1.7196545486493893, + "learning_rate": 5.983680870353581e-05, + "loss": 0.9404, + "step": 825 + }, + { + "epoch": 0.3744333635539438, + "grad_norm": 1.8427541021855762, + "learning_rate": 5.9909338168631015e-05, + "loss": 0.9276, + "step": 826 + }, + { + "epoch": 0.37488667271078874, + "grad_norm": 1.1210243295638511, + "learning_rate": 5.9981867633726204e-05, + "loss": 0.9356, + "step": 827 + }, + { + "epoch": 0.37533998186763373, + "grad_norm": 1.424003799396712, + "learning_rate": 6.00543970988214e-05, + "loss": 0.9593, + "step": 828 + }, + { + "epoch": 0.37579329102447867, + "grad_norm": 1.1238202116507956, + "learning_rate": 6.012692656391659e-05, + "loss": 0.9162, + "step": 829 + }, + { + "epoch": 0.37624660018132366, + "grad_norm": 2.026538280877823, + "learning_rate": 6.019945602901179e-05, + "loss": 0.9294, + "step": 830 + }, + { + "epoch": 0.3766999093381686, + "grad_norm": 1.2014594325435937, + "learning_rate": 6.027198549410698e-05, + "loss": 0.9257, + "step": 831 + }, + { + "epoch": 0.3771532184950136, + "grad_norm": 2.0270748289882388, + "learning_rate": 6.034451495920218e-05, + "loss": 0.9555, + "step": 832 + }, + { + "epoch": 0.3776065276518586, + "grad_norm": 1.2228762637781396, + "learning_rate": 6.041704442429738e-05, + "loss": 0.9481, + "step": 833 + }, + { + "epoch": 0.37805983680870353, + "grad_norm": 1.4314068051907045, + "learning_rate": 6.048957388939257e-05, + "loss": 0.9441, + "step": 834 + }, + { + "epoch": 0.3785131459655485, + "grad_norm": 1.3228392319424185, + "learning_rate": 6.056210335448777e-05, + "loss": 0.9312, + "step": 835 + }, + { + "epoch": 0.37896645512239346, + "grad_norm": 1.2413311415999304, + "learning_rate": 6.063463281958296e-05, + "loss": 0.9141, + "step": 836 + }, + { + "epoch": 0.37941976427923846, + "grad_norm": 1.3113889397002814, + "learning_rate": 6.070716228467816e-05, + "loss": 0.9445, + "step": 837 + }, + { + "epoch": 0.3798730734360834, + "grad_norm": 2.017958389580416, + "learning_rate": 6.077969174977335e-05, + "loss": 0.9346, + "step": 838 + }, + { + "epoch": 0.3803263825929284, + "grad_norm": 1.2966653428498385, + "learning_rate": 6.0852221214868546e-05, + "loss": 0.9323, + "step": 839 + }, + { + "epoch": 0.38077969174977333, + "grad_norm": 1.175181285226416, + "learning_rate": 6.0924750679963735e-05, + "loss": 0.9346, + "step": 840 + }, + { + "epoch": 0.3812330009066183, + "grad_norm": 2.021480541116888, + "learning_rate": 6.099728014505894e-05, + "loss": 0.9446, + "step": 841 + }, + { + "epoch": 0.38168631006346326, + "grad_norm": 1.5891051212701277, + "learning_rate": 6.106980961015413e-05, + "loss": 0.9231, + "step": 842 + }, + { + "epoch": 0.38213961922030826, + "grad_norm": 1.064099475896568, + "learning_rate": 6.114233907524933e-05, + "loss": 0.9452, + "step": 843 + }, + { + "epoch": 0.3825929283771532, + "grad_norm": 2.7015999552489935, + "learning_rate": 6.121486854034452e-05, + "loss": 0.9431, + "step": 844 + }, + { + "epoch": 0.3830462375339982, + "grad_norm": 1.7418737344370814, + "learning_rate": 6.128739800543972e-05, + "loss": 0.9238, + "step": 845 + }, + { + "epoch": 0.38349954669084313, + "grad_norm": 2.627607625135701, + "learning_rate": 6.135992747053491e-05, + "loss": 0.9571, + "step": 846 + }, + { + "epoch": 0.3839528558476881, + "grad_norm": 2.54241893808046, + "learning_rate": 6.143245693563011e-05, + "loss": 0.9514, + "step": 847 + }, + { + "epoch": 0.3844061650045331, + "grad_norm": 1.8302121868747863, + "learning_rate": 6.15049864007253e-05, + "loss": 0.933, + "step": 848 + }, + { + "epoch": 0.38485947416137806, + "grad_norm": 1.6333171259177501, + "learning_rate": 6.157751586582049e-05, + "loss": 0.9463, + "step": 849 + }, + { + "epoch": 0.38531278331822305, + "grad_norm": 1.914548102894475, + "learning_rate": 6.165004533091569e-05, + "loss": 0.9507, + "step": 850 + }, + { + "epoch": 0.385766092475068, + "grad_norm": 1.3907372666515196, + "learning_rate": 6.172257479601088e-05, + "loss": 0.9486, + "step": 851 + }, + { + "epoch": 0.386219401631913, + "grad_norm": 2.024963232012732, + "learning_rate": 6.179510426110608e-05, + "loss": 0.9523, + "step": 852 + }, + { + "epoch": 0.3866727107887579, + "grad_norm": 1.2764898812678038, + "learning_rate": 6.186763372620127e-05, + "loss": 0.9403, + "step": 853 + }, + { + "epoch": 0.3871260199456029, + "grad_norm": 1.9531234990880517, + "learning_rate": 6.194016319129647e-05, + "loss": 0.9407, + "step": 854 + }, + { + "epoch": 0.38757932910244786, + "grad_norm": 1.5240135873469607, + "learning_rate": 6.201269265639166e-05, + "loss": 0.9375, + "step": 855 + }, + { + "epoch": 0.38803263825929285, + "grad_norm": 1.8581356343156015, + "learning_rate": 6.208522212148687e-05, + "loss": 0.9165, + "step": 856 + }, + { + "epoch": 0.3884859474161378, + "grad_norm": 1.725383299895511, + "learning_rate": 6.215775158658205e-05, + "loss": 0.9106, + "step": 857 + }, + { + "epoch": 0.3889392565729828, + "grad_norm": 1.58018603371685, + "learning_rate": 6.223028105167724e-05, + "loss": 0.9226, + "step": 858 + }, + { + "epoch": 0.3893925657298277, + "grad_norm": 1.4709829394769822, + "learning_rate": 6.230281051677245e-05, + "loss": 0.9524, + "step": 859 + }, + { + "epoch": 0.3898458748866727, + "grad_norm": 1.5012042692618373, + "learning_rate": 6.237533998186764e-05, + "loss": 0.9279, + "step": 860 + }, + { + "epoch": 0.39029918404351766, + "grad_norm": 1.222271787969785, + "learning_rate": 6.244786944696282e-05, + "loss": 0.9202, + "step": 861 + }, + { + "epoch": 0.39075249320036265, + "grad_norm": 1.4335192520689135, + "learning_rate": 6.252039891205803e-05, + "loss": 0.9341, + "step": 862 + }, + { + "epoch": 0.3912058023572076, + "grad_norm": 1.6968967796136794, + "learning_rate": 6.259292837715322e-05, + "loss": 0.938, + "step": 863 + }, + { + "epoch": 0.3916591115140526, + "grad_norm": 2.033955446949894, + "learning_rate": 6.266545784224842e-05, + "loss": 0.9264, + "step": 864 + }, + { + "epoch": 0.3921124206708976, + "grad_norm": 1.1319894877171877, + "learning_rate": 6.273798730734362e-05, + "loss": 0.91, + "step": 865 + }, + { + "epoch": 0.3925657298277425, + "grad_norm": 1.446720954686739, + "learning_rate": 6.281051677243881e-05, + "loss": 0.9597, + "step": 866 + }, + { + "epoch": 0.3930190389845875, + "grad_norm": 1.1355051979362119, + "learning_rate": 6.288304623753401e-05, + "loss": 0.9239, + "step": 867 + }, + { + "epoch": 0.39347234814143245, + "grad_norm": 1.73059005014981, + "learning_rate": 6.29555757026292e-05, + "loss": 0.9549, + "step": 868 + }, + { + "epoch": 0.39392565729827744, + "grad_norm": 1.35999332508219, + "learning_rate": 6.302810516772439e-05, + "loss": 0.943, + "step": 869 + }, + { + "epoch": 0.3943789664551224, + "grad_norm": 1.4683590015016095, + "learning_rate": 6.310063463281959e-05, + "loss": 0.9381, + "step": 870 + }, + { + "epoch": 0.3948322756119674, + "grad_norm": 1.179008047372574, + "learning_rate": 6.317316409791478e-05, + "loss": 0.9174, + "step": 871 + }, + { + "epoch": 0.3952855847688123, + "grad_norm": 1.7643075715157752, + "learning_rate": 6.324569356300997e-05, + "loss": 0.9233, + "step": 872 + }, + { + "epoch": 0.3957388939256573, + "grad_norm": 1.387620600407311, + "learning_rate": 6.331822302810517e-05, + "loss": 0.943, + "step": 873 + }, + { + "epoch": 0.39619220308250225, + "grad_norm": 1.6175734965492337, + "learning_rate": 6.339075249320036e-05, + "loss": 0.9374, + "step": 874 + }, + { + "epoch": 0.39664551223934724, + "grad_norm": 2.29583331129652, + "learning_rate": 6.346328195829556e-05, + "loss": 0.9313, + "step": 875 + }, + { + "epoch": 0.3970988213961922, + "grad_norm": 0.9501061734503268, + "learning_rate": 6.353581142339075e-05, + "loss": 0.9396, + "step": 876 + }, + { + "epoch": 0.3975521305530372, + "grad_norm": 2.3376966689093863, + "learning_rate": 6.360834088848596e-05, + "loss": 0.9267, + "step": 877 + }, + { + "epoch": 0.3980054397098821, + "grad_norm": 1.6714058145697481, + "learning_rate": 6.368087035358114e-05, + "loss": 0.9337, + "step": 878 + }, + { + "epoch": 0.3984587488667271, + "grad_norm": 1.5636515921010241, + "learning_rate": 6.375339981867635e-05, + "loss": 0.9301, + "step": 879 + }, + { + "epoch": 0.3989120580235721, + "grad_norm": 1.589175417373944, + "learning_rate": 6.382592928377154e-05, + "loss": 0.9354, + "step": 880 + }, + { + "epoch": 0.39936536718041704, + "grad_norm": 1.4358513671945847, + "learning_rate": 6.389845874886674e-05, + "loss": 0.9318, + "step": 881 + }, + { + "epoch": 0.39981867633726204, + "grad_norm": 2.0755915247955485, + "learning_rate": 6.397098821396193e-05, + "loss": 0.9283, + "step": 882 + }, + { + "epoch": 0.400271985494107, + "grad_norm": 1.5438756859296723, + "learning_rate": 6.404351767905712e-05, + "loss": 0.9404, + "step": 883 + }, + { + "epoch": 0.40072529465095197, + "grad_norm": 1.1137385313055184, + "learning_rate": 6.411604714415232e-05, + "loss": 0.9395, + "step": 884 + }, + { + "epoch": 0.4011786038077969, + "grad_norm": 1.9809110569943833, + "learning_rate": 6.418857660924751e-05, + "loss": 0.949, + "step": 885 + }, + { + "epoch": 0.4016319129646419, + "grad_norm": 1.1910777816308293, + "learning_rate": 6.426110607434271e-05, + "loss": 0.9313, + "step": 886 + }, + { + "epoch": 0.40208522212148684, + "grad_norm": 2.1660806740082315, + "learning_rate": 6.43336355394379e-05, + "loss": 0.9405, + "step": 887 + }, + { + "epoch": 0.40253853127833183, + "grad_norm": 1.3323787517285712, + "learning_rate": 6.44061650045331e-05, + "loss": 0.9421, + "step": 888 + }, + { + "epoch": 0.4029918404351768, + "grad_norm": 1.8324767131959123, + "learning_rate": 6.447869446962829e-05, + "loss": 0.9393, + "step": 889 + }, + { + "epoch": 0.40344514959202177, + "grad_norm": 1.465930308979378, + "learning_rate": 6.455122393472349e-05, + "loss": 0.9481, + "step": 890 + }, + { + "epoch": 0.4038984587488667, + "grad_norm": 1.773266930159506, + "learning_rate": 6.462375339981868e-05, + "loss": 0.9232, + "step": 891 + }, + { + "epoch": 0.4043517679057117, + "grad_norm": 1.4240441690792642, + "learning_rate": 6.469628286491388e-05, + "loss": 0.9281, + "step": 892 + }, + { + "epoch": 0.40480507706255664, + "grad_norm": 1.2483571738783068, + "learning_rate": 6.476881233000907e-05, + "loss": 0.9315, + "step": 893 + }, + { + "epoch": 0.40525838621940163, + "grad_norm": 2.3218767888054117, + "learning_rate": 6.484134179510426e-05, + "loss": 0.955, + "step": 894 + }, + { + "epoch": 0.4057116953762466, + "grad_norm": 1.5962884848038854, + "learning_rate": 6.491387126019945e-05, + "loss": 0.9453, + "step": 895 + }, + { + "epoch": 0.40616500453309157, + "grad_norm": 1.9981418057687077, + "learning_rate": 6.498640072529465e-05, + "loss": 0.9514, + "step": 896 + }, + { + "epoch": 0.40661831368993656, + "grad_norm": 1.9826590766827636, + "learning_rate": 6.505893019038986e-05, + "loss": 0.9407, + "step": 897 + }, + { + "epoch": 0.4070716228467815, + "grad_norm": 1.651000508081076, + "learning_rate": 6.513145965548504e-05, + "loss": 0.945, + "step": 898 + }, + { + "epoch": 0.4075249320036265, + "grad_norm": 1.08598493225144, + "learning_rate": 6.520398912058025e-05, + "loss": 0.9412, + "step": 899 + }, + { + "epoch": 0.40797824116047143, + "grad_norm": 1.750476079696457, + "learning_rate": 6.527651858567544e-05, + "loss": 0.9472, + "step": 900 + }, + { + "epoch": 0.4084315503173164, + "grad_norm": 1.4679979289095233, + "learning_rate": 6.534904805077064e-05, + "loss": 0.947, + "step": 901 + }, + { + "epoch": 0.40888485947416137, + "grad_norm": 1.9173282776325855, + "learning_rate": 6.542157751586583e-05, + "loss": 0.9255, + "step": 902 + }, + { + "epoch": 0.40933816863100636, + "grad_norm": 1.4773057222190584, + "learning_rate": 6.549410698096103e-05, + "loss": 0.9321, + "step": 903 + }, + { + "epoch": 0.4097914777878513, + "grad_norm": 1.6709503751089287, + "learning_rate": 6.556663644605622e-05, + "loss": 0.9573, + "step": 904 + }, + { + "epoch": 0.4102447869446963, + "grad_norm": 1.6020648585906692, + "learning_rate": 6.563916591115141e-05, + "loss": 0.919, + "step": 905 + }, + { + "epoch": 0.41069809610154123, + "grad_norm": 1.487934521806381, + "learning_rate": 6.57116953762466e-05, + "loss": 0.9116, + "step": 906 + }, + { + "epoch": 0.4111514052583862, + "grad_norm": 1.3687883772116523, + "learning_rate": 6.57842248413418e-05, + "loss": 0.926, + "step": 907 + }, + { + "epoch": 0.41160471441523117, + "grad_norm": 1.833150663003179, + "learning_rate": 6.585675430643699e-05, + "loss": 0.9363, + "step": 908 + }, + { + "epoch": 0.41205802357207616, + "grad_norm": 1.5368831743113036, + "learning_rate": 6.592928377153219e-05, + "loss": 0.9148, + "step": 909 + }, + { + "epoch": 0.4125113327289211, + "grad_norm": 1.513761134710497, + "learning_rate": 6.600181323662738e-05, + "loss": 0.9261, + "step": 910 + }, + { + "epoch": 0.4129646418857661, + "grad_norm": 1.6763655748813047, + "learning_rate": 6.607434270172258e-05, + "loss": 0.9169, + "step": 911 + }, + { + "epoch": 0.4134179510426111, + "grad_norm": 1.4085450863245177, + "learning_rate": 6.614687216681778e-05, + "loss": 0.9159, + "step": 912 + }, + { + "epoch": 0.413871260199456, + "grad_norm": 1.5639693571864681, + "learning_rate": 6.621940163191297e-05, + "loss": 0.916, + "step": 913 + }, + { + "epoch": 0.414324569356301, + "grad_norm": 1.7572519553089596, + "learning_rate": 6.629193109700816e-05, + "loss": 0.9535, + "step": 914 + }, + { + "epoch": 0.41477787851314596, + "grad_norm": 2.0709414092633525, + "learning_rate": 6.636446056210336e-05, + "loss": 0.9392, + "step": 915 + }, + { + "epoch": 0.41523118766999095, + "grad_norm": 0.7585721348321177, + "learning_rate": 6.643699002719855e-05, + "loss": 0.8992, + "step": 916 + }, + { + "epoch": 0.4156844968268359, + "grad_norm": 2.063707486862334, + "learning_rate": 6.650951949229374e-05, + "loss": 0.9389, + "step": 917 + }, + { + "epoch": 0.4161378059836809, + "grad_norm": 1.8206549581627278, + "learning_rate": 6.658204895738894e-05, + "loss": 0.939, + "step": 918 + }, + { + "epoch": 0.4165911151405258, + "grad_norm": 1.1900306558083373, + "learning_rate": 6.665457842248413e-05, + "loss": 0.9314, + "step": 919 + }, + { + "epoch": 0.4170444242973708, + "grad_norm": 2.187042650518004, + "learning_rate": 6.672710788757934e-05, + "loss": 0.9388, + "step": 920 + }, + { + "epoch": 0.41749773345421576, + "grad_norm": 1.3145749436236447, + "learning_rate": 6.679963735267453e-05, + "loss": 0.9323, + "step": 921 + }, + { + "epoch": 0.41795104261106075, + "grad_norm": 1.609851220728418, + "learning_rate": 6.687216681776973e-05, + "loss": 0.9334, + "step": 922 + }, + { + "epoch": 0.4184043517679057, + "grad_norm": 1.7458796730496562, + "learning_rate": 6.694469628286492e-05, + "loss": 0.9349, + "step": 923 + }, + { + "epoch": 0.4188576609247507, + "grad_norm": 1.2191972068500994, + "learning_rate": 6.701722574796012e-05, + "loss": 0.9386, + "step": 924 + }, + { + "epoch": 0.4193109700815956, + "grad_norm": 2.6586821050446847, + "learning_rate": 6.708975521305531e-05, + "loss": 0.9669, + "step": 925 + }, + { + "epoch": 0.4197642792384406, + "grad_norm": 1.3176141361787714, + "learning_rate": 6.716228467815051e-05, + "loss": 0.9209, + "step": 926 + }, + { + "epoch": 0.42021758839528556, + "grad_norm": 2.8726390473026027, + "learning_rate": 6.72348141432457e-05, + "loss": 0.9523, + "step": 927 + }, + { + "epoch": 0.42067089755213055, + "grad_norm": 2.139370711372781, + "learning_rate": 6.730734360834089e-05, + "loss": 0.9492, + "step": 928 + }, + { + "epoch": 0.42112420670897555, + "grad_norm": 1.7433666846502294, + "learning_rate": 6.737987307343609e-05, + "loss": 0.951, + "step": 929 + }, + { + "epoch": 0.4215775158658205, + "grad_norm": 1.7675896957281527, + "learning_rate": 6.745240253853128e-05, + "loss": 0.9316, + "step": 930 + }, + { + "epoch": 0.4220308250226655, + "grad_norm": 1.855870845526705, + "learning_rate": 6.752493200362648e-05, + "loss": 0.9625, + "step": 931 + }, + { + "epoch": 0.4224841341795104, + "grad_norm": 1.4116875109813614, + "learning_rate": 6.759746146872167e-05, + "loss": 0.9318, + "step": 932 + }, + { + "epoch": 0.4229374433363554, + "grad_norm": 1.473417657477488, + "learning_rate": 6.766999093381687e-05, + "loss": 0.932, + "step": 933 + }, + { + "epoch": 0.42339075249320035, + "grad_norm": 1.9827973039569222, + "learning_rate": 6.774252039891206e-05, + "loss": 0.9521, + "step": 934 + }, + { + "epoch": 0.42384406165004535, + "grad_norm": 1.490915646142894, + "learning_rate": 6.781504986400727e-05, + "loss": 0.9339, + "step": 935 + }, + { + "epoch": 0.4242973708068903, + "grad_norm": 1.4891955713786058, + "learning_rate": 6.788757932910245e-05, + "loss": 0.9397, + "step": 936 + }, + { + "epoch": 0.4247506799637353, + "grad_norm": 1.3194849777903461, + "learning_rate": 6.796010879419766e-05, + "loss": 0.9414, + "step": 937 + }, + { + "epoch": 0.4252039891205802, + "grad_norm": 1.502342131945211, + "learning_rate": 6.803263825929285e-05, + "loss": 0.9439, + "step": 938 + }, + { + "epoch": 0.4256572982774252, + "grad_norm": 1.774141083835349, + "learning_rate": 6.810516772438803e-05, + "loss": 0.9257, + "step": 939 + }, + { + "epoch": 0.42611060743427015, + "grad_norm": 1.7455053940178245, + "learning_rate": 6.817769718948322e-05, + "loss": 0.9387, + "step": 940 + }, + { + "epoch": 0.42656391659111514, + "grad_norm": 1.129544706531614, + "learning_rate": 6.825022665457843e-05, + "loss": 0.9403, + "step": 941 + }, + { + "epoch": 0.4270172257479601, + "grad_norm": 1.732948916377446, + "learning_rate": 6.832275611967361e-05, + "loss": 0.9193, + "step": 942 + }, + { + "epoch": 0.4274705349048051, + "grad_norm": 1.0596313411874538, + "learning_rate": 6.839528558476882e-05, + "loss": 0.9328, + "step": 943 + }, + { + "epoch": 0.42792384406165007, + "grad_norm": 2.510228751742774, + "learning_rate": 6.846781504986402e-05, + "loss": 0.9321, + "step": 944 + }, + { + "epoch": 0.428377153218495, + "grad_norm": 1.5354028712389074, + "learning_rate": 6.854034451495921e-05, + "loss": 0.933, + "step": 945 + }, + { + "epoch": 0.42883046237534, + "grad_norm": 2.10013097564691, + "learning_rate": 6.861287398005441e-05, + "loss": 0.9582, + "step": 946 + }, + { + "epoch": 0.42928377153218494, + "grad_norm": 1.9929757857944115, + "learning_rate": 6.86854034451496e-05, + "loss": 0.9406, + "step": 947 + }, + { + "epoch": 0.42973708068902994, + "grad_norm": 1.963260940250398, + "learning_rate": 6.87579329102448e-05, + "loss": 0.941, + "step": 948 + }, + { + "epoch": 0.4301903898458749, + "grad_norm": 1.4080118567743063, + "learning_rate": 6.883046237533999e-05, + "loss": 0.9593, + "step": 949 + }, + { + "epoch": 0.43064369900271987, + "grad_norm": 2.7590175421937264, + "learning_rate": 6.890299184043518e-05, + "loss": 0.9396, + "step": 950 + }, + { + "epoch": 0.4310970081595648, + "grad_norm": 2.320743470688124, + "learning_rate": 6.897552130553037e-05, + "loss": 0.9304, + "step": 951 + }, + { + "epoch": 0.4315503173164098, + "grad_norm": 1.9443881398778053, + "learning_rate": 6.904805077062557e-05, + "loss": 0.9351, + "step": 952 + }, + { + "epoch": 0.43200362647325474, + "grad_norm": 2.468051713952071, + "learning_rate": 6.912058023572076e-05, + "loss": 0.9404, + "step": 953 + }, + { + "epoch": 0.43245693563009974, + "grad_norm": 1.2599390167703235, + "learning_rate": 6.919310970081596e-05, + "loss": 0.947, + "step": 954 + }, + { + "epoch": 0.4329102447869447, + "grad_norm": 3.5081738768640727, + "learning_rate": 6.926563916591115e-05, + "loss": 0.9288, + "step": 955 + }, + { + "epoch": 0.43336355394378967, + "grad_norm": 2.2382158006370503, + "learning_rate": 6.933816863100635e-05, + "loss": 0.9456, + "step": 956 + }, + { + "epoch": 0.4338168631006346, + "grad_norm": 3.705209928534828, + "learning_rate": 6.941069809610154e-05, + "loss": 0.9513, + "step": 957 + }, + { + "epoch": 0.4342701722574796, + "grad_norm": 2.9758727994960004, + "learning_rate": 6.948322756119675e-05, + "loss": 0.9585, + "step": 958 + }, + { + "epoch": 0.43472348141432454, + "grad_norm": 2.9047355279969183, + "learning_rate": 6.955575702629193e-05, + "loss": 0.9442, + "step": 959 + }, + { + "epoch": 0.43517679057116954, + "grad_norm": 3.4944956515289642, + "learning_rate": 6.962828649138714e-05, + "loss": 0.9593, + "step": 960 + }, + { + "epoch": 0.43563009972801453, + "grad_norm": 1.9226738845289713, + "learning_rate": 6.970081595648233e-05, + "loss": 0.9415, + "step": 961 + }, + { + "epoch": 0.43608340888485947, + "grad_norm": 1.6124738141894455, + "learning_rate": 6.977334542157751e-05, + "loss": 0.9145, + "step": 962 + }, + { + "epoch": 0.43653671804170446, + "grad_norm": 1.970471438700279, + "learning_rate": 6.984587488667272e-05, + "loss": 0.9516, + "step": 963 + }, + { + "epoch": 0.4369900271985494, + "grad_norm": 1.5307380655043137, + "learning_rate": 6.99184043517679e-05, + "loss": 0.9313, + "step": 964 + }, + { + "epoch": 0.4374433363553944, + "grad_norm": 2.025095875444784, + "learning_rate": 6.999093381686311e-05, + "loss": 0.9345, + "step": 965 + }, + { + "epoch": 0.43789664551223934, + "grad_norm": 1.2270479229092894, + "learning_rate": 7.00634632819583e-05, + "loss": 0.9428, + "step": 966 + }, + { + "epoch": 0.43834995466908433, + "grad_norm": 2.1293985378337625, + "learning_rate": 7.01359927470535e-05, + "loss": 0.9093, + "step": 967 + }, + { + "epoch": 0.43880326382592927, + "grad_norm": 1.7044946143740973, + "learning_rate": 7.020852221214869e-05, + "loss": 0.9276, + "step": 968 + }, + { + "epoch": 0.43925657298277426, + "grad_norm": 1.7963157746965182, + "learning_rate": 7.028105167724389e-05, + "loss": 0.9396, + "step": 969 + }, + { + "epoch": 0.4397098821396192, + "grad_norm": 1.8697551917468913, + "learning_rate": 7.035358114233908e-05, + "loss": 0.923, + "step": 970 + }, + { + "epoch": 0.4401631912964642, + "grad_norm": 1.328352473910411, + "learning_rate": 7.042611060743428e-05, + "loss": 0.9222, + "step": 971 + }, + { + "epoch": 0.44061650045330913, + "grad_norm": 1.478687393293392, + "learning_rate": 7.049864007252947e-05, + "loss": 0.9366, + "step": 972 + }, + { + "epoch": 0.44106980961015413, + "grad_norm": 1.2856007597418637, + "learning_rate": 7.057116953762466e-05, + "loss": 0.9094, + "step": 973 + }, + { + "epoch": 0.44152311876699907, + "grad_norm": 1.7851810947990185, + "learning_rate": 7.064369900271986e-05, + "loss": 0.9502, + "step": 974 + }, + { + "epoch": 0.44197642792384406, + "grad_norm": 1.0345446187242393, + "learning_rate": 7.071622846781505e-05, + "loss": 0.9339, + "step": 975 + }, + { + "epoch": 0.44242973708068906, + "grad_norm": 2.0394740698837945, + "learning_rate": 7.078875793291025e-05, + "loss": 0.9386, + "step": 976 + }, + { + "epoch": 0.442883046237534, + "grad_norm": 1.4928779330991742, + "learning_rate": 7.086128739800544e-05, + "loss": 0.9379, + "step": 977 + }, + { + "epoch": 0.443336355394379, + "grad_norm": 1.9033789297205554, + "learning_rate": 7.093381686310065e-05, + "loss": 0.9333, + "step": 978 + }, + { + "epoch": 0.44378966455122393, + "grad_norm": 1.4671009456033464, + "learning_rate": 7.100634632819584e-05, + "loss": 0.9476, + "step": 979 + }, + { + "epoch": 0.4442429737080689, + "grad_norm": 2.1270428907674583, + "learning_rate": 7.107887579329104e-05, + "loss": 0.9523, + "step": 980 + }, + { + "epoch": 0.44469628286491386, + "grad_norm": 1.8338154927689072, + "learning_rate": 7.115140525838623e-05, + "loss": 0.9733, + "step": 981 + }, + { + "epoch": 0.44514959202175886, + "grad_norm": 1.5092680025813463, + "learning_rate": 7.122393472348143e-05, + "loss": 0.9506, + "step": 982 + }, + { + "epoch": 0.4456029011786038, + "grad_norm": 1.5110762059552627, + "learning_rate": 7.129646418857662e-05, + "loss": 0.9435, + "step": 983 + }, + { + "epoch": 0.4460562103354488, + "grad_norm": 1.4695380475526965, + "learning_rate": 7.13689936536718e-05, + "loss": 0.9302, + "step": 984 + }, + { + "epoch": 0.4465095194922937, + "grad_norm": 1.4501562669917534, + "learning_rate": 7.1441523118767e-05, + "loss": 0.9055, + "step": 985 + }, + { + "epoch": 0.4469628286491387, + "grad_norm": 1.9374464873842485, + "learning_rate": 7.15140525838622e-05, + "loss": 0.9205, + "step": 986 + }, + { + "epoch": 0.44741613780598366, + "grad_norm": 1.6017113416133755, + "learning_rate": 7.158658204895739e-05, + "loss": 0.9268, + "step": 987 + }, + { + "epoch": 0.44786944696282865, + "grad_norm": 0.9925648829984232, + "learning_rate": 7.165911151405259e-05, + "loss": 0.926, + "step": 988 + }, + { + "epoch": 0.4483227561196736, + "grad_norm": 1.580300006432411, + "learning_rate": 7.173164097914778e-05, + "loss": 0.9295, + "step": 989 + }, + { + "epoch": 0.4487760652765186, + "grad_norm": 1.3785101751945472, + "learning_rate": 7.180417044424298e-05, + "loss": 0.9155, + "step": 990 + }, + { + "epoch": 0.4492293744333635, + "grad_norm": 1.9267176644549786, + "learning_rate": 7.187669990933817e-05, + "loss": 0.9396, + "step": 991 + }, + { + "epoch": 0.4496826835902085, + "grad_norm": 1.568814431355168, + "learning_rate": 7.194922937443337e-05, + "loss": 0.9509, + "step": 992 + }, + { + "epoch": 0.4501359927470535, + "grad_norm": 1.5381497981036112, + "learning_rate": 7.202175883952857e-05, + "loss": 0.9323, + "step": 993 + }, + { + "epoch": 0.45058930190389845, + "grad_norm": 1.6920723928858534, + "learning_rate": 7.209428830462376e-05, + "loss": 0.9194, + "step": 994 + }, + { + "epoch": 0.45104261106074345, + "grad_norm": 1.3137412035397498, + "learning_rate": 7.216681776971895e-05, + "loss": 0.9234, + "step": 995 + }, + { + "epoch": 0.4514959202175884, + "grad_norm": 1.9229419765415254, + "learning_rate": 7.223934723481414e-05, + "loss": 0.948, + "step": 996 + }, + { + "epoch": 0.4519492293744334, + "grad_norm": 1.239145594527727, + "learning_rate": 7.231187669990934e-05, + "loss": 0.9514, + "step": 997 + }, + { + "epoch": 0.4524025385312783, + "grad_norm": 1.6807576549877472, + "learning_rate": 7.238440616500453e-05, + "loss": 0.9326, + "step": 998 + }, + { + "epoch": 0.4528558476881233, + "grad_norm": 2.103544411539409, + "learning_rate": 7.245693563009974e-05, + "loss": 0.9547, + "step": 999 + }, + { + "epoch": 0.45330915684496825, + "grad_norm": 0.9493300849718785, + "learning_rate": 7.252946509519492e-05, + "loss": 0.9291, + "step": 1000 + }, + { + "epoch": 0.45376246600181325, + "grad_norm": 1.0630335941273665, + "learning_rate": 7.260199456029013e-05, + "loss": 0.9218, + "step": 1001 + }, + { + "epoch": 0.4542157751586582, + "grad_norm": 1.3308930955965812, + "learning_rate": 7.267452402538532e-05, + "loss": 0.9596, + "step": 1002 + }, + { + "epoch": 0.4546690843155032, + "grad_norm": 2.150122918889247, + "learning_rate": 7.274705349048052e-05, + "loss": 0.9426, + "step": 1003 + }, + { + "epoch": 0.4551223934723481, + "grad_norm": 1.6883739049117814, + "learning_rate": 7.281958295557571e-05, + "loss": 0.9241, + "step": 1004 + }, + { + "epoch": 0.4555757026291931, + "grad_norm": 1.417905367224721, + "learning_rate": 7.289211242067091e-05, + "loss": 0.9238, + "step": 1005 + }, + { + "epoch": 0.45602901178603805, + "grad_norm": 1.5758494544426542, + "learning_rate": 7.29646418857661e-05, + "loss": 0.9361, + "step": 1006 + }, + { + "epoch": 0.45648232094288305, + "grad_norm": 1.2800568807162367, + "learning_rate": 7.303717135086129e-05, + "loss": 0.9421, + "step": 1007 + }, + { + "epoch": 0.45693563009972804, + "grad_norm": 2.3279690848714707, + "learning_rate": 7.310970081595649e-05, + "loss": 0.9439, + "step": 1008 + }, + { + "epoch": 0.457388939256573, + "grad_norm": 1.2441247004377207, + "learning_rate": 7.318223028105168e-05, + "loss": 0.9351, + "step": 1009 + }, + { + "epoch": 0.457842248413418, + "grad_norm": 1.8489834056260686, + "learning_rate": 7.325475974614688e-05, + "loss": 0.9571, + "step": 1010 + }, + { + "epoch": 0.4582955575702629, + "grad_norm": 1.18632322569053, + "learning_rate": 7.332728921124207e-05, + "loss": 0.9328, + "step": 1011 + }, + { + "epoch": 0.4587488667271079, + "grad_norm": 2.130749616424102, + "learning_rate": 7.339981867633727e-05, + "loss": 0.9397, + "step": 1012 + }, + { + "epoch": 0.45920217588395285, + "grad_norm": 1.8435400088620908, + "learning_rate": 7.347234814143246e-05, + "loss": 0.9469, + "step": 1013 + }, + { + "epoch": 0.45965548504079784, + "grad_norm": 1.5466130380534353, + "learning_rate": 7.354487760652766e-05, + "loss": 0.9468, + "step": 1014 + }, + { + "epoch": 0.4601087941976428, + "grad_norm": 1.7102975341037632, + "learning_rate": 7.361740707162285e-05, + "loss": 0.9448, + "step": 1015 + }, + { + "epoch": 0.4605621033544878, + "grad_norm": 1.696463562610429, + "learning_rate": 7.368993653671806e-05, + "loss": 0.9411, + "step": 1016 + }, + { + "epoch": 0.4610154125113327, + "grad_norm": 1.413872623013266, + "learning_rate": 7.376246600181324e-05, + "loss": 0.9633, + "step": 1017 + }, + { + "epoch": 0.4614687216681777, + "grad_norm": 1.6939628491066452, + "learning_rate": 7.383499546690843e-05, + "loss": 0.9352, + "step": 1018 + }, + { + "epoch": 0.46192203082502264, + "grad_norm": 1.2403461460914742, + "learning_rate": 7.390752493200364e-05, + "loss": 0.9384, + "step": 1019 + }, + { + "epoch": 0.46237533998186764, + "grad_norm": 1.8243041776791569, + "learning_rate": 7.398005439709882e-05, + "loss": 0.934, + "step": 1020 + }, + { + "epoch": 0.4628286491387126, + "grad_norm": 1.5417445980794333, + "learning_rate": 7.405258386219401e-05, + "loss": 0.942, + "step": 1021 + }, + { + "epoch": 0.46328195829555757, + "grad_norm": 1.4465034253728488, + "learning_rate": 7.412511332728922e-05, + "loss": 0.9036, + "step": 1022 + }, + { + "epoch": 0.4637352674524025, + "grad_norm": 1.0618175991445225, + "learning_rate": 7.41976427923844e-05, + "loss": 0.946, + "step": 1023 + }, + { + "epoch": 0.4641885766092475, + "grad_norm": 1.8891330937985298, + "learning_rate": 7.427017225747961e-05, + "loss": 0.938, + "step": 1024 + }, + { + "epoch": 0.4646418857660925, + "grad_norm": 1.821385470525838, + "learning_rate": 7.434270172257481e-05, + "loss": 0.9531, + "step": 1025 + }, + { + "epoch": 0.46509519492293744, + "grad_norm": 1.2519590241371303, + "learning_rate": 7.441523118767e-05, + "loss": 0.9259, + "step": 1026 + }, + { + "epoch": 0.46554850407978243, + "grad_norm": 1.4444719397577677, + "learning_rate": 7.44877606527652e-05, + "loss": 0.9378, + "step": 1027 + }, + { + "epoch": 0.46600181323662737, + "grad_norm": 1.326672762837731, + "learning_rate": 7.456029011786039e-05, + "loss": 0.9247, + "step": 1028 + }, + { + "epoch": 0.46645512239347237, + "grad_norm": 0.8931438874695452, + "learning_rate": 7.463281958295558e-05, + "loss": 0.9371, + "step": 1029 + }, + { + "epoch": 0.4669084315503173, + "grad_norm": 1.722591982160326, + "learning_rate": 7.470534904805077e-05, + "loss": 0.9191, + "step": 1030 + }, + { + "epoch": 0.4673617407071623, + "grad_norm": 1.3548159637884043, + "learning_rate": 7.477787851314597e-05, + "loss": 0.9276, + "step": 1031 + }, + { + "epoch": 0.46781504986400724, + "grad_norm": 2.0565819452148775, + "learning_rate": 7.485040797824116e-05, + "loss": 0.9261, + "step": 1032 + }, + { + "epoch": 0.46826835902085223, + "grad_norm": 1.6696871761574352, + "learning_rate": 7.492293744333636e-05, + "loss": 0.9372, + "step": 1033 + }, + { + "epoch": 0.46872166817769717, + "grad_norm": 0.82577844790418, + "learning_rate": 7.499546690843155e-05, + "loss": 0.9207, + "step": 1034 + }, + { + "epoch": 0.46917497733454216, + "grad_norm": 1.9443843401600236, + "learning_rate": 7.506799637352675e-05, + "loss": 0.9354, + "step": 1035 + }, + { + "epoch": 0.4696282864913871, + "grad_norm": 2.0138320341947327, + "learning_rate": 7.514052583862194e-05, + "loss": 0.9227, + "step": 1036 + }, + { + "epoch": 0.4700815956482321, + "grad_norm": 1.1650082821792327, + "learning_rate": 7.521305530371714e-05, + "loss": 0.9328, + "step": 1037 + }, + { + "epoch": 0.47053490480507704, + "grad_norm": 1.2838474467658123, + "learning_rate": 7.528558476881233e-05, + "loss": 0.9362, + "step": 1038 + }, + { + "epoch": 0.47098821396192203, + "grad_norm": 1.0453670570329108, + "learning_rate": 7.535811423390754e-05, + "loss": 0.9229, + "step": 1039 + }, + { + "epoch": 0.471441523118767, + "grad_norm": 2.0724878446960027, + "learning_rate": 7.543064369900273e-05, + "loss": 0.962, + "step": 1040 + }, + { + "epoch": 0.47189483227561196, + "grad_norm": 1.3337189561193297, + "learning_rate": 7.550317316409791e-05, + "loss": 0.9352, + "step": 1041 + }, + { + "epoch": 0.47234814143245696, + "grad_norm": 2.693288053840458, + "learning_rate": 7.557570262919312e-05, + "loss": 0.9483, + "step": 1042 + }, + { + "epoch": 0.4728014505893019, + "grad_norm": 1.3480366895442522, + "learning_rate": 7.56482320942883e-05, + "loss": 0.9276, + "step": 1043 + }, + { + "epoch": 0.4732547597461469, + "grad_norm": 3.269449355440945, + "learning_rate": 7.572076155938351e-05, + "loss": 0.9343, + "step": 1044 + }, + { + "epoch": 0.47370806890299183, + "grad_norm": 2.5634358365156875, + "learning_rate": 7.57932910244787e-05, + "loss": 0.9544, + "step": 1045 + }, + { + "epoch": 0.4741613780598368, + "grad_norm": 2.2653530336177266, + "learning_rate": 7.58658204895739e-05, + "loss": 0.9343, + "step": 1046 + }, + { + "epoch": 0.47461468721668176, + "grad_norm": 2.0650438598951073, + "learning_rate": 7.593834995466909e-05, + "loss": 0.9451, + "step": 1047 + }, + { + "epoch": 0.47506799637352676, + "grad_norm": 1.946766269386176, + "learning_rate": 7.601087941976429e-05, + "loss": 0.9368, + "step": 1048 + }, + { + "epoch": 0.4755213055303717, + "grad_norm": 1.9837057639824804, + "learning_rate": 7.608340888485948e-05, + "loss": 0.9326, + "step": 1049 + }, + { + "epoch": 0.4759746146872167, + "grad_norm": 1.7696256080976231, + "learning_rate": 7.615593834995468e-05, + "loss": 0.9364, + "step": 1050 + }, + { + "epoch": 0.47642792384406163, + "grad_norm": 1.6890934312484855, + "learning_rate": 7.622846781504987e-05, + "loss": 0.9371, + "step": 1051 + }, + { + "epoch": 0.4768812330009066, + "grad_norm": 2.7295061742800923, + "learning_rate": 7.630099728014506e-05, + "loss": 0.9653, + "step": 1052 + }, + { + "epoch": 0.47733454215775156, + "grad_norm": 1.700768595181305, + "learning_rate": 7.637352674524026e-05, + "loss": 0.9319, + "step": 1053 + }, + { + "epoch": 0.47778785131459656, + "grad_norm": 2.660900188884576, + "learning_rate": 7.644605621033545e-05, + "loss": 0.9156, + "step": 1054 + }, + { + "epoch": 0.4782411604714415, + "grad_norm": 1.8192445686234342, + "learning_rate": 7.651858567543064e-05, + "loss": 0.9432, + "step": 1055 + }, + { + "epoch": 0.4786944696282865, + "grad_norm": 2.61197597226296, + "learning_rate": 7.659111514052584e-05, + "loss": 0.9189, + "step": 1056 + }, + { + "epoch": 0.4791477787851315, + "grad_norm": 1.723966847834538, + "learning_rate": 7.666364460562105e-05, + "loss": 0.9382, + "step": 1057 + }, + { + "epoch": 0.4796010879419764, + "grad_norm": 1.7936903514703189, + "learning_rate": 7.673617407071623e-05, + "loss": 0.9424, + "step": 1058 + }, + { + "epoch": 0.4800543970988214, + "grad_norm": 1.9636976024452146, + "learning_rate": 7.680870353581144e-05, + "loss": 0.9529, + "step": 1059 + }, + { + "epoch": 0.48050770625566636, + "grad_norm": 1.3505731920784558, + "learning_rate": 7.688123300090663e-05, + "loss": 0.9118, + "step": 1060 + }, + { + "epoch": 0.48096101541251135, + "grad_norm": 1.2878004364864275, + "learning_rate": 7.695376246600183e-05, + "loss": 0.9216, + "step": 1061 + }, + { + "epoch": 0.4814143245693563, + "grad_norm": 2.6822139725243637, + "learning_rate": 7.702629193109702e-05, + "loss": 0.9486, + "step": 1062 + }, + { + "epoch": 0.4818676337262013, + "grad_norm": 1.4387186659833817, + "learning_rate": 7.70988213961922e-05, + "loss": 0.9162, + "step": 1063 + }, + { + "epoch": 0.4823209428830462, + "grad_norm": 3.3730033560990575, + "learning_rate": 7.717135086128741e-05, + "loss": 0.96, + "step": 1064 + }, + { + "epoch": 0.4827742520398912, + "grad_norm": 2.436141421851693, + "learning_rate": 7.72438803263826e-05, + "loss": 0.9433, + "step": 1065 + }, + { + "epoch": 0.48322756119673616, + "grad_norm": 3.2372033928135977, + "learning_rate": 7.731640979147779e-05, + "loss": 0.9245, + "step": 1066 + }, + { + "epoch": 0.48368087035358115, + "grad_norm": 2.260827881164444, + "learning_rate": 7.738893925657299e-05, + "loss": 0.9502, + "step": 1067 + }, + { + "epoch": 0.4841341795104261, + "grad_norm": 3.584449133109595, + "learning_rate": 7.746146872166818e-05, + "loss": 0.957, + "step": 1068 + }, + { + "epoch": 0.4845874886672711, + "grad_norm": 3.089701408521804, + "learning_rate": 7.753399818676338e-05, + "loss": 0.9348, + "step": 1069 + }, + { + "epoch": 0.485040797824116, + "grad_norm": 2.298167656109134, + "learning_rate": 7.760652765185857e-05, + "loss": 0.9082, + "step": 1070 + }, + { + "epoch": 0.485494106980961, + "grad_norm": 2.1529663896142273, + "learning_rate": 7.767905711695377e-05, + "loss": 0.9233, + "step": 1071 + }, + { + "epoch": 0.485947416137806, + "grad_norm": 2.467847476419537, + "learning_rate": 7.775158658204897e-05, + "loss": 0.9277, + "step": 1072 + }, + { + "epoch": 0.48640072529465095, + "grad_norm": 1.7350022379515349, + "learning_rate": 7.782411604714416e-05, + "loss": 0.9274, + "step": 1073 + }, + { + "epoch": 0.48685403445149594, + "grad_norm": 3.203641244436875, + "learning_rate": 7.789664551223935e-05, + "loss": 0.9306, + "step": 1074 + }, + { + "epoch": 0.4873073436083409, + "grad_norm": 2.9372658375626886, + "learning_rate": 7.796917497733455e-05, + "loss": 0.9453, + "step": 1075 + }, + { + "epoch": 0.4877606527651859, + "grad_norm": 2.2765481770407003, + "learning_rate": 7.804170444242974e-05, + "loss": 0.9351, + "step": 1076 + }, + { + "epoch": 0.4882139619220308, + "grad_norm": 1.8437389756468139, + "learning_rate": 7.811423390752493e-05, + "loss": 0.9491, + "step": 1077 + }, + { + "epoch": 0.4886672710788758, + "grad_norm": 3.007804058290887, + "learning_rate": 7.818676337262013e-05, + "loss": 0.9533, + "step": 1078 + }, + { + "epoch": 0.48912058023572075, + "grad_norm": 2.3478189255955613, + "learning_rate": 7.825929283771532e-05, + "loss": 0.9308, + "step": 1079 + }, + { + "epoch": 0.48957388939256574, + "grad_norm": 2.6457468583946366, + "learning_rate": 7.833182230281053e-05, + "loss": 0.9334, + "step": 1080 + }, + { + "epoch": 0.4900271985494107, + "grad_norm": 2.486224967214583, + "learning_rate": 7.840435176790571e-05, + "loss": 0.9388, + "step": 1081 + }, + { + "epoch": 0.4904805077062557, + "grad_norm": 2.238906257278672, + "learning_rate": 7.847688123300092e-05, + "loss": 0.9277, + "step": 1082 + }, + { + "epoch": 0.4909338168631006, + "grad_norm": 1.8644857548968845, + "learning_rate": 7.85494106980961e-05, + "loss": 0.9227, + "step": 1083 + }, + { + "epoch": 0.4913871260199456, + "grad_norm": 2.7479078988924313, + "learning_rate": 7.862194016319131e-05, + "loss": 0.9429, + "step": 1084 + }, + { + "epoch": 0.49184043517679055, + "grad_norm": 2.4983002854989325, + "learning_rate": 7.86944696282865e-05, + "loss": 0.9331, + "step": 1085 + }, + { + "epoch": 0.49229374433363554, + "grad_norm": 2.2531732778404154, + "learning_rate": 7.876699909338169e-05, + "loss": 0.9152, + "step": 1086 + }, + { + "epoch": 0.4927470534904805, + "grad_norm": 2.0842759437699603, + "learning_rate": 7.883952855847689e-05, + "loss": 0.9418, + "step": 1087 + }, + { + "epoch": 0.4932003626473255, + "grad_norm": 2.303072083728707, + "learning_rate": 7.891205802357208e-05, + "loss": 0.9364, + "step": 1088 + }, + { + "epoch": 0.49365367180417047, + "grad_norm": 1.8734469770043933, + "learning_rate": 7.898458748866728e-05, + "loss": 0.9462, + "step": 1089 + }, + { + "epoch": 0.4941069809610154, + "grad_norm": 2.774160319433171, + "learning_rate": 7.905711695376247e-05, + "loss": 0.9294, + "step": 1090 + }, + { + "epoch": 0.4945602901178604, + "grad_norm": 2.5669691956525025, + "learning_rate": 7.912964641885767e-05, + "loss": 0.919, + "step": 1091 + }, + { + "epoch": 0.49501359927470534, + "grad_norm": 1.8666203696065424, + "learning_rate": 7.920217588395286e-05, + "loss": 0.9266, + "step": 1092 + }, + { + "epoch": 0.49546690843155033, + "grad_norm": 1.7351050736888558, + "learning_rate": 7.927470534904806e-05, + "loss": 0.9531, + "step": 1093 + }, + { + "epoch": 0.4959202175883953, + "grad_norm": 2.6803966572788314, + "learning_rate": 7.934723481414325e-05, + "loss": 0.9467, + "step": 1094 + }, + { + "epoch": 0.49637352674524027, + "grad_norm": 2.289641312674465, + "learning_rate": 7.941976427923845e-05, + "loss": 0.9038, + "step": 1095 + }, + { + "epoch": 0.4968268359020852, + "grad_norm": 2.425630692644342, + "learning_rate": 7.949229374433364e-05, + "loss": 0.932, + "step": 1096 + }, + { + "epoch": 0.4972801450589302, + "grad_norm": 2.2846383408461426, + "learning_rate": 7.956482320942883e-05, + "loss": 0.9415, + "step": 1097 + }, + { + "epoch": 0.49773345421577514, + "grad_norm": 2.0699897017654996, + "learning_rate": 7.963735267452403e-05, + "loss": 0.9381, + "step": 1098 + }, + { + "epoch": 0.49818676337262013, + "grad_norm": 1.6040296720231002, + "learning_rate": 7.970988213961922e-05, + "loss": 0.9077, + "step": 1099 + }, + { + "epoch": 0.4986400725294651, + "grad_norm": 2.9515464867478127, + "learning_rate": 7.978241160471441e-05, + "loss": 0.9248, + "step": 1100 + }, + { + "epoch": 0.49909338168631007, + "grad_norm": 2.7109636171352878, + "learning_rate": 7.985494106980962e-05, + "loss": 0.9466, + "step": 1101 + }, + { + "epoch": 0.499546690843155, + "grad_norm": 1.7343425595722934, + "learning_rate": 7.99274705349048e-05, + "loss": 0.9299, + "step": 1102 + }, + { + "epoch": 0.5, + "grad_norm": 1.4764011414013212, + "learning_rate": 8e-05, + "loss": 0.9386, + "step": 1103 + }, + { + "epoch": 0.5004533091568449, + "grad_norm": 2.695670112637328, + "learning_rate": 7.999999799694122e-05, + "loss": 0.9023, + "step": 1104 + }, + { + "epoch": 0.50090661831369, + "grad_norm": 2.2925051754476518, + "learning_rate": 7.99999919877651e-05, + "loss": 0.9321, + "step": 1105 + }, + { + "epoch": 0.5013599274705349, + "grad_norm": 2.2740432513920323, + "learning_rate": 7.99999819724722e-05, + "loss": 0.9286, + "step": 1106 + }, + { + "epoch": 0.5018132366273799, + "grad_norm": 2.2060117856176227, + "learning_rate": 7.999996795106356e-05, + "loss": 0.9207, + "step": 1107 + }, + { + "epoch": 0.5022665457842248, + "grad_norm": 2.15396850113845, + "learning_rate": 7.999994992354056e-05, + "loss": 0.9383, + "step": 1108 + }, + { + "epoch": 0.5027198549410699, + "grad_norm": 1.8069551985266765, + "learning_rate": 7.999992788990503e-05, + "loss": 0.9442, + "step": 1109 + }, + { + "epoch": 0.5031731640979148, + "grad_norm": 2.3582115363757667, + "learning_rate": 7.999990185015916e-05, + "loss": 0.9224, + "step": 1110 + }, + { + "epoch": 0.5036264732547597, + "grad_norm": 2.0277008831098047, + "learning_rate": 7.999987180430555e-05, + "loss": 0.9092, + "step": 1111 + }, + { + "epoch": 0.5040797824116047, + "grad_norm": 2.2899359432657618, + "learning_rate": 7.999983775234725e-05, + "loss": 0.9266, + "step": 1112 + }, + { + "epoch": 0.5045330915684497, + "grad_norm": 1.9631974550532858, + "learning_rate": 7.999979969428761e-05, + "loss": 0.9345, + "step": 1113 + }, + { + "epoch": 0.5049864007252947, + "grad_norm": 2.1380498594261144, + "learning_rate": 7.99997576301305e-05, + "loss": 0.9049, + "step": 1114 + }, + { + "epoch": 0.5054397098821396, + "grad_norm": 1.9504991230032644, + "learning_rate": 7.999971155988009e-05, + "loss": 0.9342, + "step": 1115 + }, + { + "epoch": 0.5058930190389845, + "grad_norm": 2.341121499622892, + "learning_rate": 7.999966148354102e-05, + "loss": 0.9186, + "step": 1116 + }, + { + "epoch": 0.5063463281958296, + "grad_norm": 1.9971894307381017, + "learning_rate": 7.999960740111828e-05, + "loss": 0.9294, + "step": 1117 + }, + { + "epoch": 0.5067996373526745, + "grad_norm": 2.0086681838912948, + "learning_rate": 7.999954931261732e-05, + "loss": 0.9267, + "step": 1118 + }, + { + "epoch": 0.5072529465095195, + "grad_norm": 1.6778690218469297, + "learning_rate": 7.999948721804394e-05, + "loss": 0.9151, + "step": 1119 + }, + { + "epoch": 0.5077062556663645, + "grad_norm": 2.530369648740528, + "learning_rate": 7.999942111740435e-05, + "loss": 0.9356, + "step": 1120 + }, + { + "epoch": 0.5081595648232095, + "grad_norm": 2.312473866902849, + "learning_rate": 7.999935101070518e-05, + "loss": 0.941, + "step": 1121 + }, + { + "epoch": 0.5086128739800544, + "grad_norm": 1.6581041886500867, + "learning_rate": 7.999927689795345e-05, + "loss": 0.952, + "step": 1122 + }, + { + "epoch": 0.5090661831368993, + "grad_norm": 1.3904681149437466, + "learning_rate": 7.999919877915658e-05, + "loss": 0.9541, + "step": 1123 + }, + { + "epoch": 0.5095194922937444, + "grad_norm": 2.497886380008753, + "learning_rate": 7.999911665432241e-05, + "loss": 0.9381, + "step": 1124 + }, + { + "epoch": 0.5099728014505893, + "grad_norm": 1.8857460370201293, + "learning_rate": 7.999903052345913e-05, + "loss": 0.9345, + "step": 1125 + }, + { + "epoch": 0.5104261106074343, + "grad_norm": 2.1569080203720494, + "learning_rate": 7.99989403865754e-05, + "loss": 0.9197, + "step": 1126 + }, + { + "epoch": 0.5108794197642792, + "grad_norm": 1.9615290493573445, + "learning_rate": 7.999884624368025e-05, + "loss": 0.9378, + "step": 1127 + }, + { + "epoch": 0.5113327289211242, + "grad_norm": 1.993796741702161, + "learning_rate": 7.999874809478308e-05, + "loss": 0.9248, + "step": 1128 + }, + { + "epoch": 0.5117860380779692, + "grad_norm": 1.510655058083619, + "learning_rate": 7.999864593989372e-05, + "loss": 0.9496, + "step": 1129 + }, + { + "epoch": 0.5122393472348141, + "grad_norm": 2.4776453926090585, + "learning_rate": 7.999853977902244e-05, + "loss": 0.9512, + "step": 1130 + }, + { + "epoch": 0.5126926563916591, + "grad_norm": 2.103657395433103, + "learning_rate": 7.999842961217983e-05, + "loss": 0.9367, + "step": 1131 + }, + { + "epoch": 0.5131459655485041, + "grad_norm": 1.9812817697375218, + "learning_rate": 7.999831543937695e-05, + "loss": 0.9307, + "step": 1132 + }, + { + "epoch": 0.513599274705349, + "grad_norm": 1.697164342042479, + "learning_rate": 7.999819726062522e-05, + "loss": 0.9359, + "step": 1133 + }, + { + "epoch": 0.514052583862194, + "grad_norm": 1.9589356983806658, + "learning_rate": 7.999807507593648e-05, + "loss": 0.946, + "step": 1134 + }, + { + "epoch": 0.514505893019039, + "grad_norm": 1.4351637273159832, + "learning_rate": 7.999794888532299e-05, + "loss": 0.9311, + "step": 1135 + }, + { + "epoch": 0.514959202175884, + "grad_norm": 2.502173387766251, + "learning_rate": 7.999781868879735e-05, + "loss": 0.9466, + "step": 1136 + }, + { + "epoch": 0.5154125113327289, + "grad_norm": 2.0352046025474713, + "learning_rate": 7.999768448637261e-05, + "loss": 0.9388, + "step": 1137 + }, + { + "epoch": 0.5158658204895739, + "grad_norm": 1.881696184478307, + "learning_rate": 7.999754627806223e-05, + "loss": 0.9446, + "step": 1138 + }, + { + "epoch": 0.5163191296464189, + "grad_norm": 1.8376949912854326, + "learning_rate": 7.999740406388004e-05, + "loss": 0.9462, + "step": 1139 + }, + { + "epoch": 0.5167724388032638, + "grad_norm": 1.8035638595031986, + "learning_rate": 7.999725784384029e-05, + "loss": 0.931, + "step": 1140 + }, + { + "epoch": 0.5172257479601088, + "grad_norm": 1.3988736284030177, + "learning_rate": 7.99971076179576e-05, + "loss": 0.9448, + "step": 1141 + }, + { + "epoch": 0.5176790571169537, + "grad_norm": 1.368310315317596, + "learning_rate": 7.999695338624704e-05, + "loss": 0.9166, + "step": 1142 + }, + { + "epoch": 0.5181323662737988, + "grad_norm": 1.1847295092246892, + "learning_rate": 7.999679514872404e-05, + "loss": 0.9199, + "step": 1143 + }, + { + "epoch": 0.5185856754306437, + "grad_norm": 1.6034456483432264, + "learning_rate": 7.999663290540447e-05, + "loss": 0.9346, + "step": 1144 + }, + { + "epoch": 0.5190389845874886, + "grad_norm": 1.1223924091267217, + "learning_rate": 7.999646665630457e-05, + "loss": 0.9315, + "step": 1145 + }, + { + "epoch": 0.5194922937443336, + "grad_norm": 2.2708987539565246, + "learning_rate": 7.999629640144097e-05, + "loss": 0.9488, + "step": 1146 + }, + { + "epoch": 0.5199456029011786, + "grad_norm": 1.7764572636641531, + "learning_rate": 7.999612214083075e-05, + "loss": 0.9195, + "step": 1147 + }, + { + "epoch": 0.5203989120580236, + "grad_norm": 1.550365515234303, + "learning_rate": 7.999594387449135e-05, + "loss": 0.9383, + "step": 1148 + }, + { + "epoch": 0.5208522212148685, + "grad_norm": 1.7845138918426924, + "learning_rate": 7.999576160244063e-05, + "loss": 0.9345, + "step": 1149 + }, + { + "epoch": 0.5213055303717135, + "grad_norm": 0.9496993566026093, + "learning_rate": 7.999557532469683e-05, + "loss": 0.9338, + "step": 1150 + }, + { + "epoch": 0.5217588395285585, + "grad_norm": 1.4664221535426507, + "learning_rate": 7.999538504127863e-05, + "loss": 0.9516, + "step": 1151 + }, + { + "epoch": 0.5222121486854034, + "grad_norm": 1.0281250562577597, + "learning_rate": 7.999519075220506e-05, + "loss": 0.9194, + "step": 1152 + }, + { + "epoch": 0.5226654578422484, + "grad_norm": 1.5976559252038354, + "learning_rate": 7.999499245749559e-05, + "loss": 0.9383, + "step": 1153 + }, + { + "epoch": 0.5231187669990934, + "grad_norm": 1.654659855912099, + "learning_rate": 7.99947901571701e-05, + "loss": 0.9203, + "step": 1154 + }, + { + "epoch": 0.5235720761559384, + "grad_norm": 0.9466408928244222, + "learning_rate": 7.999458385124881e-05, + "loss": 0.9326, + "step": 1155 + }, + { + "epoch": 0.5240253853127833, + "grad_norm": 1.8925501229806927, + "learning_rate": 7.999437353975243e-05, + "loss": 0.9143, + "step": 1156 + }, + { + "epoch": 0.5244786944696282, + "grad_norm": 1.4996031490785258, + "learning_rate": 7.999415922270198e-05, + "loss": 0.9327, + "step": 1157 + }, + { + "epoch": 0.5249320036264733, + "grad_norm": 1.4119384062680873, + "learning_rate": 7.999394090011896e-05, + "loss": 0.9482, + "step": 1158 + }, + { + "epoch": 0.5253853127833182, + "grad_norm": 1.1803796683777192, + "learning_rate": 7.99937185720252e-05, + "loss": 0.9299, + "step": 1159 + }, + { + "epoch": 0.5258386219401632, + "grad_norm": 1.8550921726156167, + "learning_rate": 7.999349223844299e-05, + "loss": 0.9373, + "step": 1160 + }, + { + "epoch": 0.5262919310970081, + "grad_norm": 1.1433555743769983, + "learning_rate": 7.9993261899395e-05, + "loss": 0.9201, + "step": 1161 + }, + { + "epoch": 0.5267452402538532, + "grad_norm": 1.5592202948602767, + "learning_rate": 7.999302755490429e-05, + "loss": 0.932, + "step": 1162 + }, + { + "epoch": 0.5271985494106981, + "grad_norm": 1.2656121878911029, + "learning_rate": 7.999278920499434e-05, + "loss": 0.9446, + "step": 1163 + }, + { + "epoch": 0.527651858567543, + "grad_norm": 1.2928889801566954, + "learning_rate": 7.9992546849689e-05, + "loss": 0.9319, + "step": 1164 + }, + { + "epoch": 0.528105167724388, + "grad_norm": 1.3797198159110364, + "learning_rate": 7.999230048901257e-05, + "loss": 0.9183, + "step": 1165 + }, + { + "epoch": 0.528558476881233, + "grad_norm": 1.5960311777159253, + "learning_rate": 7.999205012298972e-05, + "loss": 0.9271, + "step": 1166 + }, + { + "epoch": 0.529011786038078, + "grad_norm": 0.9653553560637108, + "learning_rate": 7.99917957516455e-05, + "loss": 0.9255, + "step": 1167 + }, + { + "epoch": 0.5294650951949229, + "grad_norm": 1.7923802030136942, + "learning_rate": 7.999153737500539e-05, + "loss": 0.9154, + "step": 1168 + }, + { + "epoch": 0.529918404351768, + "grad_norm": 1.3826587408325095, + "learning_rate": 7.999127499309528e-05, + "loss": 0.9532, + "step": 1169 + }, + { + "epoch": 0.5303717135086129, + "grad_norm": 1.4846656499424202, + "learning_rate": 7.999100860594148e-05, + "loss": 0.9265, + "step": 1170 + }, + { + "epoch": 0.5308250226654578, + "grad_norm": 1.4535982006707384, + "learning_rate": 7.999073821357062e-05, + "loss": 0.9457, + "step": 1171 + }, + { + "epoch": 0.5312783318223028, + "grad_norm": 1.5863041789555974, + "learning_rate": 7.99904638160098e-05, + "loss": 0.9429, + "step": 1172 + }, + { + "epoch": 0.5317316409791478, + "grad_norm": 1.2546242586166059, + "learning_rate": 7.999018541328647e-05, + "loss": 0.9277, + "step": 1173 + }, + { + "epoch": 0.5321849501359928, + "grad_norm": 1.4142749526831937, + "learning_rate": 7.998990300542858e-05, + "loss": 0.9371, + "step": 1174 + }, + { + "epoch": 0.5326382592928377, + "grad_norm": 1.3169646383955897, + "learning_rate": 7.998961659246435e-05, + "loss": 0.9317, + "step": 1175 + }, + { + "epoch": 0.5330915684496826, + "grad_norm": 1.0731343832370837, + "learning_rate": 7.998932617442251e-05, + "loss": 0.9134, + "step": 1176 + }, + { + "epoch": 0.5335448776065277, + "grad_norm": 1.4176139039321232, + "learning_rate": 7.998903175133212e-05, + "loss": 0.9276, + "step": 1177 + }, + { + "epoch": 0.5339981867633726, + "grad_norm": 1.4422524326429709, + "learning_rate": 7.998873332322267e-05, + "loss": 0.9258, + "step": 1178 + }, + { + "epoch": 0.5344514959202176, + "grad_norm": 1.5488328519118832, + "learning_rate": 7.998843089012406e-05, + "loss": 0.9338, + "step": 1179 + }, + { + "epoch": 0.5349048050770625, + "grad_norm": 1.1924329779085927, + "learning_rate": 7.998812445206657e-05, + "loss": 0.929, + "step": 1180 + }, + { + "epoch": 0.5353581142339076, + "grad_norm": 1.5778714592525653, + "learning_rate": 7.998781400908089e-05, + "loss": 0.9354, + "step": 1181 + }, + { + "epoch": 0.5358114233907525, + "grad_norm": 1.4769312241470074, + "learning_rate": 7.998749956119812e-05, + "loss": 0.9259, + "step": 1182 + }, + { + "epoch": 0.5362647325475974, + "grad_norm": 1.0079471194190697, + "learning_rate": 7.998718110844973e-05, + "loss": 0.9068, + "step": 1183 + }, + { + "epoch": 0.5367180417044425, + "grad_norm": 1.9703380999389104, + "learning_rate": 7.998685865086766e-05, + "loss": 0.9184, + "step": 1184 + }, + { + "epoch": 0.5371713508612874, + "grad_norm": 1.0223939694929478, + "learning_rate": 7.998653218848416e-05, + "loss": 0.9493, + "step": 1185 + }, + { + "epoch": 0.5376246600181324, + "grad_norm": 1.2875613889844806, + "learning_rate": 7.998620172133194e-05, + "loss": 0.9417, + "step": 1186 + }, + { + "epoch": 0.5380779691749773, + "grad_norm": 1.5459725293825066, + "learning_rate": 7.998586724944411e-05, + "loss": 0.9454, + "step": 1187 + }, + { + "epoch": 0.5385312783318223, + "grad_norm": 1.054289505560474, + "learning_rate": 7.998552877285417e-05, + "loss": 0.9192, + "step": 1188 + }, + { + "epoch": 0.5389845874886673, + "grad_norm": 1.817488217352224, + "learning_rate": 7.998518629159599e-05, + "loss": 0.9387, + "step": 1189 + }, + { + "epoch": 0.5394378966455122, + "grad_norm": 1.162264441379759, + "learning_rate": 7.998483980570389e-05, + "loss": 0.9366, + "step": 1190 + }, + { + "epoch": 0.5398912058023572, + "grad_norm": 1.9368595731406375, + "learning_rate": 7.998448931521257e-05, + "loss": 0.9244, + "step": 1191 + }, + { + "epoch": 0.5403445149592022, + "grad_norm": 1.1519053366737986, + "learning_rate": 7.998413482015714e-05, + "loss": 0.9509, + "step": 1192 + }, + { + "epoch": 0.5407978241160472, + "grad_norm": 2.133965877373918, + "learning_rate": 7.998377632057308e-05, + "loss": 0.9249, + "step": 1193 + }, + { + "epoch": 0.5412511332728921, + "grad_norm": 1.7186559651137223, + "learning_rate": 7.998341381649634e-05, + "loss": 0.926, + "step": 1194 + }, + { + "epoch": 0.541704442429737, + "grad_norm": 1.3636483375018569, + "learning_rate": 7.998304730796318e-05, + "loss": 0.9326, + "step": 1195 + }, + { + "epoch": 0.5421577515865821, + "grad_norm": 1.4906512702901547, + "learning_rate": 7.998267679501031e-05, + "loss": 0.9365, + "step": 1196 + }, + { + "epoch": 0.542611060743427, + "grad_norm": 1.2461624683024761, + "learning_rate": 7.998230227767486e-05, + "loss": 0.929, + "step": 1197 + }, + { + "epoch": 0.543064369900272, + "grad_norm": 1.4783227903279204, + "learning_rate": 7.998192375599435e-05, + "loss": 0.9255, + "step": 1198 + }, + { + "epoch": 0.543517679057117, + "grad_norm": 1.2780109133170123, + "learning_rate": 7.998154123000666e-05, + "loss": 0.9361, + "step": 1199 + }, + { + "epoch": 0.543970988213962, + "grad_norm": 1.452412540590342, + "learning_rate": 7.998115469975011e-05, + "loss": 0.9511, + "step": 1200 + }, + { + "epoch": 0.5444242973708069, + "grad_norm": 1.1088623461061695, + "learning_rate": 7.998076416526342e-05, + "loss": 0.963, + "step": 1201 + }, + { + "epoch": 0.5448776065276518, + "grad_norm": 1.6431126138045689, + "learning_rate": 7.998036962658569e-05, + "loss": 0.9382, + "step": 1202 + }, + { + "epoch": 0.5453309156844969, + "grad_norm": 0.9347390343264433, + "learning_rate": 7.997997108375644e-05, + "loss": 0.9279, + "step": 1203 + }, + { + "epoch": 0.5457842248413418, + "grad_norm": 1.7019736890586254, + "learning_rate": 7.99795685368156e-05, + "loss": 0.9393, + "step": 1204 + }, + { + "epoch": 0.5462375339981868, + "grad_norm": 1.3404063688894605, + "learning_rate": 7.997916198580346e-05, + "loss": 0.901, + "step": 1205 + }, + { + "epoch": 0.5466908431550317, + "grad_norm": 1.360721351867984, + "learning_rate": 7.997875143076075e-05, + "loss": 0.9416, + "step": 1206 + }, + { + "epoch": 0.5471441523118767, + "grad_norm": 1.151947628961951, + "learning_rate": 7.99783368717286e-05, + "loss": 0.949, + "step": 1207 + }, + { + "epoch": 0.5475974614687217, + "grad_norm": 1.5328976542994077, + "learning_rate": 7.997791830874851e-05, + "loss": 0.9304, + "step": 1208 + }, + { + "epoch": 0.5480507706255666, + "grad_norm": 2.661320412141175, + "learning_rate": 7.99774957418624e-05, + "loss": 0.981, + "step": 1209 + }, + { + "epoch": 0.5485040797824116, + "grad_norm": 2.093724040810777, + "learning_rate": 7.997706917111263e-05, + "loss": 0.9455, + "step": 1210 + }, + { + "epoch": 0.5489573889392566, + "grad_norm": 3.5408644929709077, + "learning_rate": 7.997663859654188e-05, + "loss": 0.9449, + "step": 1211 + }, + { + "epoch": 0.5494106980961015, + "grad_norm": 2.4651567078238648, + "learning_rate": 7.997620401819327e-05, + "loss": 0.9304, + "step": 1212 + }, + { + "epoch": 0.5498640072529465, + "grad_norm": 458.4225741727906, + "learning_rate": 7.997576543611035e-05, + "loss": 5.2168, + "step": 1213 + }, + { + "epoch": 0.5503173164097914, + "grad_norm": 313.7124630810254, + "learning_rate": 7.997532285033703e-05, + "loss": 6.9433, + "step": 1214 + }, + { + "epoch": 0.5507706255666365, + "grad_norm": 6.360882298827062, + "learning_rate": 7.997487626091764e-05, + "loss": 1.0227, + "step": 1215 + }, + { + "epoch": 0.5512239347234814, + "grad_norm": 3.158002357006737, + "learning_rate": 7.997442566789692e-05, + "loss": 0.9592, + "step": 1216 + }, + { + "epoch": 0.5516772438803264, + "grad_norm": 3.692694630644349, + "learning_rate": 7.997397107131998e-05, + "loss": 0.9621, + "step": 1217 + }, + { + "epoch": 0.5521305530371714, + "grad_norm": 4.965938920823364, + "learning_rate": 7.997351247123235e-05, + "loss": 0.9933, + "step": 1218 + }, + { + "epoch": 0.5525838621940163, + "grad_norm": 8.78811334609886, + "learning_rate": 7.997304986767997e-05, + "loss": 1.2122, + "step": 1219 + }, + { + "epoch": 0.5530371713508613, + "grad_norm": 92.38026935671674, + "learning_rate": 7.997258326070917e-05, + "loss": 2.2687, + "step": 1220 + }, + { + "epoch": 0.5534904805077062, + "grad_norm": 81.58266715608298, + "learning_rate": 7.997211265036668e-05, + "loss": 1.5842, + "step": 1221 + }, + { + "epoch": 0.5539437896645513, + "grad_norm": 11.537055091633448, + "learning_rate": 7.997163803669964e-05, + "loss": 1.3824, + "step": 1222 + }, + { + "epoch": 0.5543970988213962, + "grad_norm": 7.456925964856799, + "learning_rate": 7.997115941975556e-05, + "loss": 1.154, + "step": 1223 + }, + { + "epoch": 0.5548504079782411, + "grad_norm": 6.303112625292072, + "learning_rate": 7.99706767995824e-05, + "loss": 1.1473, + "step": 1224 + }, + { + "epoch": 0.5553037171350861, + "grad_norm": 3.3845628152008422, + "learning_rate": 7.997019017622848e-05, + "loss": 1.0683, + "step": 1225 + }, + { + "epoch": 0.5557570262919311, + "grad_norm": 1.7646647907574375, + "learning_rate": 7.996969954974255e-05, + "loss": 1.0392, + "step": 1226 + }, + { + "epoch": 0.5562103354487761, + "grad_norm": 4.115075438053526, + "learning_rate": 7.996920492017373e-05, + "loss": 1.0611, + "step": 1227 + }, + { + "epoch": 0.556663644605621, + "grad_norm": 3.3623463084528726, + "learning_rate": 7.996870628757159e-05, + "loss": 1.1618, + "step": 1228 + }, + { + "epoch": 0.557116953762466, + "grad_norm": 48.15532672489662, + "learning_rate": 7.996820365198603e-05, + "loss": 2.1243, + "step": 1229 + }, + { + "epoch": 0.557570262919311, + "grad_norm": 21.134767678430123, + "learning_rate": 7.996769701346741e-05, + "loss": 1.5914, + "step": 1230 + }, + { + "epoch": 0.5580235720761559, + "grad_norm": 75.78279999219518, + "learning_rate": 7.996718637206649e-05, + "loss": 5.1073, + "step": 1231 + }, + { + "epoch": 0.5584768812330009, + "grad_norm": 126.13380963663259, + "learning_rate": 7.996667172783438e-05, + "loss": 6.331, + "step": 1232 + }, + { + "epoch": 0.5589301903898459, + "grad_norm": 51.96350051580755, + "learning_rate": 7.996615308082265e-05, + "loss": 1.5288, + "step": 1233 + }, + { + "epoch": 0.5593834995466909, + "grad_norm": 28.80271282637818, + "learning_rate": 7.996563043108322e-05, + "loss": 2.1403, + "step": 1234 + }, + { + "epoch": 0.5598368087035358, + "grad_norm": 97.37830535250542, + "learning_rate": 7.996510377866844e-05, + "loss": 2.3812, + "step": 1235 + }, + { + "epoch": 0.5602901178603807, + "grad_norm": 49.158381903799615, + "learning_rate": 7.996457312363107e-05, + "loss": 3.1171, + "step": 1236 + }, + { + "epoch": 0.5607434270172258, + "grad_norm": 12.38216600435011, + "learning_rate": 7.996403846602424e-05, + "loss": 1.8746, + "step": 1237 + }, + { + "epoch": 0.5611967361740707, + "grad_norm": 4.324586692507766, + "learning_rate": 7.99634998059015e-05, + "loss": 1.4435, + "step": 1238 + }, + { + "epoch": 0.5616500453309157, + "grad_norm": 3.3462980222683325, + "learning_rate": 7.996295714331683e-05, + "loss": 1.4027, + "step": 1239 + }, + { + "epoch": 0.5621033544877606, + "grad_norm": 3.605305558795563, + "learning_rate": 7.996241047832453e-05, + "loss": 1.3149, + "step": 1240 + }, + { + "epoch": 0.5625566636446057, + "grad_norm": 4.471247860157743, + "learning_rate": 7.996185981097939e-05, + "loss": 1.2441, + "step": 1241 + }, + { + "epoch": 0.5630099728014506, + "grad_norm": 2.885832077838531, + "learning_rate": 7.996130514133653e-05, + "loss": 1.2062, + "step": 1242 + }, + { + "epoch": 0.5634632819582955, + "grad_norm": 2.6359807712590837, + "learning_rate": 7.996074646945152e-05, + "loss": 1.1397, + "step": 1243 + }, + { + "epoch": 0.5639165911151405, + "grad_norm": 2.7926486576159095, + "learning_rate": 7.996018379538031e-05, + "loss": 1.0891, + "step": 1244 + }, + { + "epoch": 0.5643699002719855, + "grad_norm": 2.5005195936382316, + "learning_rate": 7.995961711917926e-05, + "loss": 1.1666, + "step": 1245 + }, + { + "epoch": 0.5648232094288305, + "grad_norm": 2.584819171582866, + "learning_rate": 7.99590464409051e-05, + "loss": 1.1488, + "step": 1246 + }, + { + "epoch": 0.5652765185856754, + "grad_norm": 61.156546306424815, + "learning_rate": 7.995847176061501e-05, + "loss": 1.5666, + "step": 1247 + }, + { + "epoch": 0.5657298277425205, + "grad_norm": 3.021898463670056, + "learning_rate": 7.995789307836653e-05, + "loss": 1.1849, + "step": 1248 + }, + { + "epoch": 0.5661831368993654, + "grad_norm": 1.6544180970863034, + "learning_rate": 7.995731039421763e-05, + "loss": 1.1253, + "step": 1249 + }, + { + "epoch": 0.5666364460562103, + "grad_norm": 2.6431154418775753, + "learning_rate": 7.995672370822667e-05, + "loss": 1.1008, + "step": 1250 + }, + { + "epoch": 0.5670897552130553, + "grad_norm": 2.200698773121035, + "learning_rate": 7.995613302045239e-05, + "loss": 1.1226, + "step": 1251 + }, + { + "epoch": 0.5675430643699003, + "grad_norm": 26.565601334752664, + "learning_rate": 7.995553833095397e-05, + "loss": 1.3236, + "step": 1252 + }, + { + "epoch": 0.5679963735267453, + "grad_norm": 6.4679922883095236, + "learning_rate": 7.995493963979094e-05, + "loss": 1.2446, + "step": 1253 + }, + { + "epoch": 0.5684496826835902, + "grad_norm": 3.493636257874444, + "learning_rate": 7.995433694702329e-05, + "loss": 1.1342, + "step": 1254 + }, + { + "epoch": 0.5689029918404351, + "grad_norm": 1.675744974231861, + "learning_rate": 7.995373025271138e-05, + "loss": 1.0905, + "step": 1255 + }, + { + "epoch": 0.5693563009972802, + "grad_norm": 1.8046539521240326, + "learning_rate": 7.995311955691595e-05, + "loss": 1.0564, + "step": 1256 + }, + { + "epoch": 0.5698096101541251, + "grad_norm": 1.8734850415748778, + "learning_rate": 7.995250485969818e-05, + "loss": 1.0821, + "step": 1257 + }, + { + "epoch": 0.5702629193109701, + "grad_norm": 1.9209030292315845, + "learning_rate": 7.995188616111963e-05, + "loss": 1.1502, + "step": 1258 + }, + { + "epoch": 0.570716228467815, + "grad_norm": 1.5317055867851805, + "learning_rate": 7.995126346124226e-05, + "loss": 1.0637, + "step": 1259 + }, + { + "epoch": 0.57116953762466, + "grad_norm": 1.5788066546923587, + "learning_rate": 7.995063676012845e-05, + "loss": 1.0521, + "step": 1260 + }, + { + "epoch": 0.571622846781505, + "grad_norm": 1.8816817566078492, + "learning_rate": 7.995000605784095e-05, + "loss": 1.0306, + "step": 1261 + }, + { + "epoch": 0.5720761559383499, + "grad_norm": 2.884448257782959, + "learning_rate": 7.994937135444293e-05, + "loss": 1.0761, + "step": 1262 + }, + { + "epoch": 0.572529465095195, + "grad_norm": 1.8975965334554525, + "learning_rate": 7.994873264999798e-05, + "loss": 1.0934, + "step": 1263 + }, + { + "epoch": 0.5729827742520399, + "grad_norm": 2.233408106192115, + "learning_rate": 7.994808994457002e-05, + "loss": 1.0733, + "step": 1264 + }, + { + "epoch": 0.5734360834088849, + "grad_norm": 1.5195251817120963, + "learning_rate": 7.994744323822347e-05, + "loss": 1.0589, + "step": 1265 + }, + { + "epoch": 0.5738893925657298, + "grad_norm": 1.1418443348878775, + "learning_rate": 7.994679253102305e-05, + "loss": 1.0423, + "step": 1266 + }, + { + "epoch": 0.5743427017225748, + "grad_norm": 1.6079964726642064, + "learning_rate": 7.994613782303397e-05, + "loss": 1.0534, + "step": 1267 + }, + { + "epoch": 0.5747960108794198, + "grad_norm": 1.7167391757859043, + "learning_rate": 7.994547911432179e-05, + "loss": 1.0216, + "step": 1268 + }, + { + "epoch": 0.5752493200362647, + "grad_norm": 1.953630930445719, + "learning_rate": 7.994481640495248e-05, + "loss": 1.0546, + "step": 1269 + }, + { + "epoch": 0.5757026291931097, + "grad_norm": 1.634860135400939, + "learning_rate": 7.994414969499241e-05, + "loss": 1.0288, + "step": 1270 + }, + { + "epoch": 0.5761559383499547, + "grad_norm": 1.4514753665306004, + "learning_rate": 7.994347898450835e-05, + "loss": 1.0309, + "step": 1271 + }, + { + "epoch": 0.5766092475067996, + "grad_norm": 2.0419521742444844, + "learning_rate": 7.994280427356748e-05, + "loss": 1.001, + "step": 1272 + }, + { + "epoch": 0.5770625566636446, + "grad_norm": 1.8733591217118115, + "learning_rate": 7.994212556223737e-05, + "loss": 1.0026, + "step": 1273 + }, + { + "epoch": 0.5775158658204895, + "grad_norm": 1.6836158781503174, + "learning_rate": 7.994144285058601e-05, + "loss": 1.0068, + "step": 1274 + }, + { + "epoch": 0.5779691749773346, + "grad_norm": 1.1864778050994311, + "learning_rate": 7.994075613868175e-05, + "loss": 1.0058, + "step": 1275 + }, + { + "epoch": 0.5784224841341795, + "grad_norm": 1.8067729507238013, + "learning_rate": 7.994006542659338e-05, + "loss": 0.9946, + "step": 1276 + }, + { + "epoch": 0.5788757932910245, + "grad_norm": 1.5073763024346656, + "learning_rate": 7.993937071439009e-05, + "loss": 0.9928, + "step": 1277 + }, + { + "epoch": 0.5793291024478694, + "grad_norm": 1.927566117439918, + "learning_rate": 7.993867200214143e-05, + "loss": 0.9681, + "step": 1278 + }, + { + "epoch": 0.5797824116047144, + "grad_norm": 1.8000031188328407, + "learning_rate": 7.993796928991739e-05, + "loss": 0.9995, + "step": 1279 + }, + { + "epoch": 0.5802357207615594, + "grad_norm": 1.907114469392778, + "learning_rate": 7.993726257778835e-05, + "loss": 0.9595, + "step": 1280 + }, + { + "epoch": 0.5806890299184043, + "grad_norm": 1.808329208408959, + "learning_rate": 7.99365518658251e-05, + "loss": 0.9383, + "step": 1281 + }, + { + "epoch": 0.5811423390752494, + "grad_norm": 1.6880834866555432, + "learning_rate": 7.99358371540988e-05, + "loss": 0.9799, + "step": 1282 + }, + { + "epoch": 0.5815956482320943, + "grad_norm": 1.2229675518457344, + "learning_rate": 7.993511844268104e-05, + "loss": 0.9725, + "step": 1283 + }, + { + "epoch": 0.5820489573889392, + "grad_norm": 1.5338957692342234, + "learning_rate": 7.99343957316438e-05, + "loss": 0.9913, + "step": 1284 + }, + { + "epoch": 0.5825022665457842, + "grad_norm": 1.1766546562858549, + "learning_rate": 7.993366902105947e-05, + "loss": 0.9664, + "step": 1285 + }, + { + "epoch": 0.5829555757026292, + "grad_norm": 1.2346382879692552, + "learning_rate": 7.993293831100082e-05, + "loss": 0.9574, + "step": 1286 + }, + { + "epoch": 0.5834088848594742, + "grad_norm": 1.6965676322990313, + "learning_rate": 7.993220360154104e-05, + "loss": 0.9602, + "step": 1287 + }, + { + "epoch": 0.5838621940163191, + "grad_norm": 1.918588837761549, + "learning_rate": 7.99314648927537e-05, + "loss": 0.9952, + "step": 1288 + }, + { + "epoch": 0.584315503173164, + "grad_norm": 1.621675240384828, + "learning_rate": 7.99307221847128e-05, + "loss": 0.9499, + "step": 1289 + }, + { + "epoch": 0.5847688123300091, + "grad_norm": 1.3984305779589283, + "learning_rate": 7.992997547749273e-05, + "loss": 0.9474, + "step": 1290 + }, + { + "epoch": 0.585222121486854, + "grad_norm": 2.838204096053592, + "learning_rate": 7.992922477116824e-05, + "loss": 0.9669, + "step": 1291 + }, + { + "epoch": 0.585675430643699, + "grad_norm": 2.011885667719017, + "learning_rate": 7.992847006581456e-05, + "loss": 0.9472, + "step": 1292 + }, + { + "epoch": 0.5861287398005439, + "grad_norm": 2.8116067743456004, + "learning_rate": 7.992771136150725e-05, + "loss": 0.9471, + "step": 1293 + }, + { + "epoch": 0.586582048957389, + "grad_norm": 2.3974385914188003, + "learning_rate": 7.99269486583223e-05, + "loss": 0.9646, + "step": 1294 + }, + { + "epoch": 0.5870353581142339, + "grad_norm": 2.588487426700054, + "learning_rate": 7.99261819563361e-05, + "loss": 0.9845, + "step": 1295 + }, + { + "epoch": 0.5874886672710788, + "grad_norm": 1.9946409335867252, + "learning_rate": 7.992541125562544e-05, + "loss": 0.9658, + "step": 1296 + }, + { + "epoch": 0.5879419764279239, + "grad_norm": 2.266049837869208, + "learning_rate": 7.992463655626751e-05, + "loss": 0.9722, + "step": 1297 + }, + { + "epoch": 0.5883952855847688, + "grad_norm": 1.5701754684562426, + "learning_rate": 7.992385785833988e-05, + "loss": 0.9642, + "step": 1298 + }, + { + "epoch": 0.5888485947416138, + "grad_norm": 2.202067463055082, + "learning_rate": 7.992307516192055e-05, + "loss": 0.969, + "step": 1299 + }, + { + "epoch": 0.5893019038984587, + "grad_norm": 1.8095347433948126, + "learning_rate": 7.992228846708792e-05, + "loss": 0.9603, + "step": 1300 + }, + { + "epoch": 0.5897552130553038, + "grad_norm": 1.6509282478582363, + "learning_rate": 7.992149777392077e-05, + "loss": 0.973, + "step": 1301 + }, + { + "epoch": 0.5902085222121487, + "grad_norm": 2.2915044849714423, + "learning_rate": 7.992070308249828e-05, + "loss": 0.9433, + "step": 1302 + }, + { + "epoch": 0.5906618313689936, + "grad_norm": 1.2766915249749922, + "learning_rate": 7.991990439290005e-05, + "loss": 0.9688, + "step": 1303 + }, + { + "epoch": 0.5911151405258386, + "grad_norm": 3.2523074456173195, + "learning_rate": 7.991910170520608e-05, + "loss": 0.9348, + "step": 1304 + }, + { + "epoch": 0.5915684496826836, + "grad_norm": 2.7062886295586894, + "learning_rate": 7.991829501949676e-05, + "loss": 0.9717, + "step": 1305 + }, + { + "epoch": 0.5920217588395286, + "grad_norm": 2.3460593100765474, + "learning_rate": 7.991748433585288e-05, + "loss": 0.9519, + "step": 1306 + }, + { + "epoch": 0.5924750679963735, + "grad_norm": 2.050284758739638, + "learning_rate": 7.991666965435562e-05, + "loss": 0.9528, + "step": 1307 + }, + { + "epoch": 0.5929283771532184, + "grad_norm": 2.3173908452231053, + "learning_rate": 7.991585097508658e-05, + "loss": 0.968, + "step": 1308 + }, + { + "epoch": 0.5933816863100635, + "grad_norm": 1.664965170663303, + "learning_rate": 7.991502829812775e-05, + "loss": 0.9529, + "step": 1309 + }, + { + "epoch": 0.5938349954669084, + "grad_norm": 2.8134508013011033, + "learning_rate": 7.991420162356154e-05, + "loss": 0.9718, + "step": 1310 + }, + { + "epoch": 0.5942883046237534, + "grad_norm": 2.0525387646376596, + "learning_rate": 7.991337095147072e-05, + "loss": 0.9534, + "step": 1311 + }, + { + "epoch": 0.5947416137805984, + "grad_norm": 2.8079330954088935, + "learning_rate": 7.99125362819385e-05, + "loss": 0.9752, + "step": 1312 + }, + { + "epoch": 0.5951949229374434, + "grad_norm": 2.557892818975597, + "learning_rate": 7.991169761504847e-05, + "loss": 0.9403, + "step": 1313 + }, + { + "epoch": 0.5956482320942883, + "grad_norm": 2.12967401460863, + "learning_rate": 7.991085495088464e-05, + "loss": 0.9375, + "step": 1314 + }, + { + "epoch": 0.5961015412511332, + "grad_norm": 1.9215189073485524, + "learning_rate": 7.991000828953137e-05, + "loss": 0.951, + "step": 1315 + }, + { + "epoch": 0.5965548504079783, + "grad_norm": 2.3037538539618345, + "learning_rate": 7.990915763107347e-05, + "loss": 0.9353, + "step": 1316 + }, + { + "epoch": 0.5970081595648232, + "grad_norm": 1.805875867735959, + "learning_rate": 7.990830297559617e-05, + "loss": 0.9474, + "step": 1317 + }, + { + "epoch": 0.5974614687216682, + "grad_norm": 2.8327858032680733, + "learning_rate": 7.990744432318502e-05, + "loss": 0.9417, + "step": 1318 + }, + { + "epoch": 0.5979147778785131, + "grad_norm": 2.4619755996781345, + "learning_rate": 7.990658167392604e-05, + "loss": 0.9295, + "step": 1319 + }, + { + "epoch": 0.5983680870353582, + "grad_norm": 2.186343817657232, + "learning_rate": 7.990571502790563e-05, + "loss": 0.9544, + "step": 1320 + }, + { + "epoch": 0.5988213961922031, + "grad_norm": 2.1313860980545902, + "learning_rate": 7.990484438521057e-05, + "loss": 0.9239, + "step": 1321 + }, + { + "epoch": 0.599274705349048, + "grad_norm": 2.0511770893164254, + "learning_rate": 7.990396974592807e-05, + "loss": 0.9394, + "step": 1322 + }, + { + "epoch": 0.599728014505893, + "grad_norm": 1.7728446426029978, + "learning_rate": 7.990309111014572e-05, + "loss": 0.9418, + "step": 1323 + }, + { + "epoch": 0.600181323662738, + "grad_norm": 2.526656642597573, + "learning_rate": 7.990220847795153e-05, + "loss": 0.9463, + "step": 1324 + }, + { + "epoch": 0.600634632819583, + "grad_norm": 2.303369493056542, + "learning_rate": 7.990132184943388e-05, + "loss": 0.9578, + "step": 1325 + }, + { + "epoch": 0.6010879419764279, + "grad_norm": 1.9275862722963868, + "learning_rate": 7.990043122468159e-05, + "loss": 0.9241, + "step": 1326 + }, + { + "epoch": 0.601541251133273, + "grad_norm": 1.6665595644686135, + "learning_rate": 7.989953660378383e-05, + "loss": 0.9378, + "step": 1327 + }, + { + "epoch": 0.6019945602901179, + "grad_norm": 2.3225013070577667, + "learning_rate": 7.989863798683024e-05, + "loss": 0.9304, + "step": 1328 + }, + { + "epoch": 0.6024478694469628, + "grad_norm": 1.915796748216751, + "learning_rate": 7.989773537391077e-05, + "loss": 0.9569, + "step": 1329 + }, + { + "epoch": 0.6029011786038078, + "grad_norm": 2.5688479199689462, + "learning_rate": 7.989682876511588e-05, + "loss": 0.934, + "step": 1330 + }, + { + "epoch": 0.6033544877606528, + "grad_norm": 2.329107561217384, + "learning_rate": 7.989591816053631e-05, + "loss": 0.9347, + "step": 1331 + }, + { + "epoch": 0.6038077969174978, + "grad_norm": 1.7195539103111102, + "learning_rate": 7.989500356026328e-05, + "loss": 0.9449, + "step": 1332 + }, + { + "epoch": 0.6042611060743427, + "grad_norm": 1.406267482256429, + "learning_rate": 7.989408496438841e-05, + "loss": 0.9568, + "step": 1333 + }, + { + "epoch": 0.6047144152311876, + "grad_norm": 2.6848622742825228, + "learning_rate": 7.989316237300368e-05, + "loss": 0.9483, + "step": 1334 + }, + { + "epoch": 0.6051677243880327, + "grad_norm": 2.162609365332896, + "learning_rate": 7.989223578620149e-05, + "loss": 0.9599, + "step": 1335 + }, + { + "epoch": 0.6056210335448776, + "grad_norm": 2.0175932601545865, + "learning_rate": 7.989130520407464e-05, + "loss": 0.9449, + "step": 1336 + }, + { + "epoch": 0.6060743427017226, + "grad_norm": 1.779589988335226, + "learning_rate": 7.989037062671634e-05, + "loss": 0.9265, + "step": 1337 + }, + { + "epoch": 0.6065276518585675, + "grad_norm": 2.1826721080643052, + "learning_rate": 7.988943205422018e-05, + "loss": 0.9348, + "step": 1338 + }, + { + "epoch": 0.6069809610154125, + "grad_norm": 1.717886833298261, + "learning_rate": 7.988848948668018e-05, + "loss": 0.9413, + "step": 1339 + }, + { + "epoch": 0.6074342701722575, + "grad_norm": 2.2886083528021426, + "learning_rate": 7.988754292419073e-05, + "loss": 0.9215, + "step": 1340 + }, + { + "epoch": 0.6078875793291024, + "grad_norm": 1.9866475715961616, + "learning_rate": 7.988659236684662e-05, + "loss": 0.9438, + "step": 1341 + }, + { + "epoch": 0.6083408884859474, + "grad_norm": 2.0295994444821113, + "learning_rate": 7.988563781474305e-05, + "loss": 0.9531, + "step": 1342 + }, + { + "epoch": 0.6087941976427924, + "grad_norm": 1.5789802353855236, + "learning_rate": 7.988467926797566e-05, + "loss": 0.9425, + "step": 1343 + }, + { + "epoch": 0.6092475067996374, + "grad_norm": 2.3128570759172025, + "learning_rate": 7.988371672664039e-05, + "loss": 0.9562, + "step": 1344 + }, + { + "epoch": 0.6097008159564823, + "grad_norm": 1.7763972510444845, + "learning_rate": 7.98827501908337e-05, + "loss": 0.9389, + "step": 1345 + }, + { + "epoch": 0.6101541251133273, + "grad_norm": 2.359117774790098, + "learning_rate": 7.988177966065235e-05, + "loss": 0.9398, + "step": 1346 + }, + { + "epoch": 0.6106074342701723, + "grad_norm": 1.9886390515000991, + "learning_rate": 7.988080513619356e-05, + "loss": 0.9358, + "step": 1347 + }, + { + "epoch": 0.6110607434270172, + "grad_norm": 2.22824617226276, + "learning_rate": 7.987982661755492e-05, + "loss": 0.9424, + "step": 1348 + }, + { + "epoch": 0.6115140525838622, + "grad_norm": 1.906587053746284, + "learning_rate": 7.987884410483446e-05, + "loss": 0.9463, + "step": 1349 + }, + { + "epoch": 0.6119673617407072, + "grad_norm": 2.1392049623450737, + "learning_rate": 7.987785759813055e-05, + "loss": 0.9347, + "step": 1350 + }, + { + "epoch": 0.6124206708975521, + "grad_norm": 1.8769844890974132, + "learning_rate": 7.9876867097542e-05, + "loss": 0.943, + "step": 1351 + }, + { + "epoch": 0.6128739800543971, + "grad_norm": 2.1337407439509306, + "learning_rate": 7.987587260316802e-05, + "loss": 0.9289, + "step": 1352 + }, + { + "epoch": 0.613327289211242, + "grad_norm": 1.7303312413707148, + "learning_rate": 7.987487411510819e-05, + "loss": 0.9369, + "step": 1353 + }, + { + "epoch": 0.6137805983680871, + "grad_norm": 2.2776315754828045, + "learning_rate": 7.987387163346255e-05, + "loss": 0.9286, + "step": 1354 + }, + { + "epoch": 0.614233907524932, + "grad_norm": 1.9913971121024345, + "learning_rate": 7.987286515833146e-05, + "loss": 0.9322, + "step": 1355 + }, + { + "epoch": 0.614687216681777, + "grad_norm": 1.9592980400938038, + "learning_rate": 7.987185468981574e-05, + "loss": 0.9151, + "step": 1356 + }, + { + "epoch": 0.6151405258386219, + "grad_norm": 1.7373336694492125, + "learning_rate": 7.98708402280166e-05, + "loss": 0.9459, + "step": 1357 + }, + { + "epoch": 0.6155938349954669, + "grad_norm": 2.089939243807353, + "learning_rate": 7.986982177303564e-05, + "loss": 0.9236, + "step": 1358 + }, + { + "epoch": 0.6160471441523119, + "grad_norm": 1.7025325541452467, + "learning_rate": 7.986879932497485e-05, + "loss": 0.9217, + "step": 1359 + }, + { + "epoch": 0.6165004533091568, + "grad_norm": 2.3096298590643203, + "learning_rate": 7.986777288393663e-05, + "loss": 0.9287, + "step": 1360 + }, + { + "epoch": 0.6169537624660019, + "grad_norm": 2.0271215119783506, + "learning_rate": 7.98667424500238e-05, + "loss": 0.9447, + "step": 1361 + }, + { + "epoch": 0.6174070716228468, + "grad_norm": 1.8882906808108777, + "learning_rate": 7.986570802333954e-05, + "loss": 0.9269, + "step": 1362 + }, + { + "epoch": 0.6178603807796917, + "grad_norm": 1.610086784862466, + "learning_rate": 7.986466960398744e-05, + "loss": 0.9241, + "step": 1363 + }, + { + "epoch": 0.6183136899365367, + "grad_norm": 2.1475935050972903, + "learning_rate": 7.986362719207153e-05, + "loss": 0.9232, + "step": 1364 + }, + { + "epoch": 0.6187669990933817, + "grad_norm": 1.588744859714186, + "learning_rate": 7.98625807876962e-05, + "loss": 0.9199, + "step": 1365 + }, + { + "epoch": 0.6192203082502267, + "grad_norm": 2.352295992110243, + "learning_rate": 7.986153039096625e-05, + "loss": 0.9317, + "step": 1366 + }, + { + "epoch": 0.6196736174070716, + "grad_norm": 2.0995241256689825, + "learning_rate": 7.98604760019869e-05, + "loss": 0.9327, + "step": 1367 + }, + { + "epoch": 0.6201269265639165, + "grad_norm": 1.8197922833989448, + "learning_rate": 7.985941762086371e-05, + "loss": 0.9342, + "step": 1368 + }, + { + "epoch": 0.6205802357207616, + "grad_norm": 2.0819771219754437, + "learning_rate": 7.985835524770271e-05, + "loss": 0.9953, + "step": 1369 + }, + { + "epoch": 0.6210335448776065, + "grad_norm": 1.2857633386561893, + "learning_rate": 7.985728888261027e-05, + "loss": 0.9169, + "step": 1370 + }, + { + "epoch": 0.6214868540344515, + "grad_norm": 1.2483107286995725, + "learning_rate": 7.985621852569323e-05, + "loss": 0.9301, + "step": 1371 + }, + { + "epoch": 0.6219401631912964, + "grad_norm": 0.89864803664589, + "learning_rate": 7.985514417705877e-05, + "loss": 0.9325, + "step": 1372 + }, + { + "epoch": 0.6223934723481415, + "grad_norm": 1.09216750285962, + "learning_rate": 7.985406583681449e-05, + "loss": 0.9302, + "step": 1373 + }, + { + "epoch": 0.6228467815049864, + "grad_norm": 1.292905323669219, + "learning_rate": 7.985298350506837e-05, + "loss": 0.9301, + "step": 1374 + }, + { + "epoch": 0.6233000906618313, + "grad_norm": 1.1108136847149448, + "learning_rate": 7.985189718192884e-05, + "loss": 0.9462, + "step": 1375 + }, + { + "epoch": 0.6237533998186764, + "grad_norm": 1.1866522145441745, + "learning_rate": 7.985080686750468e-05, + "loss": 0.9292, + "step": 1376 + }, + { + "epoch": 0.6242067089755213, + "grad_norm": 1.4535054206120084, + "learning_rate": 7.98497125619051e-05, + "loss": 0.9456, + "step": 1377 + }, + { + "epoch": 0.6246600181323663, + "grad_norm": 1.5542611638723598, + "learning_rate": 7.984861426523968e-05, + "loss": 0.956, + "step": 1378 + }, + { + "epoch": 0.6251133272892112, + "grad_norm": 0.6866864281773077, + "learning_rate": 7.984751197761845e-05, + "loss": 0.9383, + "step": 1379 + }, + { + "epoch": 0.6255666364460563, + "grad_norm": 1.6741905060897702, + "learning_rate": 7.984640569915176e-05, + "loss": 0.9312, + "step": 1380 + }, + { + "epoch": 0.6260199456029012, + "grad_norm": 1.311452940152574, + "learning_rate": 7.984529542995043e-05, + "loss": 0.9386, + "step": 1381 + }, + { + "epoch": 0.6264732547597461, + "grad_norm": 1.1132470777439514, + "learning_rate": 7.984418117012568e-05, + "loss": 0.9467, + "step": 1382 + }, + { + "epoch": 0.6269265639165911, + "grad_norm": 1.264760185545391, + "learning_rate": 7.984306291978908e-05, + "loss": 0.9292, + "step": 1383 + }, + { + "epoch": 0.6273798730734361, + "grad_norm": 1.3272033012507551, + "learning_rate": 7.984194067905263e-05, + "loss": 0.9171, + "step": 1384 + }, + { + "epoch": 0.6278331822302811, + "grad_norm": 1.3557194290938646, + "learning_rate": 7.984081444802872e-05, + "loss": 0.9251, + "step": 1385 + }, + { + "epoch": 0.628286491387126, + "grad_norm": 1.0980682868674414, + "learning_rate": 7.983968422683018e-05, + "loss": 0.9239, + "step": 1386 + }, + { + "epoch": 0.6287398005439709, + "grad_norm": 1.5934923601302393, + "learning_rate": 7.983855001557015e-05, + "loss": 0.9318, + "step": 1387 + }, + { + "epoch": 0.629193109700816, + "grad_norm": 0.7844962092886303, + "learning_rate": 7.983741181436225e-05, + "loss": 0.9413, + "step": 1388 + }, + { + "epoch": 0.6296464188576609, + "grad_norm": 1.3638088264805168, + "learning_rate": 7.98362696233205e-05, + "loss": 0.9567, + "step": 1389 + }, + { + "epoch": 0.6300997280145059, + "grad_norm": 1.139579570245386, + "learning_rate": 7.983512344255925e-05, + "loss": 0.9267, + "step": 1390 + }, + { + "epoch": 0.6305530371713509, + "grad_norm": 1.3610140489641078, + "learning_rate": 7.983397327219333e-05, + "loss": 0.9267, + "step": 1391 + }, + { + "epoch": 0.6310063463281959, + "grad_norm": 0.9320054369917435, + "learning_rate": 7.983281911233791e-05, + "loss": 0.9409, + "step": 1392 + }, + { + "epoch": 0.6314596554850408, + "grad_norm": 1.499860486745327, + "learning_rate": 7.983166096310859e-05, + "loss": 0.9552, + "step": 1393 + }, + { + "epoch": 0.6319129646418857, + "grad_norm": 1.1558453692058146, + "learning_rate": 7.983049882462135e-05, + "loss": 0.9162, + "step": 1394 + }, + { + "epoch": 0.6323662737987308, + "grad_norm": 1.3403899864824378, + "learning_rate": 7.98293326969926e-05, + "loss": 0.938, + "step": 1395 + }, + { + "epoch": 0.6328195829555757, + "grad_norm": 1.1588261461086737, + "learning_rate": 7.982816258033913e-05, + "loss": 0.9193, + "step": 1396 + }, + { + "epoch": 0.6332728921124207, + "grad_norm": 1.2331229247558908, + "learning_rate": 7.982698847477814e-05, + "loss": 0.9341, + "step": 1397 + }, + { + "epoch": 0.6337262012692656, + "grad_norm": 1.2653788870119165, + "learning_rate": 7.982581038042718e-05, + "loss": 0.9427, + "step": 1398 + }, + { + "epoch": 0.6341795104261106, + "grad_norm": 1.2767503401600189, + "learning_rate": 7.982462829740426e-05, + "loss": 0.921, + "step": 1399 + }, + { + "epoch": 0.6346328195829556, + "grad_norm": 1.3302808801174517, + "learning_rate": 7.982344222582779e-05, + "loss": 0.9346, + "step": 1400 + }, + { + "epoch": 0.6350861287398005, + "grad_norm": 0.92780265892514, + "learning_rate": 7.982225216581654e-05, + "loss": 0.9212, + "step": 1401 + }, + { + "epoch": 0.6355394378966455, + "grad_norm": 0.8800335726620268, + "learning_rate": 7.98210581174897e-05, + "loss": 0.9412, + "step": 1402 + }, + { + "epoch": 0.6359927470534905, + "grad_norm": 1.1153276690092782, + "learning_rate": 7.981986008096686e-05, + "loss": 0.9287, + "step": 1403 + }, + { + "epoch": 0.6364460562103355, + "grad_norm": 1.256485378493966, + "learning_rate": 7.9818658056368e-05, + "loss": 0.9164, + "step": 1404 + }, + { + "epoch": 0.6368993653671804, + "grad_norm": 1.2733483486197694, + "learning_rate": 7.98174520438135e-05, + "loss": 0.9435, + "step": 1405 + }, + { + "epoch": 0.6373526745240253, + "grad_norm": 1.3844232381262884, + "learning_rate": 7.981624204342417e-05, + "loss": 0.9215, + "step": 1406 + }, + { + "epoch": 0.6378059836808704, + "grad_norm": 1.201826875599897, + "learning_rate": 7.981502805532118e-05, + "loss": 0.9408, + "step": 1407 + }, + { + "epoch": 0.6382592928377153, + "grad_norm": 1.0322254041761534, + "learning_rate": 7.981381007962612e-05, + "loss": 0.9232, + "step": 1408 + }, + { + "epoch": 0.6387126019945603, + "grad_norm": 1.2185504932525735, + "learning_rate": 7.981258811646095e-05, + "loss": 0.9292, + "step": 1409 + }, + { + "epoch": 0.6391659111514053, + "grad_norm": 0.9029014426623471, + "learning_rate": 7.98113621659481e-05, + "loss": 0.9132, + "step": 1410 + }, + { + "epoch": 0.6396192203082502, + "grad_norm": 1.5644726001312201, + "learning_rate": 7.981013222821031e-05, + "loss": 0.9468, + "step": 1411 + }, + { + "epoch": 0.6400725294650952, + "grad_norm": 1.3038607170684196, + "learning_rate": 7.980889830337077e-05, + "loss": 0.9275, + "step": 1412 + }, + { + "epoch": 0.6405258386219401, + "grad_norm": 1.0410800257093935, + "learning_rate": 7.980766039155309e-05, + "loss": 0.9446, + "step": 1413 + }, + { + "epoch": 0.6409791477787852, + "grad_norm": 2.4141088118177843, + "learning_rate": 7.980641849288122e-05, + "loss": 0.9371, + "step": 1414 + }, + { + "epoch": 0.6414324569356301, + "grad_norm": 1.228897285490401, + "learning_rate": 7.980517260747954e-05, + "loss": 0.944, + "step": 1415 + }, + { + "epoch": 0.641885766092475, + "grad_norm": 2.4479424692656293, + "learning_rate": 7.980392273547285e-05, + "loss": 0.9284, + "step": 1416 + }, + { + "epoch": 0.64233907524932, + "grad_norm": 1.7091327710384099, + "learning_rate": 7.980266887698632e-05, + "loss": 0.9387, + "step": 1417 + }, + { + "epoch": 0.642792384406165, + "grad_norm": 2.3890263096655855, + "learning_rate": 7.980141103214551e-05, + "loss": 0.9221, + "step": 1418 + }, + { + "epoch": 0.64324569356301, + "grad_norm": 1.9193959280156798, + "learning_rate": 7.980014920107642e-05, + "loss": 0.9273, + "step": 1419 + }, + { + "epoch": 0.6436990027198549, + "grad_norm": 2.0164249198085744, + "learning_rate": 7.979888338390542e-05, + "loss": 0.9391, + "step": 1420 + }, + { + "epoch": 0.6441523118766999, + "grad_norm": 1.6119978119957605, + "learning_rate": 7.979761358075926e-05, + "loss": 0.9506, + "step": 1421 + }, + { + "epoch": 0.6446056210335449, + "grad_norm": 1.6736310107523196, + "learning_rate": 7.979633979176517e-05, + "loss": 0.9432, + "step": 1422 + }, + { + "epoch": 0.6450589301903898, + "grad_norm": 1.468712745019073, + "learning_rate": 7.979506201705067e-05, + "loss": 0.9467, + "step": 1423 + }, + { + "epoch": 0.6455122393472348, + "grad_norm": 0.9726777079324235, + "learning_rate": 7.979378025674376e-05, + "loss": 0.9193, + "step": 1424 + }, + { + "epoch": 0.6459655485040798, + "grad_norm": 1.6539041570232453, + "learning_rate": 7.97924945109728e-05, + "loss": 0.9453, + "step": 1425 + }, + { + "epoch": 0.6464188576609248, + "grad_norm": 1.3154314528445687, + "learning_rate": 7.979120477986658e-05, + "loss": 0.9241, + "step": 1426 + }, + { + "epoch": 0.6468721668177697, + "grad_norm": 1.7102290439434462, + "learning_rate": 7.978991106355423e-05, + "loss": 0.9502, + "step": 1427 + }, + { + "epoch": 0.6473254759746147, + "grad_norm": 1.3075561370817466, + "learning_rate": 7.978861336216537e-05, + "loss": 0.9403, + "step": 1428 + }, + { + "epoch": 0.6477787851314597, + "grad_norm": 1.9349112092499678, + "learning_rate": 7.978731167582995e-05, + "loss": 0.9435, + "step": 1429 + }, + { + "epoch": 0.6482320942883046, + "grad_norm": 1.570407751158983, + "learning_rate": 7.97860060046783e-05, + "loss": 0.9217, + "step": 1430 + }, + { + "epoch": 0.6486854034451496, + "grad_norm": 1.421079935982384, + "learning_rate": 7.978469634884125e-05, + "loss": 0.928, + "step": 1431 + }, + { + "epoch": 0.6491387126019945, + "grad_norm": 1.2841587321735604, + "learning_rate": 7.978338270844994e-05, + "loss": 0.94, + "step": 1432 + }, + { + "epoch": 0.6495920217588396, + "grad_norm": 1.3928384119436406, + "learning_rate": 7.97820650836359e-05, + "loss": 0.9398, + "step": 1433 + }, + { + "epoch": 0.6500453309156845, + "grad_norm": 1.3651014096147718, + "learning_rate": 7.978074347453115e-05, + "loss": 0.9246, + "step": 1434 + }, + { + "epoch": 0.6504986400725294, + "grad_norm": 1.2096046758140748, + "learning_rate": 7.977941788126802e-05, + "loss": 0.9657, + "step": 1435 + }, + { + "epoch": 0.6509519492293744, + "grad_norm": 1.1120361246873771, + "learning_rate": 7.977808830397929e-05, + "loss": 0.9461, + "step": 1436 + }, + { + "epoch": 0.6514052583862194, + "grad_norm": 1.2910511666765059, + "learning_rate": 7.977675474279809e-05, + "loss": 0.9219, + "step": 1437 + }, + { + "epoch": 0.6518585675430644, + "grad_norm": 1.1215445136594928, + "learning_rate": 7.977541719785801e-05, + "loss": 0.9141, + "step": 1438 + }, + { + "epoch": 0.6523118766999093, + "grad_norm": 1.6952095828596256, + "learning_rate": 7.9774075669293e-05, + "loss": 0.9366, + "step": 1439 + }, + { + "epoch": 0.6527651858567544, + "grad_norm": 0.9451610027782341, + "learning_rate": 7.977273015723741e-05, + "loss": 0.9203, + "step": 1440 + }, + { + "epoch": 0.6532184950135993, + "grad_norm": 1.3362321758857796, + "learning_rate": 7.977138066182603e-05, + "loss": 0.9232, + "step": 1441 + }, + { + "epoch": 0.6536718041704442, + "grad_norm": 1.0719712929789755, + "learning_rate": 7.977002718319396e-05, + "loss": 0.9324, + "step": 1442 + }, + { + "epoch": 0.6541251133272892, + "grad_norm": 1.3102656091343237, + "learning_rate": 7.976866972147682e-05, + "loss": 0.9207, + "step": 1443 + }, + { + "epoch": 0.6545784224841342, + "grad_norm": 1.396227255395904, + "learning_rate": 7.97673082768105e-05, + "loss": 0.9233, + "step": 1444 + }, + { + "epoch": 0.6550317316409792, + "grad_norm": 1.3004682718554335, + "learning_rate": 7.97659428493314e-05, + "loss": 0.9332, + "step": 1445 + }, + { + "epoch": 0.6554850407978241, + "grad_norm": 1.1222152997595582, + "learning_rate": 7.976457343917623e-05, + "loss": 0.9408, + "step": 1446 + }, + { + "epoch": 0.655938349954669, + "grad_norm": 1.010423149381966, + "learning_rate": 7.976320004648218e-05, + "loss": 0.9328, + "step": 1447 + }, + { + "epoch": 0.6563916591115141, + "grad_norm": 1.0442575856032936, + "learning_rate": 7.976182267138678e-05, + "loss": 0.9139, + "step": 1448 + }, + { + "epoch": 0.656844968268359, + "grad_norm": 1.5087229332014087, + "learning_rate": 7.976044131402799e-05, + "loss": 0.923, + "step": 1449 + }, + { + "epoch": 0.657298277425204, + "grad_norm": 1.0492995955091606, + "learning_rate": 7.975905597454415e-05, + "loss": 0.9302, + "step": 1450 + }, + { + "epoch": 0.6577515865820489, + "grad_norm": 1.6268835563491717, + "learning_rate": 7.975766665307399e-05, + "loss": 0.9321, + "step": 1451 + }, + { + "epoch": 0.658204895738894, + "grad_norm": 1.434609139268748, + "learning_rate": 7.975627334975669e-05, + "loss": 0.9411, + "step": 1452 + }, + { + "epoch": 0.6586582048957389, + "grad_norm": 1.152359541686999, + "learning_rate": 7.975487606473175e-05, + "loss": 0.9208, + "step": 1453 + }, + { + "epoch": 0.6591115140525838, + "grad_norm": 1.275071016605143, + "learning_rate": 7.975347479813915e-05, + "loss": 0.9454, + "step": 1454 + }, + { + "epoch": 0.6595648232094289, + "grad_norm": 1.3747622696785118, + "learning_rate": 7.97520695501192e-05, + "loss": 0.9252, + "step": 1455 + }, + { + "epoch": 0.6600181323662738, + "grad_norm": 1.3638940251498712, + "learning_rate": 7.975066032081266e-05, + "loss": 0.9337, + "step": 1456 + }, + { + "epoch": 0.6604714415231188, + "grad_norm": 1.1981565751091483, + "learning_rate": 7.974924711036068e-05, + "loss": 0.9386, + "step": 1457 + }, + { + "epoch": 0.6609247506799637, + "grad_norm": 1.3092383105588519, + "learning_rate": 7.974782991890477e-05, + "loss": 0.9412, + "step": 1458 + }, + { + "epoch": 0.6613780598368088, + "grad_norm": 1.2775713219438978, + "learning_rate": 7.974640874658688e-05, + "loss": 0.9603, + "step": 1459 + }, + { + "epoch": 0.6618313689936537, + "grad_norm": 1.1866648580842087, + "learning_rate": 7.974498359354933e-05, + "loss": 0.9298, + "step": 1460 + }, + { + "epoch": 0.6622846781504986, + "grad_norm": 1.4329945995979836, + "learning_rate": 7.974355445993487e-05, + "loss": 0.9003, + "step": 1461 + }, + { + "epoch": 0.6627379873073436, + "grad_norm": 1.1897078145130777, + "learning_rate": 7.974212134588664e-05, + "loss": 0.9311, + "step": 1462 + }, + { + "epoch": 0.6631912964641886, + "grad_norm": 1.382182919223205, + "learning_rate": 7.974068425154814e-05, + "loss": 0.9431, + "step": 1463 + }, + { + "epoch": 0.6636446056210336, + "grad_norm": 1.1836673731075391, + "learning_rate": 7.973924317706333e-05, + "loss": 0.9184, + "step": 1464 + }, + { + "epoch": 0.6640979147778785, + "grad_norm": 1.403900630926551, + "learning_rate": 7.97377981225765e-05, + "loss": 0.9106, + "step": 1465 + }, + { + "epoch": 0.6645512239347234, + "grad_norm": 1.104040275923374, + "learning_rate": 7.973634908823243e-05, + "loss": 0.919, + "step": 1466 + }, + { + "epoch": 0.6650045330915685, + "grad_norm": 1.4111445792483353, + "learning_rate": 7.97348960741762e-05, + "loss": 0.9253, + "step": 1467 + }, + { + "epoch": 0.6654578422484134, + "grad_norm": 1.295177332094139, + "learning_rate": 7.973343908055336e-05, + "loss": 0.93, + "step": 1468 + }, + { + "epoch": 0.6659111514052584, + "grad_norm": 1.1154787319680681, + "learning_rate": 7.973197810750981e-05, + "loss": 0.907, + "step": 1469 + }, + { + "epoch": 0.6663644605621033, + "grad_norm": 1.405607701681695, + "learning_rate": 7.97305131551919e-05, + "loss": 0.9121, + "step": 1470 + }, + { + "epoch": 0.6668177697189483, + "grad_norm": 1.1353180137855903, + "learning_rate": 7.972904422374632e-05, + "loss": 0.924, + "step": 1471 + }, + { + "epoch": 0.6672710788757933, + "grad_norm": 1.3389026335076168, + "learning_rate": 7.97275713133202e-05, + "loss": 0.9131, + "step": 1472 + }, + { + "epoch": 0.6677243880326382, + "grad_norm": 1.373350731878485, + "learning_rate": 7.972609442406103e-05, + "loss": 0.9268, + "step": 1473 + }, + { + "epoch": 0.6681776971894833, + "grad_norm": 1.324180490903275, + "learning_rate": 7.972461355611679e-05, + "loss": 0.9102, + "step": 1474 + }, + { + "epoch": 0.6686310063463282, + "grad_norm": 0.9100181369526462, + "learning_rate": 7.972312870963575e-05, + "loss": 0.9222, + "step": 1475 + }, + { + "epoch": 0.6690843155031732, + "grad_norm": 1.1398206666275492, + "learning_rate": 7.97216398847666e-05, + "loss": 0.9205, + "step": 1476 + }, + { + "epoch": 0.6695376246600181, + "grad_norm": 1.6321757800330525, + "learning_rate": 7.97201470816585e-05, + "loss": 0.9219, + "step": 1477 + }, + { + "epoch": 0.6699909338168631, + "grad_norm": 1.0691110799792862, + "learning_rate": 7.971865030046091e-05, + "loss": 0.9153, + "step": 1478 + }, + { + "epoch": 0.6704442429737081, + "grad_norm": 1.4063599964918188, + "learning_rate": 7.971714954132379e-05, + "loss": 0.9475, + "step": 1479 + }, + { + "epoch": 0.670897552130553, + "grad_norm": 0.9548924711984405, + "learning_rate": 7.97156448043974e-05, + "loss": 0.8958, + "step": 1480 + }, + { + "epoch": 0.671350861287398, + "grad_norm": 1.4095897790969576, + "learning_rate": 7.971413608983247e-05, + "loss": 0.9282, + "step": 1481 + }, + { + "epoch": 0.671804170444243, + "grad_norm": 1.300863169704553, + "learning_rate": 7.971262339778008e-05, + "loss": 0.9176, + "step": 1482 + }, + { + "epoch": 0.672257479601088, + "grad_norm": 1.4993402999099295, + "learning_rate": 7.971110672839175e-05, + "loss": 0.9185, + "step": 1483 + }, + { + "epoch": 0.6727107887579329, + "grad_norm": 0.8135270923635025, + "learning_rate": 7.970958608181937e-05, + "loss": 0.9189, + "step": 1484 + }, + { + "epoch": 0.6731640979147778, + "grad_norm": 0.8555422604833746, + "learning_rate": 7.970806145821524e-05, + "loss": 0.9237, + "step": 1485 + }, + { + "epoch": 0.6736174070716229, + "grad_norm": 1.1357472013320014, + "learning_rate": 7.970653285773207e-05, + "loss": 0.947, + "step": 1486 + }, + { + "epoch": 0.6740707162284678, + "grad_norm": 1.6291106894648493, + "learning_rate": 7.970500028052292e-05, + "loss": 0.9343, + "step": 1487 + }, + { + "epoch": 0.6745240253853128, + "grad_norm": 0.9800402802610361, + "learning_rate": 7.970346372674131e-05, + "loss": 0.9177, + "step": 1488 + }, + { + "epoch": 0.6749773345421578, + "grad_norm": 1.6021452061858645, + "learning_rate": 7.970192319654112e-05, + "loss": 0.9235, + "step": 1489 + }, + { + "epoch": 0.6754306436990027, + "grad_norm": 0.8963986217221879, + "learning_rate": 7.970037869007664e-05, + "loss": 0.9227, + "step": 1490 + }, + { + "epoch": 0.6758839528558477, + "grad_norm": 1.568003514975853, + "learning_rate": 7.969883020750255e-05, + "loss": 0.9117, + "step": 1491 + }, + { + "epoch": 0.6763372620126926, + "grad_norm": 0.7963495238434515, + "learning_rate": 7.969727774897396e-05, + "loss": 0.9333, + "step": 1492 + }, + { + "epoch": 0.6767905711695377, + "grad_norm": 1.436774335982093, + "learning_rate": 7.96957213146463e-05, + "loss": 0.9245, + "step": 1493 + }, + { + "epoch": 0.6772438803263826, + "grad_norm": 1.2075870329133558, + "learning_rate": 7.969416090467553e-05, + "loss": 0.9282, + "step": 1494 + }, + { + "epoch": 0.6776971894832275, + "grad_norm": 1.152055897003922, + "learning_rate": 7.969259651921786e-05, + "loss": 0.9189, + "step": 1495 + }, + { + "epoch": 0.6781504986400725, + "grad_norm": 1.4679513233877814, + "learning_rate": 7.969102815843001e-05, + "loss": 0.9397, + "step": 1496 + }, + { + "epoch": 0.6786038077969175, + "grad_norm": 1.3133252538797986, + "learning_rate": 7.968945582246903e-05, + "loss": 0.916, + "step": 1497 + }, + { + "epoch": 0.6790571169537625, + "grad_norm": 1.4591851431123457, + "learning_rate": 7.96878795114924e-05, + "loss": 0.9164, + "step": 1498 + }, + { + "epoch": 0.6795104261106074, + "grad_norm": 0.9340497400217468, + "learning_rate": 7.9686299225658e-05, + "loss": 0.9361, + "step": 1499 + }, + { + "epoch": 0.6799637352674524, + "grad_norm": 1.301829269838523, + "learning_rate": 7.96847149651241e-05, + "loss": 0.9194, + "step": 1500 + }, + { + "epoch": 0.6804170444242974, + "grad_norm": 1.0748653925931202, + "learning_rate": 7.968312673004936e-05, + "loss": 0.9414, + "step": 1501 + }, + { + "epoch": 0.6808703535811423, + "grad_norm": 1.2738493254433245, + "learning_rate": 7.968153452059285e-05, + "loss": 0.9141, + "step": 1502 + }, + { + "epoch": 0.6813236627379873, + "grad_norm": 1.0661727000630012, + "learning_rate": 7.967993833691405e-05, + "loss": 0.9135, + "step": 1503 + }, + { + "epoch": 0.6817769718948323, + "grad_norm": 1.5119080456386675, + "learning_rate": 7.96783381791728e-05, + "loss": 0.9457, + "step": 1504 + }, + { + "epoch": 0.6822302810516773, + "grad_norm": 1.279320549430296, + "learning_rate": 7.967673404752937e-05, + "loss": 0.9168, + "step": 1505 + }, + { + "epoch": 0.6826835902085222, + "grad_norm": 1.0515337888292569, + "learning_rate": 7.967512594214441e-05, + "loss": 0.9215, + "step": 1506 + }, + { + "epoch": 0.6831368993653671, + "grad_norm": 1.6463945267980995, + "learning_rate": 7.967351386317899e-05, + "loss": 0.9142, + "step": 1507 + }, + { + "epoch": 0.6835902085222122, + "grad_norm": 1.1761736050717322, + "learning_rate": 7.967189781079456e-05, + "loss": 0.9301, + "step": 1508 + }, + { + "epoch": 0.6840435176790571, + "grad_norm": 1.1027548558801048, + "learning_rate": 7.967027778515297e-05, + "loss": 0.9368, + "step": 1509 + }, + { + "epoch": 0.6844968268359021, + "grad_norm": 1.255542151966555, + "learning_rate": 7.966865378641647e-05, + "loss": 0.918, + "step": 1510 + }, + { + "epoch": 0.684950135992747, + "grad_norm": 0.8458071359348894, + "learning_rate": 7.966702581474771e-05, + "loss": 0.9086, + "step": 1511 + }, + { + "epoch": 0.6854034451495921, + "grad_norm": 1.3613545985066633, + "learning_rate": 7.966539387030973e-05, + "loss": 0.9254, + "step": 1512 + }, + { + "epoch": 0.685856754306437, + "grad_norm": 1.450391164240738, + "learning_rate": 7.966375795326599e-05, + "loss": 0.9178, + "step": 1513 + }, + { + "epoch": 0.6863100634632819, + "grad_norm": 1.3428017085178099, + "learning_rate": 7.966211806378031e-05, + "loss": 0.9326, + "step": 1514 + }, + { + "epoch": 0.6867633726201269, + "grad_norm": 1.1933103355423667, + "learning_rate": 7.966047420201695e-05, + "loss": 0.9358, + "step": 1515 + }, + { + "epoch": 0.6872166817769719, + "grad_norm": 1.1402047730851161, + "learning_rate": 7.965882636814053e-05, + "loss": 0.9142, + "step": 1516 + }, + { + "epoch": 0.6876699909338169, + "grad_norm": 1.0385513011125143, + "learning_rate": 7.96571745623161e-05, + "loss": 0.9285, + "step": 1517 + }, + { + "epoch": 0.6881233000906618, + "grad_norm": 1.2881533460323966, + "learning_rate": 7.965551878470909e-05, + "loss": 0.9345, + "step": 1518 + }, + { + "epoch": 0.6885766092475069, + "grad_norm": 1.2111539197118497, + "learning_rate": 7.965385903548531e-05, + "loss": 0.9129, + "step": 1519 + }, + { + "epoch": 0.6890299184043518, + "grad_norm": 1.4794832941761022, + "learning_rate": 7.965219531481103e-05, + "loss": 0.9101, + "step": 1520 + }, + { + "epoch": 0.6894832275611967, + "grad_norm": 1.1762039870526118, + "learning_rate": 7.965052762285285e-05, + "loss": 0.9268, + "step": 1521 + }, + { + "epoch": 0.6899365367180417, + "grad_norm": 1.3548672335838676, + "learning_rate": 7.964885595977778e-05, + "loss": 0.9315, + "step": 1522 + }, + { + "epoch": 0.6903898458748867, + "grad_norm": 1.695712155641778, + "learning_rate": 7.964718032575328e-05, + "loss": 0.9335, + "step": 1523 + }, + { + "epoch": 0.6908431550317317, + "grad_norm": 0.6282614902121157, + "learning_rate": 7.964550072094714e-05, + "loss": 0.9274, + "step": 1524 + }, + { + "epoch": 0.6912964641885766, + "grad_norm": 1.6417997166383307, + "learning_rate": 7.964381714552759e-05, + "loss": 0.9339, + "step": 1525 + }, + { + "epoch": 0.6917497733454215, + "grad_norm": 1.5480116528684826, + "learning_rate": 7.964212959966324e-05, + "loss": 0.9563, + "step": 1526 + }, + { + "epoch": 0.6922030825022666, + "grad_norm": 0.8009024921884106, + "learning_rate": 7.96404380835231e-05, + "loss": 0.9197, + "step": 1527 + }, + { + "epoch": 0.6926563916591115, + "grad_norm": 1.4332514144118524, + "learning_rate": 7.96387425972766e-05, + "loss": 0.9177, + "step": 1528 + }, + { + "epoch": 0.6931097008159565, + "grad_norm": 1.1195741622053257, + "learning_rate": 7.963704314109352e-05, + "loss": 0.897, + "step": 1529 + }, + { + "epoch": 0.6935630099728014, + "grad_norm": 1.1922109245131962, + "learning_rate": 7.963533971514407e-05, + "loss": 0.9292, + "step": 1530 + }, + { + "epoch": 0.6940163191296465, + "grad_norm": 1.3501485944758176, + "learning_rate": 7.963363231959888e-05, + "loss": 0.9163, + "step": 1531 + }, + { + "epoch": 0.6944696282864914, + "grad_norm": 1.0904475231874073, + "learning_rate": 7.963192095462893e-05, + "loss": 0.9266, + "step": 1532 + }, + { + "epoch": 0.6949229374433363, + "grad_norm": 1.275256355821671, + "learning_rate": 7.963020562040561e-05, + "loss": 0.9246, + "step": 1533 + }, + { + "epoch": 0.6953762466001813, + "grad_norm": 1.0247707258329817, + "learning_rate": 7.962848631710073e-05, + "loss": 0.9209, + "step": 1534 + }, + { + "epoch": 0.6958295557570263, + "grad_norm": 1.4914533477544665, + "learning_rate": 7.962676304488649e-05, + "loss": 0.9341, + "step": 1535 + }, + { + "epoch": 0.6962828649138713, + "grad_norm": 1.388342477921883, + "learning_rate": 7.962503580393546e-05, + "loss": 0.9221, + "step": 1536 + }, + { + "epoch": 0.6967361740707162, + "grad_norm": 1.4582813777650665, + "learning_rate": 7.962330459442063e-05, + "loss": 0.9259, + "step": 1537 + }, + { + "epoch": 0.6971894832275612, + "grad_norm": 0.9661722660132974, + "learning_rate": 7.962156941651539e-05, + "loss": 0.9089, + "step": 1538 + }, + { + "epoch": 0.6976427923844062, + "grad_norm": 1.7048772714952134, + "learning_rate": 7.961983027039356e-05, + "loss": 0.9047, + "step": 1539 + }, + { + "epoch": 0.6980961015412511, + "grad_norm": 0.8187451298745775, + "learning_rate": 7.961808715622926e-05, + "loss": 0.8927, + "step": 1540 + }, + { + "epoch": 0.6985494106980961, + "grad_norm": 1.5864101154372818, + "learning_rate": 7.96163400741971e-05, + "loss": 0.9125, + "step": 1541 + }, + { + "epoch": 0.6990027198549411, + "grad_norm": 0.9956114535329907, + "learning_rate": 7.961458902447206e-05, + "loss": 0.9349, + "step": 1542 + }, + { + "epoch": 0.699456029011786, + "grad_norm": 1.6800659628699306, + "learning_rate": 7.96128340072295e-05, + "loss": 0.9255, + "step": 1543 + }, + { + "epoch": 0.699909338168631, + "grad_norm": 1.1004440480411313, + "learning_rate": 7.96110750226452e-05, + "loss": 0.9305, + "step": 1544 + }, + { + "epoch": 0.7003626473254759, + "grad_norm": 2.0529920561143915, + "learning_rate": 7.96093120708953e-05, + "loss": 0.9132, + "step": 1545 + }, + { + "epoch": 0.700815956482321, + "grad_norm": 1.7491498743857656, + "learning_rate": 7.960754515215641e-05, + "loss": 0.9264, + "step": 1546 + }, + { + "epoch": 0.7012692656391659, + "grad_norm": 1.2663240740834958, + "learning_rate": 7.960577426660547e-05, + "loss": 0.9227, + "step": 1547 + }, + { + "epoch": 0.7017225747960109, + "grad_norm": 1.6982063174876354, + "learning_rate": 7.960399941441982e-05, + "loss": 0.934, + "step": 1548 + }, + { + "epoch": 0.7021758839528558, + "grad_norm": 1.0108445558017256, + "learning_rate": 7.960222059577724e-05, + "loss": 0.9416, + "step": 1549 + }, + { + "epoch": 0.7026291931097008, + "grad_norm": 1.7882289612233115, + "learning_rate": 7.960043781085588e-05, + "loss": 0.9331, + "step": 1550 + }, + { + "epoch": 0.7030825022665458, + "grad_norm": 1.4991008584259256, + "learning_rate": 7.959865105983429e-05, + "loss": 0.9269, + "step": 1551 + }, + { + "epoch": 0.7035358114233907, + "grad_norm": 1.470941404043774, + "learning_rate": 7.959686034289143e-05, + "loss": 0.919, + "step": 1552 + }, + { + "epoch": 0.7039891205802358, + "grad_norm": 1.5493392200395835, + "learning_rate": 7.959506566020661e-05, + "loss": 0.9078, + "step": 1553 + }, + { + "epoch": 0.7044424297370807, + "grad_norm": 1.207564299086569, + "learning_rate": 7.959326701195962e-05, + "loss": 0.9274, + "step": 1554 + }, + { + "epoch": 0.7048957388939256, + "grad_norm": 1.7186556059131886, + "learning_rate": 7.959146439833056e-05, + "loss": 0.9393, + "step": 1555 + }, + { + "epoch": 0.7053490480507706, + "grad_norm": 1.1000951526346143, + "learning_rate": 7.95896578195e-05, + "loss": 0.9312, + "step": 1556 + }, + { + "epoch": 0.7058023572076156, + "grad_norm": 1.413641672638118, + "learning_rate": 7.958784727564885e-05, + "loss": 0.9439, + "step": 1557 + }, + { + "epoch": 0.7062556663644606, + "grad_norm": 1.262466218915977, + "learning_rate": 7.958603276695844e-05, + "loss": 0.9177, + "step": 1558 + }, + { + "epoch": 0.7067089755213055, + "grad_norm": 1.0056998750155368, + "learning_rate": 7.958421429361051e-05, + "loss": 0.9314, + "step": 1559 + }, + { + "epoch": 0.7071622846781505, + "grad_norm": 1.7201027897721712, + "learning_rate": 7.958239185578718e-05, + "loss": 0.9297, + "step": 1560 + }, + { + "epoch": 0.7076155938349955, + "grad_norm": 1.3087421388934883, + "learning_rate": 7.9580565453671e-05, + "loss": 0.9221, + "step": 1561 + }, + { + "epoch": 0.7080689029918404, + "grad_norm": 1.8017926198478544, + "learning_rate": 7.957873508744483e-05, + "loss": 0.9215, + "step": 1562 + }, + { + "epoch": 0.7085222121486854, + "grad_norm": 1.3442120007254794, + "learning_rate": 7.957690075729204e-05, + "loss": 0.9065, + "step": 1563 + }, + { + "epoch": 0.7089755213055303, + "grad_norm": 1.69517653924518, + "learning_rate": 7.957506246339631e-05, + "loss": 0.9141, + "step": 1564 + }, + { + "epoch": 0.7094288304623754, + "grad_norm": 1.3037448072054871, + "learning_rate": 7.957322020594178e-05, + "loss": 0.9034, + "step": 1565 + }, + { + "epoch": 0.7098821396192203, + "grad_norm": 1.8197518205065386, + "learning_rate": 7.957137398511294e-05, + "loss": 0.9293, + "step": 1566 + }, + { + "epoch": 0.7103354487760652, + "grad_norm": 1.545137758794244, + "learning_rate": 7.95695238010947e-05, + "loss": 0.923, + "step": 1567 + }, + { + "epoch": 0.7107887579329103, + "grad_norm": 1.4728903059560132, + "learning_rate": 7.956766965407235e-05, + "loss": 0.9318, + "step": 1568 + }, + { + "epoch": 0.7112420670897552, + "grad_norm": 1.337121231153321, + "learning_rate": 7.956581154423161e-05, + "loss": 0.9267, + "step": 1569 + }, + { + "epoch": 0.7116953762466002, + "grad_norm": 1.4306505531205653, + "learning_rate": 7.956394947175855e-05, + "loss": 0.9284, + "step": 1570 + }, + { + "epoch": 0.7121486854034451, + "grad_norm": 1.1227059238548984, + "learning_rate": 7.956208343683968e-05, + "loss": 0.9136, + "step": 1571 + }, + { + "epoch": 0.7126019945602902, + "grad_norm": 1.5626711476184012, + "learning_rate": 7.956021343966188e-05, + "loss": 0.9297, + "step": 1572 + }, + { + "epoch": 0.7130553037171351, + "grad_norm": 1.2199248538016538, + "learning_rate": 7.955833948041245e-05, + "loss": 0.934, + "step": 1573 + }, + { + "epoch": 0.71350861287398, + "grad_norm": 1.5746775926831904, + "learning_rate": 7.955646155927903e-05, + "loss": 0.9317, + "step": 1574 + }, + { + "epoch": 0.713961922030825, + "grad_norm": 1.2832423458639428, + "learning_rate": 7.955457967644976e-05, + "loss": 0.9289, + "step": 1575 + }, + { + "epoch": 0.71441523118767, + "grad_norm": 1.3186729377921877, + "learning_rate": 7.955269383211309e-05, + "loss": 0.9246, + "step": 1576 + }, + { + "epoch": 0.714868540344515, + "grad_norm": 1.3468244005861185, + "learning_rate": 7.955080402645786e-05, + "loss": 0.9274, + "step": 1577 + }, + { + "epoch": 0.7153218495013599, + "grad_norm": 1.4321268774150047, + "learning_rate": 7.954891025967339e-05, + "loss": 0.9124, + "step": 1578 + }, + { + "epoch": 0.7157751586582048, + "grad_norm": 1.1385986106825017, + "learning_rate": 7.954701253194932e-05, + "loss": 0.913, + "step": 1579 + }, + { + "epoch": 0.7162284678150499, + "grad_norm": 1.1239357549832045, + "learning_rate": 7.954511084347573e-05, + "loss": 0.9348, + "step": 1580 + }, + { + "epoch": 0.7166817769718948, + "grad_norm": 1.634241588716829, + "learning_rate": 7.954320519444306e-05, + "loss": 0.9393, + "step": 1581 + }, + { + "epoch": 0.7171350861287398, + "grad_norm": 1.286046306669166, + "learning_rate": 7.954129558504218e-05, + "loss": 0.9161, + "step": 1582 + }, + { + "epoch": 0.7175883952855848, + "grad_norm": 1.1258300707260802, + "learning_rate": 7.953938201546433e-05, + "loss": 0.9362, + "step": 1583 + }, + { + "epoch": 0.7180417044424298, + "grad_norm": 0.7990104918191255, + "learning_rate": 7.953746448590115e-05, + "loss": 0.9166, + "step": 1584 + }, + { + "epoch": 0.7184950135992747, + "grad_norm": 1.2081486370465535, + "learning_rate": 7.953554299654473e-05, + "loss": 0.9137, + "step": 1585 + }, + { + "epoch": 0.7189483227561196, + "grad_norm": 1.4046554229003951, + "learning_rate": 7.953361754758747e-05, + "loss": 0.9525, + "step": 1586 + }, + { + "epoch": 0.7194016319129647, + "grad_norm": 1.340047119426043, + "learning_rate": 7.953168813922225e-05, + "loss": 0.9576, + "step": 1587 + }, + { + "epoch": 0.7198549410698096, + "grad_norm": 1.0007700379652889, + "learning_rate": 7.952975477164226e-05, + "loss": 0.9115, + "step": 1588 + }, + { + "epoch": 0.7203082502266546, + "grad_norm": 1.8016114548086035, + "learning_rate": 7.952781744504115e-05, + "loss": 0.9078, + "step": 1589 + }, + { + "epoch": 0.7207615593834995, + "grad_norm": 1.1137649424925504, + "learning_rate": 7.952587615961297e-05, + "loss": 0.9373, + "step": 1590 + }, + { + "epoch": 0.7212148685403446, + "grad_norm": 1.2202961906059926, + "learning_rate": 7.952393091555211e-05, + "loss": 0.936, + "step": 1591 + }, + { + "epoch": 0.7216681776971895, + "grad_norm": 1.5876694060576904, + "learning_rate": 7.952198171305342e-05, + "loss": 0.9109, + "step": 1592 + }, + { + "epoch": 0.7221214868540344, + "grad_norm": 0.911805360479133, + "learning_rate": 7.952002855231211e-05, + "loss": 0.9208, + "step": 1593 + }, + { + "epoch": 0.7225747960108794, + "grad_norm": 1.8170330280243425, + "learning_rate": 7.951807143352379e-05, + "loss": 0.9378, + "step": 1594 + }, + { + "epoch": 0.7230281051677244, + "grad_norm": 1.066533853604382, + "learning_rate": 7.951611035688447e-05, + "loss": 0.932, + "step": 1595 + }, + { + "epoch": 0.7234814143245694, + "grad_norm": 1.9402484332759569, + "learning_rate": 7.951414532259056e-05, + "loss": 0.948, + "step": 1596 + }, + { + "epoch": 0.7239347234814143, + "grad_norm": 1.5190925698007571, + "learning_rate": 7.951217633083888e-05, + "loss": 0.9439, + "step": 1597 + }, + { + "epoch": 0.7243880326382592, + "grad_norm": 1.3978651455550268, + "learning_rate": 7.951020338182661e-05, + "loss": 0.9344, + "step": 1598 + }, + { + "epoch": 0.7248413417951043, + "grad_norm": 1.2580379239658595, + "learning_rate": 7.950822647575136e-05, + "loss": 0.9317, + "step": 1599 + }, + { + "epoch": 0.7252946509519492, + "grad_norm": 1.1958836735490492, + "learning_rate": 7.95062456128111e-05, + "loss": 0.9301, + "step": 1600 + }, + { + "epoch": 0.7257479601087942, + "grad_norm": 1.3390547132824158, + "learning_rate": 7.950426079320426e-05, + "loss": 0.9282, + "step": 1601 + }, + { + "epoch": 0.7262012692656392, + "grad_norm": 1.0677794464942607, + "learning_rate": 7.950227201712958e-05, + "loss": 0.9226, + "step": 1602 + }, + { + "epoch": 0.7266545784224842, + "grad_norm": 1.650453563800068, + "learning_rate": 7.950027928478626e-05, + "loss": 0.9291, + "step": 1603 + }, + { + "epoch": 0.7271078875793291, + "grad_norm": 1.1822205603930291, + "learning_rate": 7.949828259637389e-05, + "loss": 0.9191, + "step": 1604 + }, + { + "epoch": 0.727561196736174, + "grad_norm": 1.2227479869034787, + "learning_rate": 7.949628195209242e-05, + "loss": 0.9106, + "step": 1605 + }, + { + "epoch": 0.7280145058930191, + "grad_norm": 1.1223480536684485, + "learning_rate": 7.949427735214226e-05, + "loss": 0.9361, + "step": 1606 + }, + { + "epoch": 0.728467815049864, + "grad_norm": 1.5493655098866297, + "learning_rate": 7.949226879672413e-05, + "loss": 0.9119, + "step": 1607 + }, + { + "epoch": 0.728921124206709, + "grad_norm": 1.0234500386221204, + "learning_rate": 7.949025628603922e-05, + "loss": 0.9327, + "step": 1608 + }, + { + "epoch": 0.7293744333635539, + "grad_norm": 1.492547706848186, + "learning_rate": 7.94882398202891e-05, + "loss": 0.926, + "step": 1609 + }, + { + "epoch": 0.729827742520399, + "grad_norm": 1.2332803017498088, + "learning_rate": 7.948621939967569e-05, + "loss": 0.9105, + "step": 1610 + }, + { + "epoch": 0.7302810516772439, + "grad_norm": 1.4086917332508235, + "learning_rate": 7.948419502440136e-05, + "loss": 0.9264, + "step": 1611 + }, + { + "epoch": 0.7307343608340888, + "grad_norm": 1.1172030351308644, + "learning_rate": 7.948216669466886e-05, + "loss": 0.9089, + "step": 1612 + }, + { + "epoch": 0.7311876699909338, + "grad_norm": 1.3503135750035187, + "learning_rate": 7.948013441068133e-05, + "loss": 0.923, + "step": 1613 + }, + { + "epoch": 0.7316409791477788, + "grad_norm": 1.2259622128138963, + "learning_rate": 7.947809817264231e-05, + "loss": 0.9231, + "step": 1614 + }, + { + "epoch": 0.7320942883046238, + "grad_norm": 1.4232173729274333, + "learning_rate": 7.947605798075573e-05, + "loss": 0.9193, + "step": 1615 + }, + { + "epoch": 0.7325475974614687, + "grad_norm": 1.6651003074157964, + "learning_rate": 7.947401383522593e-05, + "loss": 0.9239, + "step": 1616 + }, + { + "epoch": 0.7330009066183137, + "grad_norm": 0.9547568321963682, + "learning_rate": 7.947196573625763e-05, + "loss": 0.9055, + "step": 1617 + }, + { + "epoch": 0.7334542157751587, + "grad_norm": 1.1535682545394768, + "learning_rate": 7.946991368405595e-05, + "loss": 0.8959, + "step": 1618 + }, + { + "epoch": 0.7339075249320036, + "grad_norm": 1.1977697753150058, + "learning_rate": 7.946785767882643e-05, + "loss": 0.9199, + "step": 1619 + }, + { + "epoch": 0.7343608340888486, + "grad_norm": 1.622562549131728, + "learning_rate": 7.946579772077496e-05, + "loss": 0.9161, + "step": 1620 + }, + { + "epoch": 0.7348141432456936, + "grad_norm": 1.619497455820455, + "learning_rate": 7.946373381010786e-05, + "loss": 0.9382, + "step": 1621 + }, + { + "epoch": 0.7352674524025385, + "grad_norm": 0.824707615481893, + "learning_rate": 7.946166594703183e-05, + "loss": 0.9083, + "step": 1622 + }, + { + "epoch": 0.7357207615593835, + "grad_norm": 2.6767244545028452, + "learning_rate": 7.9459594131754e-05, + "loss": 0.9513, + "step": 1623 + }, + { + "epoch": 0.7361740707162284, + "grad_norm": 1.5565079779736144, + "learning_rate": 7.945751836448183e-05, + "loss": 0.9203, + "step": 1624 + }, + { + "epoch": 0.7366273798730735, + "grad_norm": 2.5905223467695744, + "learning_rate": 7.945543864542324e-05, + "loss": 0.906, + "step": 1625 + }, + { + "epoch": 0.7370806890299184, + "grad_norm": 1.5067643477298034, + "learning_rate": 7.94533549747865e-05, + "loss": 0.9336, + "step": 1626 + }, + { + "epoch": 0.7375339981867634, + "grad_norm": 3.0763962990680787, + "learning_rate": 7.945126735278032e-05, + "loss": 0.9254, + "step": 1627 + }, + { + "epoch": 0.7379873073436083, + "grad_norm": 2.2353150935551858, + "learning_rate": 7.944917577961377e-05, + "loss": 0.9255, + "step": 1628 + }, + { + "epoch": 0.7384406165004533, + "grad_norm": 2.9059403792442136, + "learning_rate": 7.944708025549633e-05, + "loss": 0.9362, + "step": 1629 + }, + { + "epoch": 0.7388939256572983, + "grad_norm": 2.7283581662552407, + "learning_rate": 7.944498078063785e-05, + "loss": 0.9188, + "step": 1630 + }, + { + "epoch": 0.7393472348141432, + "grad_norm": 1.996717425245677, + "learning_rate": 7.944287735524864e-05, + "loss": 0.8976, + "step": 1631 + }, + { + "epoch": 0.7398005439709883, + "grad_norm": 2.0292707392657685, + "learning_rate": 7.944076997953933e-05, + "loss": 0.9152, + "step": 1632 + }, + { + "epoch": 0.7402538531278332, + "grad_norm": 1.9673653874135555, + "learning_rate": 7.943865865372101e-05, + "loss": 0.9179, + "step": 1633 + }, + { + "epoch": 0.7407071622846781, + "grad_norm": 1.5470466704608743, + "learning_rate": 7.94365433780051e-05, + "loss": 0.9275, + "step": 1634 + }, + { + "epoch": 0.7411604714415231, + "grad_norm": 2.563345794340878, + "learning_rate": 7.943442415260347e-05, + "loss": 0.9445, + "step": 1635 + }, + { + "epoch": 0.7416137805983681, + "grad_norm": 1.9136668260515428, + "learning_rate": 7.943230097772837e-05, + "loss": 0.8974, + "step": 1636 + }, + { + "epoch": 0.7420670897552131, + "grad_norm": 2.5924886938470455, + "learning_rate": 7.943017385359243e-05, + "loss": 0.9108, + "step": 1637 + }, + { + "epoch": 0.742520398912058, + "grad_norm": 2.4536383216965634, + "learning_rate": 7.94280427804087e-05, + "loss": 0.929, + "step": 1638 + }, + { + "epoch": 0.742973708068903, + "grad_norm": 1.8114302544939989, + "learning_rate": 7.942590775839061e-05, + "loss": 0.9285, + "step": 1639 + }, + { + "epoch": 0.743427017225748, + "grad_norm": 1.6244613942470991, + "learning_rate": 7.9423768787752e-05, + "loss": 0.9032, + "step": 1640 + }, + { + "epoch": 0.7438803263825929, + "grad_norm": 2.3530250272162805, + "learning_rate": 7.942162586870706e-05, + "loss": 0.9161, + "step": 1641 + }, + { + "epoch": 0.7443336355394379, + "grad_norm": 1.8584388808978303, + "learning_rate": 7.941947900147044e-05, + "loss": 0.9272, + "step": 1642 + }, + { + "epoch": 0.7447869446962828, + "grad_norm": 2.4582610775265463, + "learning_rate": 7.941732818625716e-05, + "loss": 0.9184, + "step": 1643 + }, + { + "epoch": 0.7452402538531279, + "grad_norm": 2.2643593517328635, + "learning_rate": 7.94151734232826e-05, + "loss": 0.9353, + "step": 1644 + }, + { + "epoch": 0.7456935630099728, + "grad_norm": 1.9524050391304608, + "learning_rate": 7.941301471276258e-05, + "loss": 0.9293, + "step": 1645 + }, + { + "epoch": 0.7461468721668177, + "grad_norm": 1.6916660364723919, + "learning_rate": 7.94108520549133e-05, + "loss": 0.9129, + "step": 1646 + }, + { + "epoch": 0.7466001813236628, + "grad_norm": 2.2214102393711808, + "learning_rate": 7.940868544995138e-05, + "loss": 0.9177, + "step": 1647 + }, + { + "epoch": 0.7470534904805077, + "grad_norm": 1.7438225939604073, + "learning_rate": 7.940651489809379e-05, + "loss": 0.9209, + "step": 1648 + }, + { + "epoch": 0.7475067996373527, + "grad_norm": 2.369305136182219, + "learning_rate": 7.940434039955791e-05, + "loss": 0.9491, + "step": 1649 + }, + { + "epoch": 0.7479601087941976, + "grad_norm": 2.01098161749891, + "learning_rate": 7.940216195456153e-05, + "loss": 0.9257, + "step": 1650 + }, + { + "epoch": 0.7484134179510427, + "grad_norm": 2.0615866540966965, + "learning_rate": 7.939997956332283e-05, + "loss": 0.9127, + "step": 1651 + }, + { + "epoch": 0.7488667271078876, + "grad_norm": 1.7731931889637351, + "learning_rate": 7.939779322606038e-05, + "loss": 0.929, + "step": 1652 + }, + { + "epoch": 0.7493200362647325, + "grad_norm": 2.180067268600542, + "learning_rate": 7.939560294299316e-05, + "loss": 0.9073, + "step": 1653 + }, + { + "epoch": 0.7497733454215775, + "grad_norm": 1.7900524959049056, + "learning_rate": 7.939340871434052e-05, + "loss": 0.92, + "step": 1654 + }, + { + "epoch": 0.7502266545784225, + "grad_norm": 2.277986104443345, + "learning_rate": 7.939121054032222e-05, + "loss": 0.9207, + "step": 1655 + }, + { + "epoch": 0.7506799637352675, + "grad_norm": 1.8719486926156035, + "learning_rate": 7.938900842115842e-05, + "loss": 0.9303, + "step": 1656 + }, + { + "epoch": 0.7511332728921124, + "grad_norm": 2.1175402153031624, + "learning_rate": 7.938680235706966e-05, + "loss": 0.9241, + "step": 1657 + }, + { + "epoch": 0.7515865820489573, + "grad_norm": 1.8909093687470349, + "learning_rate": 7.93845923482769e-05, + "loss": 0.9411, + "step": 1658 + }, + { + "epoch": 0.7520398912058024, + "grad_norm": 2.0361710305834535, + "learning_rate": 7.938237839500146e-05, + "loss": 0.9206, + "step": 1659 + }, + { + "epoch": 0.7524932003626473, + "grad_norm": 1.7324229312635189, + "learning_rate": 7.938016049746508e-05, + "loss": 0.9018, + "step": 1660 + }, + { + "epoch": 0.7529465095194923, + "grad_norm": 2.1450181548836005, + "learning_rate": 7.937793865588988e-05, + "loss": 0.9261, + "step": 1661 + }, + { + "epoch": 0.7533998186763372, + "grad_norm": 1.7843965199903666, + "learning_rate": 7.93757128704984e-05, + "loss": 0.9089, + "step": 1662 + }, + { + "epoch": 0.7538531278331823, + "grad_norm": 2.299898765797171, + "learning_rate": 7.937348314151356e-05, + "loss": 0.9379, + "step": 1663 + }, + { + "epoch": 0.7543064369900272, + "grad_norm": 2.0620894526521423, + "learning_rate": 7.937124946915867e-05, + "loss": 0.9361, + "step": 1664 + }, + { + "epoch": 0.7547597461468721, + "grad_norm": 1.9048367529610168, + "learning_rate": 7.936901185365742e-05, + "loss": 0.9182, + "step": 1665 + }, + { + "epoch": 0.7552130553037172, + "grad_norm": 1.6611039239393344, + "learning_rate": 7.936677029523394e-05, + "loss": 0.9338, + "step": 1666 + }, + { + "epoch": 0.7556663644605621, + "grad_norm": 2.102048030231007, + "learning_rate": 7.936452479411272e-05, + "loss": 0.9069, + "step": 1667 + }, + { + "epoch": 0.7561196736174071, + "grad_norm": 1.7620393819835651, + "learning_rate": 7.936227535051863e-05, + "loss": 0.9212, + "step": 1668 + }, + { + "epoch": 0.756572982774252, + "grad_norm": 2.2512977771902256, + "learning_rate": 7.9360021964677e-05, + "loss": 0.9229, + "step": 1669 + }, + { + "epoch": 0.757026291931097, + "grad_norm": 2.015352319394172, + "learning_rate": 7.93577646368135e-05, + "loss": 0.9408, + "step": 1670 + }, + { + "epoch": 0.757479601087942, + "grad_norm": 1.8535425171906466, + "learning_rate": 7.935550336715418e-05, + "loss": 0.9121, + "step": 1671 + }, + { + "epoch": 0.7579329102447869, + "grad_norm": 1.5669436455360037, + "learning_rate": 7.935323815592556e-05, + "loss": 0.9205, + "step": 1672 + }, + { + "epoch": 0.7583862194016319, + "grad_norm": 2.3113617324149756, + "learning_rate": 7.935096900335446e-05, + "loss": 0.9174, + "step": 1673 + }, + { + "epoch": 0.7588395285584769, + "grad_norm": 1.9216312865218204, + "learning_rate": 7.934869590966816e-05, + "loss": 0.9202, + "step": 1674 + }, + { + "epoch": 0.7592928377153219, + "grad_norm": 1.988642637785182, + "learning_rate": 7.934641887509434e-05, + "loss": 0.916, + "step": 1675 + }, + { + "epoch": 0.7597461468721668, + "grad_norm": 1.5998802343272702, + "learning_rate": 7.934413789986104e-05, + "loss": 0.909, + "step": 1676 + }, + { + "epoch": 0.7601994560290117, + "grad_norm": 2.294474321784248, + "learning_rate": 7.934185298419667e-05, + "loss": 0.9228, + "step": 1677 + }, + { + "epoch": 0.7606527651858568, + "grad_norm": 1.9116528021499337, + "learning_rate": 7.933956412833012e-05, + "loss": 0.9034, + "step": 1678 + }, + { + "epoch": 0.7611060743427017, + "grad_norm": 1.9371458240693507, + "learning_rate": 7.93372713324906e-05, + "loss": 0.8914, + "step": 1679 + }, + { + "epoch": 0.7615593834995467, + "grad_norm": 1.670027982721587, + "learning_rate": 7.933497459690774e-05, + "loss": 0.9165, + "step": 1680 + }, + { + "epoch": 0.7620126926563917, + "grad_norm": 2.0939037970480383, + "learning_rate": 7.933267392181158e-05, + "loss": 0.9104, + "step": 1681 + }, + { + "epoch": 0.7624660018132366, + "grad_norm": 1.7057591480371763, + "learning_rate": 7.933036930743254e-05, + "loss": 0.9308, + "step": 1682 + }, + { + "epoch": 0.7629193109700816, + "grad_norm": 2.1059271346017865, + "learning_rate": 7.93280607540014e-05, + "loss": 0.9135, + "step": 1683 + }, + { + "epoch": 0.7633726201269265, + "grad_norm": 1.7272173438059368, + "learning_rate": 7.93257482617494e-05, + "loss": 0.9051, + "step": 1684 + }, + { + "epoch": 0.7638259292837716, + "grad_norm": 2.076928966559462, + "learning_rate": 7.932343183090814e-05, + "loss": 0.9116, + "step": 1685 + }, + { + "epoch": 0.7642792384406165, + "grad_norm": 1.767246601566885, + "learning_rate": 7.932111146170961e-05, + "loss": 0.9304, + "step": 1686 + }, + { + "epoch": 0.7647325475974615, + "grad_norm": 2.0773705998058785, + "learning_rate": 7.93187871543862e-05, + "loss": 0.9034, + "step": 1687 + }, + { + "epoch": 0.7651858567543064, + "grad_norm": 1.7156429733025549, + "learning_rate": 7.931645890917073e-05, + "loss": 0.8961, + "step": 1688 + }, + { + "epoch": 0.7656391659111514, + "grad_norm": 2.1095110458418267, + "learning_rate": 7.93141267262963e-05, + "loss": 0.9313, + "step": 1689 + }, + { + "epoch": 0.7660924750679964, + "grad_norm": 1.7808566243720418, + "learning_rate": 7.931179060599658e-05, + "loss": 0.9276, + "step": 1690 + }, + { + "epoch": 0.7665457842248413, + "grad_norm": 1.9722931686024765, + "learning_rate": 7.930945054850547e-05, + "loss": 0.8984, + "step": 1691 + }, + { + "epoch": 0.7669990933816863, + "grad_norm": 1.7297142886084518, + "learning_rate": 7.930710655405737e-05, + "loss": 0.9423, + "step": 1692 + }, + { + "epoch": 0.7674524025385313, + "grad_norm": 2.09795530852484, + "learning_rate": 7.930475862288703e-05, + "loss": 0.9084, + "step": 1693 + }, + { + "epoch": 0.7679057116953762, + "grad_norm": 1.8130848712015828, + "learning_rate": 7.93024067552296e-05, + "loss": 0.9174, + "step": 1694 + }, + { + "epoch": 0.7683590208522212, + "grad_norm": 1.9486354746804728, + "learning_rate": 7.930005095132063e-05, + "loss": 0.8859, + "step": 1695 + }, + { + "epoch": 0.7688123300090662, + "grad_norm": 1.6610830244142931, + "learning_rate": 7.929769121139605e-05, + "loss": 0.9079, + "step": 1696 + }, + { + "epoch": 0.7692656391659112, + "grad_norm": 2.0460180687717573, + "learning_rate": 7.929532753569218e-05, + "loss": 0.8912, + "step": 1697 + }, + { + "epoch": 0.7697189483227561, + "grad_norm": 1.79591691277189, + "learning_rate": 7.92929599244458e-05, + "loss": 0.8982, + "step": 1698 + }, + { + "epoch": 0.770172257479601, + "grad_norm": 1.8483299038463623, + "learning_rate": 7.9290588377894e-05, + "loss": 0.9154, + "step": 1699 + }, + { + "epoch": 0.7706255666364461, + "grad_norm": 1.5011304692640992, + "learning_rate": 7.928821289627429e-05, + "loss": 0.9085, + "step": 1700 + }, + { + "epoch": 0.771078875793291, + "grad_norm": 2.129936947091731, + "learning_rate": 7.92858334798246e-05, + "loss": 0.9058, + "step": 1701 + }, + { + "epoch": 0.771532184950136, + "grad_norm": 1.7852326615692842, + "learning_rate": 7.928345012878321e-05, + "loss": 0.9034, + "step": 1702 + }, + { + "epoch": 0.7719854941069809, + "grad_norm": 1.873329223337439, + "learning_rate": 7.928106284338886e-05, + "loss": 0.9131, + "step": 1703 + }, + { + "epoch": 0.772438803263826, + "grad_norm": 1.5280991339987988, + "learning_rate": 7.927867162388062e-05, + "loss": 0.9191, + "step": 1704 + }, + { + "epoch": 0.7728921124206709, + "grad_norm": 2.0331663461113405, + "learning_rate": 7.927627647049796e-05, + "loss": 0.913, + "step": 1705 + }, + { + "epoch": 0.7733454215775158, + "grad_norm": 1.679945163799018, + "learning_rate": 7.92738773834808e-05, + "loss": 0.9296, + "step": 1706 + }, + { + "epoch": 0.7737987307343608, + "grad_norm": 1.9947849618158682, + "learning_rate": 7.92714743630694e-05, + "loss": 0.9258, + "step": 1707 + }, + { + "epoch": 0.7742520398912058, + "grad_norm": 1.6150125606886787, + "learning_rate": 7.92690674095044e-05, + "loss": 0.9051, + "step": 1708 + }, + { + "epoch": 0.7747053490480508, + "grad_norm": 1.9619595748543135, + "learning_rate": 7.926665652302691e-05, + "loss": 0.9223, + "step": 1709 + }, + { + "epoch": 0.7751586582048957, + "grad_norm": 1.6067886747898399, + "learning_rate": 7.926424170387837e-05, + "loss": 0.8983, + "step": 1710 + }, + { + "epoch": 0.7756119673617408, + "grad_norm": 1.993624313425568, + "learning_rate": 7.926182295230061e-05, + "loss": 0.9188, + "step": 1711 + }, + { + "epoch": 0.7760652765185857, + "grad_norm": 1.6171927259541257, + "learning_rate": 7.925940026853591e-05, + "loss": 0.9262, + "step": 1712 + }, + { + "epoch": 0.7765185856754306, + "grad_norm": 1.9063007673850898, + "learning_rate": 7.925697365282688e-05, + "loss": 0.9072, + "step": 1713 + }, + { + "epoch": 0.7769718948322756, + "grad_norm": 1.5283610485454961, + "learning_rate": 7.925454310541657e-05, + "loss": 0.919, + "step": 1714 + }, + { + "epoch": 0.7774252039891206, + "grad_norm": 1.9007753105093976, + "learning_rate": 7.925210862654841e-05, + "loss": 0.8952, + "step": 1715 + }, + { + "epoch": 0.7778785131459656, + "grad_norm": 1.4968084693622616, + "learning_rate": 7.924967021646619e-05, + "loss": 0.935, + "step": 1716 + }, + { + "epoch": 0.7783318223028105, + "grad_norm": 1.7585259595766491, + "learning_rate": 7.924722787541416e-05, + "loss": 0.9213, + "step": 1717 + }, + { + "epoch": 0.7787851314596554, + "grad_norm": 1.4663408642318976, + "learning_rate": 7.92447816036369e-05, + "loss": 0.9071, + "step": 1718 + }, + { + "epoch": 0.7792384406165005, + "grad_norm": 1.7614208457813667, + "learning_rate": 7.924233140137944e-05, + "loss": 0.9083, + "step": 1719 + }, + { + "epoch": 0.7796917497733454, + "grad_norm": 1.3948695332698449, + "learning_rate": 7.923987726888715e-05, + "loss": 0.9333, + "step": 1720 + }, + { + "epoch": 0.7801450589301904, + "grad_norm": 1.500249770714973, + "learning_rate": 7.923741920640582e-05, + "loss": 0.9241, + "step": 1721 + }, + { + "epoch": 0.7805983680870353, + "grad_norm": 1.7991159537585926, + "learning_rate": 7.923495721418166e-05, + "loss": 0.9429, + "step": 1722 + }, + { + "epoch": 0.7810516772438804, + "grad_norm": 1.1208894200581272, + "learning_rate": 7.923249129246122e-05, + "loss": 0.9203, + "step": 1723 + }, + { + "epoch": 0.7815049864007253, + "grad_norm": 1.9391706689903134, + "learning_rate": 7.923002144149147e-05, + "loss": 0.9263, + "step": 1724 + }, + { + "epoch": 0.7819582955575702, + "grad_norm": 1.3737541288004789, + "learning_rate": 7.922754766151978e-05, + "loss": 0.9153, + "step": 1725 + }, + { + "epoch": 0.7824116047144152, + "grad_norm": 2.3499996537588013, + "learning_rate": 7.922506995279391e-05, + "loss": 0.9107, + "step": 1726 + }, + { + "epoch": 0.7828649138712602, + "grad_norm": 2.0874228541593634, + "learning_rate": 7.922258831556198e-05, + "loss": 0.9147, + "step": 1727 + }, + { + "epoch": 0.7833182230281052, + "grad_norm": 1.826134864757779, + "learning_rate": 7.922010275007257e-05, + "loss": 0.9115, + "step": 1728 + }, + { + "epoch": 0.7837715321849501, + "grad_norm": 1.4612182341913045, + "learning_rate": 7.921761325657462e-05, + "loss": 0.923, + "step": 1729 + }, + { + "epoch": 0.7842248413417952, + "grad_norm": 2.0753558715009097, + "learning_rate": 7.921511983531743e-05, + "loss": 0.9245, + "step": 1730 + }, + { + "epoch": 0.7846781504986401, + "grad_norm": 1.4895921997661783, + "learning_rate": 7.921262248655072e-05, + "loss": 0.9257, + "step": 1731 + }, + { + "epoch": 0.785131459655485, + "grad_norm": 2.0954247135900634, + "learning_rate": 7.921012121052466e-05, + "loss": 0.9167, + "step": 1732 + }, + { + "epoch": 0.78558476881233, + "grad_norm": 1.7627028888287175, + "learning_rate": 7.92076160074897e-05, + "loss": 0.933, + "step": 1733 + }, + { + "epoch": 0.786038077969175, + "grad_norm": 1.6025091007530117, + "learning_rate": 7.920510687769678e-05, + "loss": 0.9221, + "step": 1734 + }, + { + "epoch": 0.78649138712602, + "grad_norm": 1.2774296962590526, + "learning_rate": 7.920259382139717e-05, + "loss": 0.9243, + "step": 1735 + }, + { + "epoch": 0.7869446962828649, + "grad_norm": 1.759656423599581, + "learning_rate": 7.92000768388426e-05, + "loss": 0.9185, + "step": 1736 + }, + { + "epoch": 0.7873980054397098, + "grad_norm": 1.1061421552328816, + "learning_rate": 7.91975559302851e-05, + "loss": 0.9239, + "step": 1737 + }, + { + "epoch": 0.7878513145965549, + "grad_norm": 2.2393761230187113, + "learning_rate": 7.919503109597718e-05, + "loss": 0.9352, + "step": 1738 + }, + { + "epoch": 0.7883046237533998, + "grad_norm": 1.8128884521216173, + "learning_rate": 7.919250233617172e-05, + "loss": 0.9255, + "step": 1739 + }, + { + "epoch": 0.7887579329102448, + "grad_norm": 1.6001121445597892, + "learning_rate": 7.918996965112195e-05, + "loss": 0.9098, + "step": 1740 + }, + { + "epoch": 0.7892112420670897, + "grad_norm": 1.452754779080375, + "learning_rate": 7.918743304108155e-05, + "loss": 0.9143, + "step": 1741 + }, + { + "epoch": 0.7896645512239348, + "grad_norm": 1.6136122025580508, + "learning_rate": 7.918489250630456e-05, + "loss": 0.9097, + "step": 1742 + }, + { + "epoch": 0.7901178603807797, + "grad_norm": 1.3009597747442827, + "learning_rate": 7.918234804704542e-05, + "loss": 0.9206, + "step": 1743 + }, + { + "epoch": 0.7905711695376246, + "grad_norm": 1.673391552595416, + "learning_rate": 7.917979966355897e-05, + "loss": 0.9245, + "step": 1744 + }, + { + "epoch": 0.7910244786944697, + "grad_norm": 1.272014424618839, + "learning_rate": 7.917724735610045e-05, + "loss": 0.9101, + "step": 1745 + }, + { + "epoch": 0.7914777878513146, + "grad_norm": 1.7720327122892794, + "learning_rate": 7.917469112492545e-05, + "loss": 0.912, + "step": 1746 + }, + { + "epoch": 0.7919310970081596, + "grad_norm": 1.4435121693441268, + "learning_rate": 7.917213097029e-05, + "loss": 0.9248, + "step": 1747 + }, + { + "epoch": 0.7923844061650045, + "grad_norm": 1.6110529380554193, + "learning_rate": 7.916956689245052e-05, + "loss": 0.9268, + "step": 1748 + }, + { + "epoch": 0.7928377153218495, + "grad_norm": 1.3536267615481845, + "learning_rate": 7.91669988916638e-05, + "loss": 0.9207, + "step": 1749 + }, + { + "epoch": 0.7932910244786945, + "grad_norm": 1.383196858775192, + "learning_rate": 7.9164426968187e-05, + "loss": 0.8997, + "step": 1750 + }, + { + "epoch": 0.7937443336355394, + "grad_norm": 1.0971948641912048, + "learning_rate": 7.916185112227777e-05, + "loss": 0.9193, + "step": 1751 + }, + { + "epoch": 0.7941976427923844, + "grad_norm": 1.7357797790911682, + "learning_rate": 7.915927135419406e-05, + "loss": 0.9225, + "step": 1752 + }, + { + "epoch": 0.7946509519492294, + "grad_norm": 1.4461152239869444, + "learning_rate": 7.915668766419421e-05, + "loss": 0.9312, + "step": 1753 + }, + { + "epoch": 0.7951042611060744, + "grad_norm": 1.5577993233120675, + "learning_rate": 7.915410005253704e-05, + "loss": 0.9117, + "step": 1754 + }, + { + "epoch": 0.7955575702629193, + "grad_norm": 1.5606338547077045, + "learning_rate": 7.915150851948165e-05, + "loss": 0.9398, + "step": 1755 + }, + { + "epoch": 0.7960108794197642, + "grad_norm": 1.1999519249466981, + "learning_rate": 7.914891306528763e-05, + "loss": 0.9257, + "step": 1756 + }, + { + "epoch": 0.7964641885766093, + "grad_norm": 1.108417109314915, + "learning_rate": 7.914631369021491e-05, + "loss": 0.9372, + "step": 1757 + }, + { + "epoch": 0.7969174977334542, + "grad_norm": 1.227350708121383, + "learning_rate": 7.914371039452383e-05, + "loss": 0.9009, + "step": 1758 + }, + { + "epoch": 0.7973708068902992, + "grad_norm": 0.9409253492616543, + "learning_rate": 7.91411031784751e-05, + "loss": 0.9233, + "step": 1759 + }, + { + "epoch": 0.7978241160471442, + "grad_norm": 1.3499533854938484, + "learning_rate": 7.913849204232988e-05, + "loss": 0.9202, + "step": 1760 + }, + { + "epoch": 0.7982774252039891, + "grad_norm": 1.129686996540689, + "learning_rate": 7.913587698634962e-05, + "loss": 0.9275, + "step": 1761 + }, + { + "epoch": 0.7987307343608341, + "grad_norm": 1.2648613596190044, + "learning_rate": 7.913325801079628e-05, + "loss": 0.9288, + "step": 1762 + }, + { + "epoch": 0.799184043517679, + "grad_norm": 0.9820708876078165, + "learning_rate": 7.913063511593215e-05, + "loss": 0.9091, + "step": 1763 + }, + { + "epoch": 0.7996373526745241, + "grad_norm": 1.891956870126177, + "learning_rate": 7.91280083020199e-05, + "loss": 0.9193, + "step": 1764 + }, + { + "epoch": 0.800090661831369, + "grad_norm": 1.5204951242229794, + "learning_rate": 7.912537756932262e-05, + "loss": 0.9433, + "step": 1765 + }, + { + "epoch": 0.800543970988214, + "grad_norm": 1.473056311902333, + "learning_rate": 7.912274291810379e-05, + "loss": 0.9237, + "step": 1766 + }, + { + "epoch": 0.8009972801450589, + "grad_norm": 1.3649679161830286, + "learning_rate": 7.912010434862729e-05, + "loss": 0.9151, + "step": 1767 + }, + { + "epoch": 0.8014505893019039, + "grad_norm": 0.9100209246582931, + "learning_rate": 7.911746186115735e-05, + "loss": 0.9066, + "step": 1768 + }, + { + "epoch": 0.8019038984587489, + "grad_norm": 1.1790276467554137, + "learning_rate": 7.911481545595865e-05, + "loss": 0.922, + "step": 1769 + }, + { + "epoch": 0.8023572076155938, + "grad_norm": 0.7735552370711257, + "learning_rate": 7.911216513329621e-05, + "loss": 0.9249, + "step": 1770 + }, + { + "epoch": 0.8028105167724388, + "grad_norm": 0.9378161241453069, + "learning_rate": 7.91095108934355e-05, + "loss": 0.9114, + "step": 1771 + }, + { + "epoch": 0.8032638259292838, + "grad_norm": 0.840458219646737, + "learning_rate": 7.910685273664232e-05, + "loss": 0.9098, + "step": 1772 + }, + { + "epoch": 0.8037171350861287, + "grad_norm": 1.6211348697595427, + "learning_rate": 7.910419066318292e-05, + "loss": 0.9317, + "step": 1773 + }, + { + "epoch": 0.8041704442429737, + "grad_norm": 1.6020391938474858, + "learning_rate": 7.91015246733239e-05, + "loss": 0.9177, + "step": 1774 + }, + { + "epoch": 0.8046237533998187, + "grad_norm": 2.1069019058471468, + "learning_rate": 7.909885476733227e-05, + "loss": 0.9432, + "step": 1775 + }, + { + "epoch": 0.8050770625566637, + "grad_norm": 1.0992963962272102, + "learning_rate": 7.90961809454754e-05, + "loss": 0.9453, + "step": 1776 + }, + { + "epoch": 0.8055303717135086, + "grad_norm": 1.3066059253728206, + "learning_rate": 7.909350320802112e-05, + "loss": 0.9475, + "step": 1777 + }, + { + "epoch": 0.8059836808703535, + "grad_norm": 1.676447767810327, + "learning_rate": 7.909082155523761e-05, + "loss": 0.9423, + "step": 1778 + }, + { + "epoch": 0.8064369900271986, + "grad_norm": 0.9808954101544356, + "learning_rate": 7.908813598739344e-05, + "loss": 0.9176, + "step": 1779 + }, + { + "epoch": 0.8068902991840435, + "grad_norm": 1.7822462719515302, + "learning_rate": 7.908544650475756e-05, + "loss": 0.9321, + "step": 1780 + }, + { + "epoch": 0.8073436083408885, + "grad_norm": 1.0642349456647628, + "learning_rate": 7.908275310759934e-05, + "loss": 0.9363, + "step": 1781 + }, + { + "epoch": 0.8077969174977334, + "grad_norm": 1.6089464432319618, + "learning_rate": 7.908005579618855e-05, + "loss": 0.9492, + "step": 1782 + }, + { + "epoch": 0.8082502266545785, + "grad_norm": 1.3896456894488163, + "learning_rate": 7.90773545707953e-05, + "loss": 0.9478, + "step": 1783 + }, + { + "epoch": 0.8087035358114234, + "grad_norm": 1.4608198950446802, + "learning_rate": 7.907464943169014e-05, + "loss": 0.9286, + "step": 1784 + }, + { + "epoch": 0.8091568449682683, + "grad_norm": 1.6393990651129717, + "learning_rate": 7.907194037914402e-05, + "loss": 0.9334, + "step": 1785 + }, + { + "epoch": 0.8096101541251133, + "grad_norm": 1.3680630152650128, + "learning_rate": 7.906922741342823e-05, + "loss": 0.9569, + "step": 1786 + }, + { + "epoch": 0.8100634632819583, + "grad_norm": 1.3451944860453235, + "learning_rate": 7.906651053481449e-05, + "loss": 0.9488, + "step": 1787 + }, + { + "epoch": 0.8105167724388033, + "grad_norm": 1.3761097448645225, + "learning_rate": 7.906378974357491e-05, + "loss": 0.9271, + "step": 1788 + }, + { + "epoch": 0.8109700815956482, + "grad_norm": 1.0354138815239464, + "learning_rate": 7.9061065039982e-05, + "loss": 0.9319, + "step": 1789 + }, + { + "epoch": 0.8114233907524931, + "grad_norm": 1.7840426016503628, + "learning_rate": 7.905833642430859e-05, + "loss": 0.9201, + "step": 1790 + }, + { + "epoch": 0.8118766999093382, + "grad_norm": 1.0991144932865402, + "learning_rate": 7.905560389682802e-05, + "loss": 0.9422, + "step": 1791 + }, + { + "epoch": 0.8123300090661831, + "grad_norm": 1.7410019219364492, + "learning_rate": 7.905286745781393e-05, + "loss": 0.9333, + "step": 1792 + }, + { + "epoch": 0.8127833182230281, + "grad_norm": 1.326663590982856, + "learning_rate": 7.905012710754039e-05, + "loss": 0.9233, + "step": 1793 + }, + { + "epoch": 0.8132366273798731, + "grad_norm": 1.4115634050808967, + "learning_rate": 7.904738284628186e-05, + "loss": 0.9284, + "step": 1794 + }, + { + "epoch": 0.8136899365367181, + "grad_norm": 0.9962439092830639, + "learning_rate": 7.904463467431318e-05, + "loss": 0.9295, + "step": 1795 + }, + { + "epoch": 0.814143245693563, + "grad_norm": 1.381098265914572, + "learning_rate": 7.904188259190958e-05, + "loss": 0.9229, + "step": 1796 + }, + { + "epoch": 0.8145965548504079, + "grad_norm": 0.9978437690135863, + "learning_rate": 7.90391265993467e-05, + "loss": 0.915, + "step": 1797 + }, + { + "epoch": 0.815049864007253, + "grad_norm": 1.4862059807082395, + "learning_rate": 7.903636669690055e-05, + "loss": 0.9314, + "step": 1798 + }, + { + "epoch": 0.8155031731640979, + "grad_norm": 1.1144432734724155, + "learning_rate": 7.903360288484757e-05, + "loss": 0.9241, + "step": 1799 + }, + { + "epoch": 0.8159564823209429, + "grad_norm": 1.2328494633110572, + "learning_rate": 7.903083516346453e-05, + "loss": 0.9214, + "step": 1800 + }, + { + "epoch": 0.8164097914777878, + "grad_norm": 1.1680709997801715, + "learning_rate": 7.902806353302864e-05, + "loss": 0.9151, + "step": 1801 + }, + { + "epoch": 0.8168631006346329, + "grad_norm": 1.330300591677882, + "learning_rate": 7.902528799381748e-05, + "loss": 0.9247, + "step": 1802 + }, + { + "epoch": 0.8173164097914778, + "grad_norm": 1.1634559200739671, + "learning_rate": 7.902250854610904e-05, + "loss": 0.9133, + "step": 1803 + }, + { + "epoch": 0.8177697189483227, + "grad_norm": 1.1629904041348953, + "learning_rate": 7.901972519018168e-05, + "loss": 0.9276, + "step": 1804 + }, + { + "epoch": 0.8182230281051677, + "grad_norm": 0.922258462701729, + "learning_rate": 7.901693792631417e-05, + "loss": 0.9182, + "step": 1805 + }, + { + "epoch": 0.8186763372620127, + "grad_norm": 1.4945601542524378, + "learning_rate": 7.901414675478566e-05, + "loss": 0.9068, + "step": 1806 + }, + { + "epoch": 0.8191296464188577, + "grad_norm": 1.1690537061761888, + "learning_rate": 7.90113516758757e-05, + "loss": 0.9325, + "step": 1807 + }, + { + "epoch": 0.8195829555757026, + "grad_norm": 0.9259118042003727, + "learning_rate": 7.90085526898642e-05, + "loss": 0.9118, + "step": 1808 + }, + { + "epoch": 0.8200362647325476, + "grad_norm": 1.073952677083576, + "learning_rate": 7.90057497970315e-05, + "loss": 0.9234, + "step": 1809 + }, + { + "epoch": 0.8204895738893926, + "grad_norm": 1.0606122751880336, + "learning_rate": 7.900294299765835e-05, + "loss": 0.922, + "step": 1810 + }, + { + "epoch": 0.8209428830462375, + "grad_norm": 0.8178404470192219, + "learning_rate": 7.90001322920258e-05, + "loss": 0.9353, + "step": 1811 + }, + { + "epoch": 0.8213961922030825, + "grad_norm": 1.0864148207638016, + "learning_rate": 7.89973176804154e-05, + "loss": 0.9043, + "step": 1812 + }, + { + "epoch": 0.8218495013599275, + "grad_norm": 1.396013331016924, + "learning_rate": 7.899449916310902e-05, + "loss": 0.9179, + "step": 1813 + }, + { + "epoch": 0.8223028105167725, + "grad_norm": 0.903960400302424, + "learning_rate": 7.899167674038895e-05, + "loss": 0.921, + "step": 1814 + }, + { + "epoch": 0.8227561196736174, + "grad_norm": 1.1104747982865313, + "learning_rate": 7.898885041253784e-05, + "loss": 0.9341, + "step": 1815 + }, + { + "epoch": 0.8232094288304623, + "grad_norm": 1.546051520835061, + "learning_rate": 7.89860201798388e-05, + "loss": 0.9156, + "step": 1816 + }, + { + "epoch": 0.8236627379873074, + "grad_norm": 1.056296595397706, + "learning_rate": 7.898318604257525e-05, + "loss": 0.9224, + "step": 1817 + }, + { + "epoch": 0.8241160471441523, + "grad_norm": 1.1857692078339375, + "learning_rate": 7.898034800103105e-05, + "loss": 0.9347, + "step": 1818 + }, + { + "epoch": 0.8245693563009973, + "grad_norm": 0.9431938504785246, + "learning_rate": 7.897750605549044e-05, + "loss": 0.9094, + "step": 1819 + }, + { + "epoch": 0.8250226654578422, + "grad_norm": 1.00053041695227, + "learning_rate": 7.897466020623803e-05, + "loss": 0.9049, + "step": 1820 + }, + { + "epoch": 0.8254759746146872, + "grad_norm": 1.7200724950872759, + "learning_rate": 7.897181045355888e-05, + "loss": 0.9326, + "step": 1821 + }, + { + "epoch": 0.8259292837715322, + "grad_norm": 0.8190955956053115, + "learning_rate": 7.896895679773836e-05, + "loss": 0.929, + "step": 1822 + }, + { + "epoch": 0.8263825929283771, + "grad_norm": 1.2907631253169443, + "learning_rate": 7.89660992390623e-05, + "loss": 0.93, + "step": 1823 + }, + { + "epoch": 0.8268359020852222, + "grad_norm": 1.3315105914287373, + "learning_rate": 7.896323777781687e-05, + "loss": 0.904, + "step": 1824 + }, + { + "epoch": 0.8272892112420671, + "grad_norm": 1.5573466864762546, + "learning_rate": 7.896037241428867e-05, + "loss": 0.9286, + "step": 1825 + }, + { + "epoch": 0.827742520398912, + "grad_norm": 0.909848425851475, + "learning_rate": 7.895750314876467e-05, + "loss": 0.9295, + "step": 1826 + }, + { + "epoch": 0.828195829555757, + "grad_norm": 1.095915624457976, + "learning_rate": 7.895462998153225e-05, + "loss": 0.9358, + "step": 1827 + }, + { + "epoch": 0.828649138712602, + "grad_norm": 1.2893677561679044, + "learning_rate": 7.895175291287913e-05, + "loss": 0.9237, + "step": 1828 + }, + { + "epoch": 0.829102447869447, + "grad_norm": 1.4999765388227546, + "learning_rate": 7.894887194309348e-05, + "loss": 0.9015, + "step": 1829 + }, + { + "epoch": 0.8295557570262919, + "grad_norm": 1.1911744158471365, + "learning_rate": 7.894598707246386e-05, + "loss": 0.9192, + "step": 1830 + }, + { + "epoch": 0.8300090661831369, + "grad_norm": 1.0728255119231427, + "learning_rate": 7.894309830127915e-05, + "loss": 0.9286, + "step": 1831 + }, + { + "epoch": 0.8304623753399819, + "grad_norm": 1.0087230156208657, + "learning_rate": 7.894020562982872e-05, + "loss": 0.934, + "step": 1832 + }, + { + "epoch": 0.8309156844968268, + "grad_norm": 1.522594314527388, + "learning_rate": 7.893730905840222e-05, + "loss": 0.9018, + "step": 1833 + }, + { + "epoch": 0.8313689936536718, + "grad_norm": 0.7973183538871538, + "learning_rate": 7.89344085872898e-05, + "loss": 0.927, + "step": 1834 + }, + { + "epoch": 0.8318223028105167, + "grad_norm": 1.855544936511742, + "learning_rate": 7.893150421678194e-05, + "loss": 0.927, + "step": 1835 + }, + { + "epoch": 0.8322756119673618, + "grad_norm": 0.8056066842071624, + "learning_rate": 7.89285959471695e-05, + "loss": 0.9172, + "step": 1836 + }, + { + "epoch": 0.8327289211242067, + "grad_norm": 1.9025083726328658, + "learning_rate": 7.892568377874377e-05, + "loss": 0.9146, + "step": 1837 + }, + { + "epoch": 0.8331822302810517, + "grad_norm": 1.2507127246423972, + "learning_rate": 7.892276771179642e-05, + "loss": 0.9508, + "step": 1838 + }, + { + "epoch": 0.8336355394378967, + "grad_norm": 2.2560883551688007, + "learning_rate": 7.891984774661948e-05, + "loss": 0.9015, + "step": 1839 + }, + { + "epoch": 0.8340888485947416, + "grad_norm": 2.0455518330722375, + "learning_rate": 7.891692388350541e-05, + "loss": 0.9422, + "step": 1840 + }, + { + "epoch": 0.8345421577515866, + "grad_norm": 1.372323999919083, + "learning_rate": 7.891399612274704e-05, + "loss": 0.9311, + "step": 1841 + }, + { + "epoch": 0.8349954669084315, + "grad_norm": 1.4975814754658494, + "learning_rate": 7.89110644646376e-05, + "loss": 0.9196, + "step": 1842 + }, + { + "epoch": 0.8354487760652766, + "grad_norm": 1.233209061589469, + "learning_rate": 7.890812890947069e-05, + "loss": 0.9191, + "step": 1843 + }, + { + "epoch": 0.8359020852221215, + "grad_norm": 1.229898355976964, + "learning_rate": 7.89051894575403e-05, + "loss": 0.9145, + "step": 1844 + }, + { + "epoch": 0.8363553943789664, + "grad_norm": 1.1743737632891098, + "learning_rate": 7.890224610914088e-05, + "loss": 0.9159, + "step": 1845 + }, + { + "epoch": 0.8368087035358114, + "grad_norm": 1.256231992133992, + "learning_rate": 7.889929886456716e-05, + "loss": 0.9212, + "step": 1846 + }, + { + "epoch": 0.8372620126926564, + "grad_norm": 1.2181987229451725, + "learning_rate": 7.889634772411434e-05, + "loss": 0.9256, + "step": 1847 + }, + { + "epoch": 0.8377153218495014, + "grad_norm": 1.1724636515352895, + "learning_rate": 7.889339268807798e-05, + "loss": 0.9204, + "step": 1848 + }, + { + "epoch": 0.8381686310063463, + "grad_norm": 1.5135944704346873, + "learning_rate": 7.889043375675404e-05, + "loss": 0.9264, + "step": 1849 + }, + { + "epoch": 0.8386219401631912, + "grad_norm": 1.0930376508916606, + "learning_rate": 7.888747093043886e-05, + "loss": 0.9148, + "step": 1850 + }, + { + "epoch": 0.8390752493200363, + "grad_norm": 1.1300623826826586, + "learning_rate": 7.888450420942917e-05, + "loss": 0.949, + "step": 1851 + }, + { + "epoch": 0.8395285584768812, + "grad_norm": 1.0776064343592024, + "learning_rate": 7.888153359402212e-05, + "loss": 0.9133, + "step": 1852 + }, + { + "epoch": 0.8399818676337262, + "grad_norm": 1.169310277838133, + "learning_rate": 7.887855908451519e-05, + "loss": 0.9187, + "step": 1853 + }, + { + "epoch": 0.8404351767905711, + "grad_norm": 1.3006570776997852, + "learning_rate": 7.887558068120633e-05, + "loss": 0.9509, + "step": 1854 + }, + { + "epoch": 0.8408884859474162, + "grad_norm": 1.2783096636890436, + "learning_rate": 7.887259838439379e-05, + "loss": 0.8994, + "step": 1855 + }, + { + "epoch": 0.8413417951042611, + "grad_norm": 1.3633806409595846, + "learning_rate": 7.88696121943763e-05, + "loss": 0.9221, + "step": 1856 + }, + { + "epoch": 0.841795104261106, + "grad_norm": 1.1454200097862939, + "learning_rate": 7.88666221114529e-05, + "loss": 0.909, + "step": 1857 + }, + { + "epoch": 0.8422484134179511, + "grad_norm": 0.792917901865371, + "learning_rate": 7.886362813592306e-05, + "loss": 0.8995, + "step": 1858 + }, + { + "epoch": 0.842701722574796, + "grad_norm": 0.8726086814655074, + "learning_rate": 7.886063026808665e-05, + "loss": 0.9291, + "step": 1859 + }, + { + "epoch": 0.843155031731641, + "grad_norm": 1.148682263673849, + "learning_rate": 7.885762850824391e-05, + "loss": 0.9091, + "step": 1860 + }, + { + "epoch": 0.8436083408884859, + "grad_norm": 1.713108628393308, + "learning_rate": 7.885462285669547e-05, + "loss": 0.9482, + "step": 1861 + }, + { + "epoch": 0.844061650045331, + "grad_norm": 0.7722409650883388, + "learning_rate": 7.885161331374237e-05, + "loss": 0.9278, + "step": 1862 + }, + { + "epoch": 0.8445149592021759, + "grad_norm": 1.1057762932736723, + "learning_rate": 7.884859987968602e-05, + "loss": 0.9402, + "step": 1863 + }, + { + "epoch": 0.8449682683590208, + "grad_norm": 1.4314360738551417, + "learning_rate": 7.88455825548282e-05, + "loss": 0.9309, + "step": 1864 + }, + { + "epoch": 0.8454215775158658, + "grad_norm": 0.6300549431830416, + "learning_rate": 7.884256133947114e-05, + "loss": 0.9189, + "step": 1865 + }, + { + "epoch": 0.8458748866727108, + "grad_norm": 1.1571207443230156, + "learning_rate": 7.883953623391739e-05, + "loss": 0.9275, + "step": 1866 + }, + { + "epoch": 0.8463281958295558, + "grad_norm": 1.1712971078196217, + "learning_rate": 7.883650723846995e-05, + "loss": 0.914, + "step": 1867 + }, + { + "epoch": 0.8467815049864007, + "grad_norm": 0.9520714310253804, + "learning_rate": 7.883347435343215e-05, + "loss": 0.9196, + "step": 1868 + }, + { + "epoch": 0.8472348141432456, + "grad_norm": 1.3441081009257478, + "learning_rate": 7.883043757910778e-05, + "loss": 0.9054, + "step": 1869 + }, + { + "epoch": 0.8476881233000907, + "grad_norm": 1.0893051796953523, + "learning_rate": 7.882739691580098e-05, + "loss": 0.9284, + "step": 1870 + }, + { + "epoch": 0.8481414324569356, + "grad_norm": 1.4188753979500799, + "learning_rate": 7.882435236381626e-05, + "loss": 0.8936, + "step": 1871 + }, + { + "epoch": 0.8485947416137806, + "grad_norm": 1.056036314234669, + "learning_rate": 7.882130392345853e-05, + "loss": 0.9049, + "step": 1872 + }, + { + "epoch": 0.8490480507706256, + "grad_norm": 2.0425290191911376, + "learning_rate": 7.881825159503314e-05, + "loss": 0.9247, + "step": 1873 + }, + { + "epoch": 0.8495013599274706, + "grad_norm": 0.9805987720396707, + "learning_rate": 7.881519537884575e-05, + "loss": 0.9249, + "step": 1874 + }, + { + "epoch": 0.8499546690843155, + "grad_norm": 2.558584057567685, + "learning_rate": 7.881213527520247e-05, + "loss": 0.9321, + "step": 1875 + }, + { + "epoch": 0.8504079782411604, + "grad_norm": 2.151800476607534, + "learning_rate": 7.880907128440978e-05, + "loss": 0.9407, + "step": 1876 + }, + { + "epoch": 0.8508612873980055, + "grad_norm": 1.7239138527213016, + "learning_rate": 7.880600340677454e-05, + "loss": 0.9219, + "step": 1877 + }, + { + "epoch": 0.8513145965548504, + "grad_norm": 1.6079557910022217, + "learning_rate": 7.880293164260401e-05, + "loss": 0.896, + "step": 1878 + }, + { + "epoch": 0.8517679057116954, + "grad_norm": 1.658867651037123, + "learning_rate": 7.879985599220584e-05, + "loss": 0.9122, + "step": 1879 + }, + { + "epoch": 0.8522212148685403, + "grad_norm": 1.5748051908391758, + "learning_rate": 7.879677645588807e-05, + "loss": 0.9185, + "step": 1880 + }, + { + "epoch": 0.8526745240253853, + "grad_norm": 1.3867307016838302, + "learning_rate": 7.879369303395911e-05, + "loss": 0.9315, + "step": 1881 + }, + { + "epoch": 0.8531278331822303, + "grad_norm": 1.4805550659807891, + "learning_rate": 7.879060572672778e-05, + "loss": 0.9154, + "step": 1882 + }, + { + "epoch": 0.8535811423390752, + "grad_norm": 1.0552828322901946, + "learning_rate": 7.878751453450328e-05, + "loss": 0.9263, + "step": 1883 + }, + { + "epoch": 0.8540344514959202, + "grad_norm": 2.0524396724954443, + "learning_rate": 7.87844194575952e-05, + "loss": 0.928, + "step": 1884 + }, + { + "epoch": 0.8544877606527652, + "grad_norm": 1.519824649737183, + "learning_rate": 7.878132049631353e-05, + "loss": 0.9371, + "step": 1885 + }, + { + "epoch": 0.8549410698096102, + "grad_norm": 2.1640222226736245, + "learning_rate": 7.877821765096864e-05, + "loss": 0.9399, + "step": 1886 + }, + { + "epoch": 0.8553943789664551, + "grad_norm": 1.9649005494130922, + "learning_rate": 7.877511092187127e-05, + "loss": 0.9313, + "step": 1887 + }, + { + "epoch": 0.8558476881233001, + "grad_norm": 1.950310635034749, + "learning_rate": 7.877200030933261e-05, + "loss": 0.9175, + "step": 1888 + }, + { + "epoch": 0.8563009972801451, + "grad_norm": 1.9368759888736782, + "learning_rate": 7.876888581366416e-05, + "loss": 0.9236, + "step": 1889 + }, + { + "epoch": 0.85675430643699, + "grad_norm": 1.659245595604548, + "learning_rate": 7.876576743517785e-05, + "loss": 0.9035, + "step": 1890 + }, + { + "epoch": 0.857207615593835, + "grad_norm": 1.6672005733032513, + "learning_rate": 7.8762645174186e-05, + "loss": 0.9056, + "step": 1891 + }, + { + "epoch": 0.85766092475068, + "grad_norm": 1.5751160697561608, + "learning_rate": 7.875951903100132e-05, + "loss": 0.9195, + "step": 1892 + }, + { + "epoch": 0.858114233907525, + "grad_norm": 1.4796178311390398, + "learning_rate": 7.87563890059369e-05, + "loss": 0.9312, + "step": 1893 + }, + { + "epoch": 0.8585675430643699, + "grad_norm": 1.6292273630805565, + "learning_rate": 7.875325509930622e-05, + "loss": 0.9215, + "step": 1894 + }, + { + "epoch": 0.8590208522212148, + "grad_norm": 0.73607429564796, + "learning_rate": 7.875011731142313e-05, + "loss": 0.8984, + "step": 1895 + }, + { + "epoch": 0.8594741613780599, + "grad_norm": 1.6700320955660077, + "learning_rate": 7.874697564260191e-05, + "loss": 0.942, + "step": 1896 + }, + { + "epoch": 0.8599274705349048, + "grad_norm": 1.0564377631010868, + "learning_rate": 7.874383009315722e-05, + "loss": 0.9428, + "step": 1897 + }, + { + "epoch": 0.8603807796917498, + "grad_norm": 1.5480957520865852, + "learning_rate": 7.874068066340407e-05, + "loss": 0.9147, + "step": 1898 + }, + { + "epoch": 0.8608340888485947, + "grad_norm": 0.8970282649964363, + "learning_rate": 7.87375273536579e-05, + "loss": 0.9235, + "step": 1899 + }, + { + "epoch": 0.8612873980054397, + "grad_norm": 1.4433493918516522, + "learning_rate": 7.87343701642345e-05, + "loss": 0.9056, + "step": 1900 + }, + { + "epoch": 0.8617407071622847, + "grad_norm": 1.243568067087514, + "learning_rate": 7.873120909545012e-05, + "loss": 0.9286, + "step": 1901 + }, + { + "epoch": 0.8621940163191296, + "grad_norm": 1.173756480819487, + "learning_rate": 7.872804414762129e-05, + "loss": 0.9262, + "step": 1902 + }, + { + "epoch": 0.8626473254759747, + "grad_norm": 1.4821238794615876, + "learning_rate": 7.872487532106505e-05, + "loss": 0.9174, + "step": 1903 + }, + { + "epoch": 0.8631006346328196, + "grad_norm": 0.8928453738158387, + "learning_rate": 7.872170261609872e-05, + "loss": 0.9331, + "step": 1904 + }, + { + "epoch": 0.8635539437896645, + "grad_norm": 1.930574366895751, + "learning_rate": 7.871852603304007e-05, + "loss": 0.9371, + "step": 1905 + }, + { + "epoch": 0.8640072529465095, + "grad_norm": 1.3834056912924555, + "learning_rate": 7.871534557220725e-05, + "loss": 0.9259, + "step": 1906 + }, + { + "epoch": 0.8644605621033545, + "grad_norm": 1.6355135580826483, + "learning_rate": 7.871216123391878e-05, + "loss": 0.9242, + "step": 1907 + }, + { + "epoch": 0.8649138712601995, + "grad_norm": 1.5058385576921285, + "learning_rate": 7.87089730184936e-05, + "loss": 0.9287, + "step": 1908 + }, + { + "epoch": 0.8653671804170444, + "grad_norm": 1.3462543545052719, + "learning_rate": 7.8705780926251e-05, + "loss": 0.9111, + "step": 1909 + }, + { + "epoch": 0.8658204895738894, + "grad_norm": 1.0877757397201948, + "learning_rate": 7.87025849575107e-05, + "loss": 0.926, + "step": 1910 + }, + { + "epoch": 0.8662737987307344, + "grad_norm": 1.3593626934644014, + "learning_rate": 7.869938511259276e-05, + "loss": 0.92, + "step": 1911 + }, + { + "epoch": 0.8667271078875793, + "grad_norm": 1.1836973947487357, + "learning_rate": 7.869618139181766e-05, + "loss": 0.9158, + "step": 1912 + }, + { + "epoch": 0.8671804170444243, + "grad_norm": 1.5625996137583116, + "learning_rate": 7.869297379550628e-05, + "loss": 0.908, + "step": 1913 + }, + { + "epoch": 0.8676337262012692, + "grad_norm": 1.0627189959446726, + "learning_rate": 7.868976232397985e-05, + "loss": 0.9415, + "step": 1914 + }, + { + "epoch": 0.8680870353581143, + "grad_norm": 1.6716850933788188, + "learning_rate": 7.868654697756003e-05, + "loss": 0.9172, + "step": 1915 + }, + { + "epoch": 0.8685403445149592, + "grad_norm": 1.4701949106090457, + "learning_rate": 7.868332775656883e-05, + "loss": 0.9316, + "step": 1916 + }, + { + "epoch": 0.8689936536718041, + "grad_norm": 1.038897792998551, + "learning_rate": 7.868010466132865e-05, + "loss": 0.8974, + "step": 1917 + }, + { + "epoch": 0.8694469628286491, + "grad_norm": 1.0459557711153045, + "learning_rate": 7.867687769216233e-05, + "loss": 0.9142, + "step": 1918 + }, + { + "epoch": 0.8699002719854941, + "grad_norm": 1.416156212127103, + "learning_rate": 7.867364684939303e-05, + "loss": 0.9207, + "step": 1919 + }, + { + "epoch": 0.8703535811423391, + "grad_norm": 0.9907253177086155, + "learning_rate": 7.867041213334433e-05, + "loss": 0.9206, + "step": 1920 + }, + { + "epoch": 0.870806890299184, + "grad_norm": 1.6835315667242408, + "learning_rate": 7.86671735443402e-05, + "loss": 0.9275, + "step": 1921 + }, + { + "epoch": 0.8712601994560291, + "grad_norm": 1.4598022631944116, + "learning_rate": 7.866393108270502e-05, + "loss": 0.9039, + "step": 1922 + }, + { + "epoch": 0.871713508612874, + "grad_norm": 1.3164116509609889, + "learning_rate": 7.86606847487635e-05, + "loss": 0.9024, + "step": 1923 + }, + { + "epoch": 0.8721668177697189, + "grad_norm": 1.3453272903569233, + "learning_rate": 7.865743454284077e-05, + "loss": 0.91, + "step": 1924 + }, + { + "epoch": 0.8726201269265639, + "grad_norm": 0.9856561617296831, + "learning_rate": 7.865418046526239e-05, + "loss": 0.9117, + "step": 1925 + }, + { + "epoch": 0.8730734360834089, + "grad_norm": 1.2255954495451855, + "learning_rate": 7.865092251635421e-05, + "loss": 0.9191, + "step": 1926 + }, + { + "epoch": 0.8735267452402539, + "grad_norm": 0.9522296793462942, + "learning_rate": 7.864766069644255e-05, + "loss": 0.9114, + "step": 1927 + }, + { + "epoch": 0.8739800543970988, + "grad_norm": 1.0007976348293464, + "learning_rate": 7.864439500585409e-05, + "loss": 0.9072, + "step": 1928 + }, + { + "epoch": 0.8744333635539437, + "grad_norm": 1.593905339012299, + "learning_rate": 7.86411254449159e-05, + "loss": 0.8908, + "step": 1929 + }, + { + "epoch": 0.8748866727107888, + "grad_norm": 0.9323896646646574, + "learning_rate": 7.863785201395543e-05, + "loss": 0.9132, + "step": 1930 + }, + { + "epoch": 0.8753399818676337, + "grad_norm": 1.1368012813161903, + "learning_rate": 7.863457471330052e-05, + "loss": 0.9132, + "step": 1931 + }, + { + "epoch": 0.8757932910244787, + "grad_norm": 1.0172639352303696, + "learning_rate": 7.86312935432794e-05, + "loss": 0.8964, + "step": 1932 + }, + { + "epoch": 0.8762466001813236, + "grad_norm": 1.2784740082318715, + "learning_rate": 7.862800850422072e-05, + "loss": 0.9267, + "step": 1933 + }, + { + "epoch": 0.8766999093381687, + "grad_norm": 1.591670422534571, + "learning_rate": 7.862471959645345e-05, + "loss": 0.9229, + "step": 1934 + }, + { + "epoch": 0.8771532184950136, + "grad_norm": 0.6819382224568834, + "learning_rate": 7.8621426820307e-05, + "loss": 0.8821, + "step": 1935 + }, + { + "epoch": 0.8776065276518585, + "grad_norm": 1.053769190765739, + "learning_rate": 7.861813017611114e-05, + "loss": 0.9235, + "step": 1936 + }, + { + "epoch": 0.8780598368087036, + "grad_norm": 1.6341179947462707, + "learning_rate": 7.861482966419606e-05, + "loss": 0.9178, + "step": 1937 + }, + { + "epoch": 0.8785131459655485, + "grad_norm": 0.9667888648931678, + "learning_rate": 7.861152528489228e-05, + "loss": 0.9115, + "step": 1938 + }, + { + "epoch": 0.8789664551223935, + "grad_norm": 1.5333161777049396, + "learning_rate": 7.860821703853079e-05, + "loss": 0.9404, + "step": 1939 + }, + { + "epoch": 0.8794197642792384, + "grad_norm": 1.1719089114984325, + "learning_rate": 7.86049049254429e-05, + "loss": 0.933, + "step": 1940 + }, + { + "epoch": 0.8798730734360835, + "grad_norm": 1.4785235681664683, + "learning_rate": 7.860158894596031e-05, + "loss": 0.9034, + "step": 1941 + }, + { + "epoch": 0.8803263825929284, + "grad_norm": 1.375099759841513, + "learning_rate": 7.859826910041512e-05, + "loss": 0.9077, + "step": 1942 + }, + { + "epoch": 0.8807796917497733, + "grad_norm": 1.2023429138552386, + "learning_rate": 7.859494538913987e-05, + "loss": 0.9129, + "step": 1943 + }, + { + "epoch": 0.8812330009066183, + "grad_norm": 1.0487767734071967, + "learning_rate": 7.85916178124674e-05, + "loss": 0.9252, + "step": 1944 + }, + { + "epoch": 0.8816863100634633, + "grad_norm": 1.315668702077222, + "learning_rate": 7.858828637073098e-05, + "loss": 0.9331, + "step": 1945 + }, + { + "epoch": 0.8821396192203083, + "grad_norm": 0.8512507988453885, + "learning_rate": 7.858495106426428e-05, + "loss": 0.8915, + "step": 1946 + }, + { + "epoch": 0.8825929283771532, + "grad_norm": 1.2463898176587642, + "learning_rate": 7.858161189340133e-05, + "loss": 0.9493, + "step": 1947 + }, + { + "epoch": 0.8830462375339981, + "grad_norm": 1.046572294355905, + "learning_rate": 7.857826885847657e-05, + "loss": 0.9258, + "step": 1948 + }, + { + "epoch": 0.8834995466908432, + "grad_norm": 0.9816737819465419, + "learning_rate": 7.857492195982479e-05, + "loss": 0.9098, + "step": 1949 + }, + { + "epoch": 0.8839528558476881, + "grad_norm": 1.0349480003304663, + "learning_rate": 7.857157119778122e-05, + "loss": 0.9183, + "step": 1950 + }, + { + "epoch": 0.8844061650045331, + "grad_norm": 0.9675622840423959, + "learning_rate": 7.856821657268142e-05, + "loss": 0.9086, + "step": 1951 + }, + { + "epoch": 0.8848594741613781, + "grad_norm": 1.1068727139880046, + "learning_rate": 7.856485808486139e-05, + "loss": 0.9394, + "step": 1952 + }, + { + "epoch": 0.885312783318223, + "grad_norm": 1.6612026848770702, + "learning_rate": 7.856149573465748e-05, + "loss": 0.9161, + "step": 1953 + }, + { + "epoch": 0.885766092475068, + "grad_norm": 0.8600754329714205, + "learning_rate": 7.855812952240643e-05, + "loss": 0.9185, + "step": 1954 + }, + { + "epoch": 0.8862194016319129, + "grad_norm": 0.9599775104914932, + "learning_rate": 7.855475944844541e-05, + "loss": 0.9034, + "step": 1955 + }, + { + "epoch": 0.886672710788758, + "grad_norm": 0.8665775471465307, + "learning_rate": 7.855138551311191e-05, + "loss": 0.9257, + "step": 1956 + }, + { + "epoch": 0.8871260199456029, + "grad_norm": 0.9915757495132598, + "learning_rate": 7.854800771674385e-05, + "loss": 0.9006, + "step": 1957 + }, + { + "epoch": 0.8875793291024479, + "grad_norm": 1.124512493442088, + "learning_rate": 7.854462605967953e-05, + "loss": 0.9238, + "step": 1958 + }, + { + "epoch": 0.8880326382592928, + "grad_norm": 1.0292277668198335, + "learning_rate": 7.854124054225763e-05, + "loss": 0.9123, + "step": 1959 + }, + { + "epoch": 0.8884859474161378, + "grad_norm": 1.6918327331351273, + "learning_rate": 7.853785116481721e-05, + "loss": 0.9248, + "step": 1960 + }, + { + "epoch": 0.8889392565729828, + "grad_norm": 1.011139398594677, + "learning_rate": 7.853445792769775e-05, + "loss": 0.9295, + "step": 1961 + }, + { + "epoch": 0.8893925657298277, + "grad_norm": 1.5322599696983101, + "learning_rate": 7.853106083123906e-05, + "loss": 0.9036, + "step": 1962 + }, + { + "epoch": 0.8898458748866727, + "grad_norm": 0.7864815261429484, + "learning_rate": 7.852765987578139e-05, + "loss": 0.9174, + "step": 1963 + }, + { + "epoch": 0.8902991840435177, + "grad_norm": 1.7542102992351025, + "learning_rate": 7.852425506166537e-05, + "loss": 0.9031, + "step": 1964 + }, + { + "epoch": 0.8907524932003626, + "grad_norm": 0.9686390603926942, + "learning_rate": 7.852084638923198e-05, + "loss": 0.9362, + "step": 1965 + }, + { + "epoch": 0.8912058023572076, + "grad_norm": 1.5086809646566532, + "learning_rate": 7.851743385882259e-05, + "loss": 0.9296, + "step": 1966 + }, + { + "epoch": 0.8916591115140526, + "grad_norm": 1.1535192938110832, + "learning_rate": 7.851401747077902e-05, + "loss": 0.9233, + "step": 1967 + }, + { + "epoch": 0.8921124206708976, + "grad_norm": 1.4339655942012337, + "learning_rate": 7.851059722544341e-05, + "loss": 0.9133, + "step": 1968 + }, + { + "epoch": 0.8925657298277425, + "grad_norm": 1.4053030276027347, + "learning_rate": 7.85071731231583e-05, + "loss": 0.9258, + "step": 1969 + }, + { + "epoch": 0.8930190389845875, + "grad_norm": 1.5118303735546952, + "learning_rate": 7.850374516426664e-05, + "loss": 0.9179, + "step": 1970 + }, + { + "epoch": 0.8934723481414325, + "grad_norm": 1.4054411787286962, + "learning_rate": 7.850031334911172e-05, + "loss": 0.9226, + "step": 1971 + }, + { + "epoch": 0.8939256572982774, + "grad_norm": 1.4225297470218643, + "learning_rate": 7.849687767803729e-05, + "loss": 0.9288, + "step": 1972 + }, + { + "epoch": 0.8943789664551224, + "grad_norm": 0.979544679670593, + "learning_rate": 7.849343815138743e-05, + "loss": 0.9125, + "step": 1973 + }, + { + "epoch": 0.8948322756119673, + "grad_norm": 1.2215022477361774, + "learning_rate": 7.848999476950658e-05, + "loss": 0.9198, + "step": 1974 + }, + { + "epoch": 0.8952855847688124, + "grad_norm": 1.415951447041961, + "learning_rate": 7.848654753273964e-05, + "loss": 0.9276, + "step": 1975 + }, + { + "epoch": 0.8957388939256573, + "grad_norm": 1.2800237313489038, + "learning_rate": 7.848309644143187e-05, + "loss": 0.9175, + "step": 1976 + }, + { + "epoch": 0.8961922030825022, + "grad_norm": 0.8672779343681286, + "learning_rate": 7.847964149592888e-05, + "loss": 0.945, + "step": 1977 + }, + { + "epoch": 0.8966455122393472, + "grad_norm": 1.0887878880562545, + "learning_rate": 7.84761826965767e-05, + "loss": 0.9167, + "step": 1978 + }, + { + "epoch": 0.8970988213961922, + "grad_norm": 1.206296033443608, + "learning_rate": 7.847272004372175e-05, + "loss": 0.9266, + "step": 1979 + }, + { + "epoch": 0.8975521305530372, + "grad_norm": 1.036993095428557, + "learning_rate": 7.846925353771083e-05, + "loss": 0.9239, + "step": 1980 + }, + { + "epoch": 0.8980054397098821, + "grad_norm": 0.9631618052852938, + "learning_rate": 7.84657831788911e-05, + "loss": 0.9247, + "step": 1981 + }, + { + "epoch": 0.898458748866727, + "grad_norm": 1.6676870877470913, + "learning_rate": 7.846230896761013e-05, + "loss": 0.9261, + "step": 1982 + }, + { + "epoch": 0.8989120580235721, + "grad_norm": 1.110322302655846, + "learning_rate": 7.845883090421587e-05, + "loss": 0.9078, + "step": 1983 + }, + { + "epoch": 0.899365367180417, + "grad_norm": 0.9280638904351795, + "learning_rate": 7.84553489890567e-05, + "loss": 0.9251, + "step": 1984 + }, + { + "epoch": 0.899818676337262, + "grad_norm": 1.191271806473513, + "learning_rate": 7.845186322248127e-05, + "loss": 0.9068, + "step": 1985 + }, + { + "epoch": 0.900271985494107, + "grad_norm": 1.1485611312108646, + "learning_rate": 7.844837360483876e-05, + "loss": 0.901, + "step": 1986 + }, + { + "epoch": 0.900725294650952, + "grad_norm": 1.081102725502325, + "learning_rate": 7.844488013647863e-05, + "loss": 0.9174, + "step": 1987 + }, + { + "epoch": 0.9011786038077969, + "grad_norm": 1.7013648372956993, + "learning_rate": 7.844138281775076e-05, + "loss": 0.913, + "step": 1988 + }, + { + "epoch": 0.9016319129646418, + "grad_norm": 0.8518688363766326, + "learning_rate": 7.843788164900542e-05, + "loss": 0.9236, + "step": 1989 + }, + { + "epoch": 0.9020852221214869, + "grad_norm": 0.9602186964432055, + "learning_rate": 7.843437663059326e-05, + "loss": 0.9257, + "step": 1990 + }, + { + "epoch": 0.9025385312783318, + "grad_norm": 1.1007205064140608, + "learning_rate": 7.843086776286535e-05, + "loss": 0.9005, + "step": 1991 + }, + { + "epoch": 0.9029918404351768, + "grad_norm": 1.5633732150140909, + "learning_rate": 7.842735504617308e-05, + "loss": 0.9149, + "step": 1992 + }, + { + "epoch": 0.9034451495920217, + "grad_norm": 1.0836193387359279, + "learning_rate": 7.842383848086825e-05, + "loss": 0.9182, + "step": 1993 + }, + { + "epoch": 0.9038984587488668, + "grad_norm": 1.31433170520456, + "learning_rate": 7.842031806730308e-05, + "loss": 0.9169, + "step": 1994 + }, + { + "epoch": 0.9043517679057117, + "grad_norm": 0.9113846521936262, + "learning_rate": 7.841679380583015e-05, + "loss": 0.9224, + "step": 1995 + }, + { + "epoch": 0.9048050770625566, + "grad_norm": 1.0190960678703165, + "learning_rate": 7.84132656968024e-05, + "loss": 0.9154, + "step": 1996 + }, + { + "epoch": 0.9052583862194016, + "grad_norm": 1.5887734062362449, + "learning_rate": 7.840973374057321e-05, + "loss": 0.9137, + "step": 1997 + }, + { + "epoch": 0.9057116953762466, + "grad_norm": 1.2529172088740863, + "learning_rate": 7.840619793749629e-05, + "loss": 0.9007, + "step": 1998 + }, + { + "epoch": 0.9061650045330916, + "grad_norm": 1.5400381611178562, + "learning_rate": 7.840265828792577e-05, + "loss": 0.9382, + "step": 1999 + }, + { + "epoch": 0.9066183136899365, + "grad_norm": 1.075439265816313, + "learning_rate": 7.839911479221617e-05, + "loss": 0.9251, + "step": 2000 + }, + { + "epoch": 0.9070716228467816, + "grad_norm": 2.318803572368361, + "learning_rate": 7.839556745072237e-05, + "loss": 0.9076, + "step": 2001 + }, + { + "epoch": 0.9075249320036265, + "grad_norm": 1.7876709606412562, + "learning_rate": 7.839201626379964e-05, + "loss": 0.9012, + "step": 2002 + }, + { + "epoch": 0.9079782411604714, + "grad_norm": 2.0054168572832514, + "learning_rate": 7.838846123180365e-05, + "loss": 0.9187, + "step": 2003 + }, + { + "epoch": 0.9084315503173164, + "grad_norm": 1.689293107845989, + "learning_rate": 7.838490235509046e-05, + "loss": 0.9477, + "step": 2004 + }, + { + "epoch": 0.9088848594741614, + "grad_norm": 1.6622720273573734, + "learning_rate": 7.838133963401646e-05, + "loss": 0.9326, + "step": 2005 + }, + { + "epoch": 0.9093381686310064, + "grad_norm": 1.535687367949534, + "learning_rate": 7.837777306893852e-05, + "loss": 0.899, + "step": 2006 + }, + { + "epoch": 0.9097914777878513, + "grad_norm": 0.967309844911602, + "learning_rate": 7.837420266021381e-05, + "loss": 0.9142, + "step": 2007 + }, + { + "epoch": 0.9102447869446962, + "grad_norm": 1.3293369030876259, + "learning_rate": 7.837062840819992e-05, + "loss": 0.9093, + "step": 2008 + }, + { + "epoch": 0.9106980961015413, + "grad_norm": 1.5396209453348517, + "learning_rate": 7.836705031325483e-05, + "loss": 0.8787, + "step": 2009 + }, + { + "epoch": 0.9111514052583862, + "grad_norm": 0.9061096488375496, + "learning_rate": 7.836346837573689e-05, + "loss": 0.9106, + "step": 2010 + }, + { + "epoch": 0.9116047144152312, + "grad_norm": 1.1583135400807418, + "learning_rate": 7.835988259600484e-05, + "loss": 0.9056, + "step": 2011 + }, + { + "epoch": 0.9120580235720761, + "grad_norm": 1.088696554693576, + "learning_rate": 7.835629297441782e-05, + "loss": 0.9363, + "step": 2012 + }, + { + "epoch": 0.9125113327289212, + "grad_norm": 1.131126391453818, + "learning_rate": 7.835269951133533e-05, + "loss": 0.9096, + "step": 2013 + }, + { + "epoch": 0.9129646418857661, + "grad_norm": 0.9493205325075973, + "learning_rate": 7.834910220711726e-05, + "loss": 0.9125, + "step": 2014 + }, + { + "epoch": 0.913417951042611, + "grad_norm": 1.1977154060555357, + "learning_rate": 7.83455010621239e-05, + "loss": 0.9121, + "step": 2015 + }, + { + "epoch": 0.9138712601994561, + "grad_norm": 1.2816978519780815, + "learning_rate": 7.834189607671592e-05, + "loss": 0.9347, + "step": 2016 + }, + { + "epoch": 0.914324569356301, + "grad_norm": 1.5159415415529491, + "learning_rate": 7.833828725125437e-05, + "loss": 0.9276, + "step": 2017 + }, + { + "epoch": 0.914777878513146, + "grad_norm": 0.8543607108804667, + "learning_rate": 7.833467458610066e-05, + "loss": 0.9265, + "step": 2018 + }, + { + "epoch": 0.9152311876699909, + "grad_norm": 1.0664333823337842, + "learning_rate": 7.833105808161663e-05, + "loss": 0.9322, + "step": 2019 + }, + { + "epoch": 0.915684496826836, + "grad_norm": 1.4518516774385624, + "learning_rate": 7.832743773816449e-05, + "loss": 0.9165, + "step": 2020 + }, + { + "epoch": 0.9161378059836809, + "grad_norm": 1.4491375808051148, + "learning_rate": 7.83238135561068e-05, + "loss": 0.9188, + "step": 2021 + }, + { + "epoch": 0.9165911151405258, + "grad_norm": 1.1109842016709826, + "learning_rate": 7.832018553580656e-05, + "loss": 0.9268, + "step": 2022 + }, + { + "epoch": 0.9170444242973708, + "grad_norm": 1.1780706654238313, + "learning_rate": 7.831655367762713e-05, + "loss": 0.9318, + "step": 2023 + }, + { + "epoch": 0.9174977334542158, + "grad_norm": 1.243395912827345, + "learning_rate": 7.831291798193223e-05, + "loss": 0.8992, + "step": 2024 + }, + { + "epoch": 0.9179510426110608, + "grad_norm": 1.693192681398444, + "learning_rate": 7.830927844908598e-05, + "loss": 0.9155, + "step": 2025 + }, + { + "epoch": 0.9184043517679057, + "grad_norm": 0.6778118755232769, + "learning_rate": 7.830563507945292e-05, + "loss": 0.9197, + "step": 2026 + }, + { + "epoch": 0.9188576609247506, + "grad_norm": 1.2273949794572174, + "learning_rate": 7.830198787339793e-05, + "loss": 0.9405, + "step": 2027 + }, + { + "epoch": 0.9193109700815957, + "grad_norm": 1.995113093100038, + "learning_rate": 7.82983368312863e-05, + "loss": 0.906, + "step": 2028 + }, + { + "epoch": 0.9197642792384406, + "grad_norm": 0.8271095546835355, + "learning_rate": 7.829468195348364e-05, + "loss": 0.9144, + "step": 2029 + }, + { + "epoch": 0.9202175883952856, + "grad_norm": 2.563433223909738, + "learning_rate": 7.829102324035606e-05, + "loss": 0.9236, + "step": 2030 + }, + { + "epoch": 0.9206708975521306, + "grad_norm": 1.5627701966698677, + "learning_rate": 7.828736069226997e-05, + "loss": 0.9275, + "step": 2031 + }, + { + "epoch": 0.9211242067089755, + "grad_norm": 2.7296710824367993, + "learning_rate": 7.828369430959218e-05, + "loss": 0.9174, + "step": 2032 + }, + { + "epoch": 0.9215775158658205, + "grad_norm": 2.7024211857838147, + "learning_rate": 7.828002409268988e-05, + "loss": 0.9473, + "step": 2033 + }, + { + "epoch": 0.9220308250226654, + "grad_norm": 1.8361234295274025, + "learning_rate": 7.827635004193068e-05, + "loss": 0.9293, + "step": 2034 + }, + { + "epoch": 0.9224841341795105, + "grad_norm": 1.428809783355935, + "learning_rate": 7.827267215768253e-05, + "loss": 0.9068, + "step": 2035 + }, + { + "epoch": 0.9229374433363554, + "grad_norm": 1.5874831759118813, + "learning_rate": 7.826899044031378e-05, + "loss": 0.8993, + "step": 2036 + }, + { + "epoch": 0.9233907524932004, + "grad_norm": 1.3359129198897113, + "learning_rate": 7.826530489019317e-05, + "loss": 0.9245, + "step": 2037 + }, + { + "epoch": 0.9238440616500453, + "grad_norm": 0.9722651297547459, + "learning_rate": 7.826161550768981e-05, + "loss": 0.9261, + "step": 2038 + }, + { + "epoch": 0.9242973708068903, + "grad_norm": 1.6712570611625996, + "learning_rate": 7.825792229317321e-05, + "loss": 0.914, + "step": 2039 + }, + { + "epoch": 0.9247506799637353, + "grad_norm": 1.163357024194584, + "learning_rate": 7.825422524701325e-05, + "loss": 0.9261, + "step": 2040 + }, + { + "epoch": 0.9252039891205802, + "grad_norm": 1.4634406551460675, + "learning_rate": 7.82505243695802e-05, + "loss": 0.8953, + "step": 2041 + }, + { + "epoch": 0.9256572982774252, + "grad_norm": 1.298249299172218, + "learning_rate": 7.824681966124473e-05, + "loss": 0.9328, + "step": 2042 + }, + { + "epoch": 0.9261106074342702, + "grad_norm": 1.7048809867193615, + "learning_rate": 7.824311112237786e-05, + "loss": 0.92, + "step": 2043 + }, + { + "epoch": 0.9265639165911151, + "grad_norm": 1.097002574606533, + "learning_rate": 7.823939875335102e-05, + "loss": 0.9132, + "step": 2044 + }, + { + "epoch": 0.9270172257479601, + "grad_norm": 1.7998620031400903, + "learning_rate": 7.8235682554536e-05, + "loss": 0.9244, + "step": 2045 + }, + { + "epoch": 0.927470534904805, + "grad_norm": 1.549324269284471, + "learning_rate": 7.823196252630501e-05, + "loss": 0.9287, + "step": 2046 + }, + { + "epoch": 0.9279238440616501, + "grad_norm": 1.2848002863591101, + "learning_rate": 7.822823866903062e-05, + "loss": 0.912, + "step": 2047 + }, + { + "epoch": 0.928377153218495, + "grad_norm": 1.6949020669410695, + "learning_rate": 7.822451098308575e-05, + "loss": 0.927, + "step": 2048 + }, + { + "epoch": 0.92883046237534, + "grad_norm": 0.9768834410288397, + "learning_rate": 7.822077946884379e-05, + "loss": 0.9224, + "step": 2049 + }, + { + "epoch": 0.929283771532185, + "grad_norm": 1.5883945529268555, + "learning_rate": 7.821704412667843e-05, + "loss": 0.9383, + "step": 2050 + }, + { + "epoch": 0.9297370806890299, + "grad_norm": 1.501720905766599, + "learning_rate": 7.821330495696378e-05, + "loss": 0.927, + "step": 2051 + }, + { + "epoch": 0.9301903898458749, + "grad_norm": 0.9098898339361701, + "learning_rate": 7.820956196007434e-05, + "loss": 0.9354, + "step": 2052 + }, + { + "epoch": 0.9306436990027198, + "grad_norm": 1.278482862037537, + "learning_rate": 7.820581513638496e-05, + "loss": 0.9316, + "step": 2053 + }, + { + "epoch": 0.9310970081595649, + "grad_norm": 1.2482751172995765, + "learning_rate": 7.820206448627092e-05, + "loss": 0.9179, + "step": 2054 + }, + { + "epoch": 0.9315503173164098, + "grad_norm": 0.962851730205354, + "learning_rate": 7.819831001010783e-05, + "loss": 0.9299, + "step": 2055 + }, + { + "epoch": 0.9320036264732547, + "grad_norm": 0.945567235695268, + "learning_rate": 7.819455170827176e-05, + "loss": 0.9193, + "step": 2056 + }, + { + "epoch": 0.9324569356300997, + "grad_norm": 0.9950976759208129, + "learning_rate": 7.819078958113906e-05, + "loss": 0.9118, + "step": 2057 + }, + { + "epoch": 0.9329102447869447, + "grad_norm": 1.2769028079949594, + "learning_rate": 7.818702362908656e-05, + "loss": 0.9106, + "step": 2058 + }, + { + "epoch": 0.9333635539437897, + "grad_norm": 1.0418318765631487, + "learning_rate": 7.81832538524914e-05, + "loss": 0.9008, + "step": 2059 + }, + { + "epoch": 0.9338168631006346, + "grad_norm": 1.417323799259494, + "learning_rate": 7.817948025173115e-05, + "loss": 0.9163, + "step": 2060 + }, + { + "epoch": 0.9342701722574795, + "grad_norm": 0.8815784612032971, + "learning_rate": 7.817570282718376e-05, + "loss": 0.9298, + "step": 2061 + }, + { + "epoch": 0.9347234814143246, + "grad_norm": 1.0787158088381572, + "learning_rate": 7.817192157922753e-05, + "loss": 0.9245, + "step": 2062 + }, + { + "epoch": 0.9351767905711695, + "grad_norm": 1.2657339195989432, + "learning_rate": 7.816813650824115e-05, + "loss": 0.9176, + "step": 2063 + }, + { + "epoch": 0.9356300997280145, + "grad_norm": 1.124232128480022, + "learning_rate": 7.816434761460373e-05, + "loss": 0.9051, + "step": 2064 + }, + { + "epoch": 0.9360834088848595, + "grad_norm": 1.262621559009067, + "learning_rate": 7.816055489869475e-05, + "loss": 0.8969, + "step": 2065 + }, + { + "epoch": 0.9365367180417045, + "grad_norm": 1.0373148561695584, + "learning_rate": 7.815675836089405e-05, + "loss": 0.9206, + "step": 2066 + }, + { + "epoch": 0.9369900271985494, + "grad_norm": 1.2388672399975567, + "learning_rate": 7.815295800158184e-05, + "loss": 0.8941, + "step": 2067 + }, + { + "epoch": 0.9374433363553943, + "grad_norm": 0.9015962308792302, + "learning_rate": 7.814915382113877e-05, + "loss": 0.9174, + "step": 2068 + }, + { + "epoch": 0.9378966455122394, + "grad_norm": 1.1451217243930738, + "learning_rate": 7.814534581994582e-05, + "loss": 0.9369, + "step": 2069 + }, + { + "epoch": 0.9383499546690843, + "grad_norm": 1.494771989405895, + "learning_rate": 7.814153399838437e-05, + "loss": 0.926, + "step": 2070 + }, + { + "epoch": 0.9388032638259293, + "grad_norm": 0.7828430186285001, + "learning_rate": 7.81377183568362e-05, + "loss": 0.9037, + "step": 2071 + }, + { + "epoch": 0.9392565729827742, + "grad_norm": 0.9180017247655212, + "learning_rate": 7.813389889568345e-05, + "loss": 0.9043, + "step": 2072 + }, + { + "epoch": 0.9397098821396193, + "grad_norm": 0.9574009487647769, + "learning_rate": 7.813007561530866e-05, + "loss": 0.9325, + "step": 2073 + }, + { + "epoch": 0.9401631912964642, + "grad_norm": 0.8221090824475117, + "learning_rate": 7.812624851609472e-05, + "loss": 0.9236, + "step": 2074 + }, + { + "epoch": 0.9406165004533091, + "grad_norm": 1.214653225766719, + "learning_rate": 7.812241759842496e-05, + "loss": 0.9046, + "step": 2075 + }, + { + "epoch": 0.9410698096101541, + "grad_norm": 1.1055429364825522, + "learning_rate": 7.811858286268303e-05, + "loss": 0.9053, + "step": 2076 + }, + { + "epoch": 0.9415231187669991, + "grad_norm": 1.4509151457640221, + "learning_rate": 7.8114744309253e-05, + "loss": 0.9173, + "step": 2077 + }, + { + "epoch": 0.9419764279238441, + "grad_norm": 1.0416451180306128, + "learning_rate": 7.811090193851931e-05, + "loss": 0.911, + "step": 2078 + }, + { + "epoch": 0.942429737080689, + "grad_norm": 1.1678126104125133, + "learning_rate": 7.810705575086678e-05, + "loss": 0.9003, + "step": 2079 + }, + { + "epoch": 0.942883046237534, + "grad_norm": 1.358430294566764, + "learning_rate": 7.810320574668062e-05, + "loss": 0.8952, + "step": 2080 + }, + { + "epoch": 0.943336355394379, + "grad_norm": 0.9188379503403183, + "learning_rate": 7.809935192634643e-05, + "loss": 0.9018, + "step": 2081 + }, + { + "epoch": 0.9437896645512239, + "grad_norm": 1.1614789723605714, + "learning_rate": 7.809549429025017e-05, + "loss": 0.9152, + "step": 2082 + }, + { + "epoch": 0.9442429737080689, + "grad_norm": 1.3977685369526134, + "learning_rate": 7.80916328387782e-05, + "loss": 0.9173, + "step": 2083 + }, + { + "epoch": 0.9446962828649139, + "grad_norm": 0.7419204567584518, + "learning_rate": 7.808776757231726e-05, + "loss": 0.9094, + "step": 2084 + }, + { + "epoch": 0.9451495920217589, + "grad_norm": 1.038700963820599, + "learning_rate": 7.808389849125442e-05, + "loss": 0.9242, + "step": 2085 + }, + { + "epoch": 0.9456029011786038, + "grad_norm": 1.55581888113832, + "learning_rate": 7.808002559597725e-05, + "loss": 0.9124, + "step": 2086 + }, + { + "epoch": 0.9460562103354487, + "grad_norm": 0.919003261652116, + "learning_rate": 7.80761488868736e-05, + "loss": 0.915, + "step": 2087 + }, + { + "epoch": 0.9465095194922938, + "grad_norm": 1.6614298071717641, + "learning_rate": 7.807226836433174e-05, + "loss": 0.925, + "step": 2088 + }, + { + "epoch": 0.9469628286491387, + "grad_norm": 0.7627365858748864, + "learning_rate": 7.80683840287403e-05, + "loss": 0.9111, + "step": 2089 + }, + { + "epoch": 0.9474161378059837, + "grad_norm": 1.4599074222335378, + "learning_rate": 7.806449588048833e-05, + "loss": 0.925, + "step": 2090 + }, + { + "epoch": 0.9478694469628286, + "grad_norm": 0.9113280825679893, + "learning_rate": 7.806060391996522e-05, + "loss": 0.912, + "step": 2091 + }, + { + "epoch": 0.9483227561196736, + "grad_norm": 1.7599646157165878, + "learning_rate": 7.805670814756076e-05, + "loss": 0.9003, + "step": 2092 + }, + { + "epoch": 0.9487760652765186, + "grad_norm": 1.0279100439178879, + "learning_rate": 7.805280856366514e-05, + "loss": 0.8983, + "step": 2093 + }, + { + "epoch": 0.9492293744333635, + "grad_norm": 1.834157761292403, + "learning_rate": 7.804890516866891e-05, + "loss": 0.9042, + "step": 2094 + }, + { + "epoch": 0.9496826835902086, + "grad_norm": 1.47396455696421, + "learning_rate": 7.8044997962963e-05, + "loss": 0.9257, + "step": 2095 + }, + { + "epoch": 0.9501359927470535, + "grad_norm": 1.707625611609699, + "learning_rate": 7.804108694693874e-05, + "loss": 0.9229, + "step": 2096 + }, + { + "epoch": 0.9505893019038985, + "grad_norm": 1.5171776073479208, + "learning_rate": 7.803717212098782e-05, + "loss": 0.931, + "step": 2097 + }, + { + "epoch": 0.9510426110607434, + "grad_norm": 1.4888033208179747, + "learning_rate": 7.803325348550233e-05, + "loss": 0.9234, + "step": 2098 + }, + { + "epoch": 0.9514959202175884, + "grad_norm": 1.3231390706403896, + "learning_rate": 7.80293310408747e-05, + "loss": 0.9223, + "step": 2099 + }, + { + "epoch": 0.9519492293744334, + "grad_norm": 1.2486753216920425, + "learning_rate": 7.802540478749782e-05, + "loss": 0.9064, + "step": 2100 + }, + { + "epoch": 0.9524025385312783, + "grad_norm": 1.1274407786641953, + "learning_rate": 7.802147472576489e-05, + "loss": 0.9173, + "step": 2101 + }, + { + "epoch": 0.9528558476881233, + "grad_norm": 1.2586224091020084, + "learning_rate": 7.801754085606952e-05, + "loss": 0.9243, + "step": 2102 + }, + { + "epoch": 0.9533091568449683, + "grad_norm": 1.5011901728348551, + "learning_rate": 7.801360317880572e-05, + "loss": 0.9001, + "step": 2103 + }, + { + "epoch": 0.9537624660018132, + "grad_norm": 1.3232696890060862, + "learning_rate": 7.800966169436781e-05, + "loss": 0.9301, + "step": 2104 + }, + { + "epoch": 0.9542157751586582, + "grad_norm": 0.9292018390649422, + "learning_rate": 7.80057164031506e-05, + "loss": 0.9137, + "step": 2105 + }, + { + "epoch": 0.9546690843155031, + "grad_norm": 1.8011605681197995, + "learning_rate": 7.800176730554919e-05, + "loss": 0.9092, + "step": 2106 + }, + { + "epoch": 0.9551223934723482, + "grad_norm": 1.0746033198367946, + "learning_rate": 7.79978144019591e-05, + "loss": 0.9059, + "step": 2107 + }, + { + "epoch": 0.9555757026291931, + "grad_norm": 1.3037789972587326, + "learning_rate": 7.799385769277621e-05, + "loss": 0.893, + "step": 2108 + }, + { + "epoch": 0.956029011786038, + "grad_norm": 1.1686674618103532, + "learning_rate": 7.798989717839682e-05, + "loss": 0.9067, + "step": 2109 + }, + { + "epoch": 0.956482320942883, + "grad_norm": 1.2518188061513011, + "learning_rate": 7.798593285921757e-05, + "loss": 0.9313, + "step": 2110 + }, + { + "epoch": 0.956935630099728, + "grad_norm": 1.586823001222998, + "learning_rate": 7.798196473563552e-05, + "loss": 0.9154, + "step": 2111 + }, + { + "epoch": 0.957388939256573, + "grad_norm": 1.0722388892936208, + "learning_rate": 7.797799280804807e-05, + "loss": 0.9147, + "step": 2112 + }, + { + "epoch": 0.9578422484134179, + "grad_norm": 0.8375675302892033, + "learning_rate": 7.797401707685302e-05, + "loss": 0.9152, + "step": 2113 + }, + { + "epoch": 0.958295557570263, + "grad_norm": 1.4175599263106644, + "learning_rate": 7.797003754244856e-05, + "loss": 0.9355, + "step": 2114 + }, + { + "epoch": 0.9587488667271079, + "grad_norm": 1.3701079004809034, + "learning_rate": 7.796605420523324e-05, + "loss": 0.9125, + "step": 2115 + }, + { + "epoch": 0.9592021758839528, + "grad_norm": 0.8899366841966995, + "learning_rate": 7.796206706560602e-05, + "loss": 0.9176, + "step": 2116 + }, + { + "epoch": 0.9596554850407978, + "grad_norm": 1.3156374734731617, + "learning_rate": 7.795807612396621e-05, + "loss": 0.9093, + "step": 2117 + }, + { + "epoch": 0.9601087941976428, + "grad_norm": 1.116853026460026, + "learning_rate": 7.795408138071352e-05, + "loss": 0.9289, + "step": 2118 + }, + { + "epoch": 0.9605621033544878, + "grad_norm": 1.1251138021677896, + "learning_rate": 7.795008283624804e-05, + "loss": 0.8893, + "step": 2119 + }, + { + "epoch": 0.9610154125113327, + "grad_norm": 1.4682909967362272, + "learning_rate": 7.794608049097023e-05, + "loss": 0.9206, + "step": 2120 + }, + { + "epoch": 0.9614687216681777, + "grad_norm": 1.2829656286845377, + "learning_rate": 7.794207434528094e-05, + "loss": 0.9266, + "step": 2121 + }, + { + "epoch": 0.9619220308250227, + "grad_norm": 1.0067194099581505, + "learning_rate": 7.79380643995814e-05, + "loss": 0.9294, + "step": 2122 + }, + { + "epoch": 0.9623753399818676, + "grad_norm": 1.5589323070280092, + "learning_rate": 7.793405065427319e-05, + "loss": 0.915, + "step": 2123 + }, + { + "epoch": 0.9628286491387126, + "grad_norm": 1.3503121432410896, + "learning_rate": 7.793003310975832e-05, + "loss": 0.9258, + "step": 2124 + }, + { + "epoch": 0.9632819582955575, + "grad_norm": 1.3783276183337374, + "learning_rate": 7.79260117664392e-05, + "loss": 0.9068, + "step": 2125 + }, + { + "epoch": 0.9637352674524026, + "grad_norm": 1.138457795312723, + "learning_rate": 7.792198662471849e-05, + "loss": 0.9316, + "step": 2126 + }, + { + "epoch": 0.9641885766092475, + "grad_norm": 1.301988403670565, + "learning_rate": 7.79179576849994e-05, + "loss": 0.913, + "step": 2127 + }, + { + "epoch": 0.9646418857660924, + "grad_norm": 1.000087106048032, + "learning_rate": 7.79139249476854e-05, + "loss": 0.9355, + "step": 2128 + }, + { + "epoch": 0.9650951949229375, + "grad_norm": 1.4736344885644197, + "learning_rate": 7.790988841318039e-05, + "loss": 0.9125, + "step": 2129 + }, + { + "epoch": 0.9655485040797824, + "grad_norm": 1.2849744368033131, + "learning_rate": 7.790584808188864e-05, + "loss": 0.9047, + "step": 2130 + }, + { + "epoch": 0.9660018132366274, + "grad_norm": 1.0260240895553023, + "learning_rate": 7.79018039542148e-05, + "loss": 0.933, + "step": 2131 + }, + { + "epoch": 0.9664551223934723, + "grad_norm": 1.2749394866630974, + "learning_rate": 7.78977560305639e-05, + "loss": 0.9346, + "step": 2132 + }, + { + "epoch": 0.9669084315503174, + "grad_norm": 1.2820611389855099, + "learning_rate": 7.789370431134137e-05, + "loss": 0.9332, + "step": 2133 + }, + { + "epoch": 0.9673617407071623, + "grad_norm": 1.270380924830418, + "learning_rate": 7.788964879695297e-05, + "loss": 0.9247, + "step": 2134 + }, + { + "epoch": 0.9678150498640072, + "grad_norm": 1.119537695743955, + "learning_rate": 7.788558948780489e-05, + "loss": 0.9411, + "step": 2135 + }, + { + "epoch": 0.9682683590208522, + "grad_norm": 1.9938783978467594, + "learning_rate": 7.788152638430368e-05, + "loss": 0.9091, + "step": 2136 + }, + { + "epoch": 0.9687216681776972, + "grad_norm": 0.9552824489174617, + "learning_rate": 7.787745948685628e-05, + "loss": 0.9183, + "step": 2137 + }, + { + "epoch": 0.9691749773345422, + "grad_norm": 2.162970207394887, + "learning_rate": 7.787338879586997e-05, + "loss": 0.8935, + "step": 2138 + }, + { + "epoch": 0.9696282864913871, + "grad_norm": 1.5071513852193574, + "learning_rate": 7.786931431175248e-05, + "loss": 0.9238, + "step": 2139 + }, + { + "epoch": 0.970081595648232, + "grad_norm": 2.170136250527026, + "learning_rate": 7.786523603491186e-05, + "loss": 0.929, + "step": 2140 + }, + { + "epoch": 0.9705349048050771, + "grad_norm": 1.6531311777914721, + "learning_rate": 7.786115396575657e-05, + "loss": 0.9098, + "step": 2141 + }, + { + "epoch": 0.970988213961922, + "grad_norm": 1.8240418694804739, + "learning_rate": 7.785706810469545e-05, + "loss": 0.9238, + "step": 2142 + }, + { + "epoch": 0.971441523118767, + "grad_norm": 1.431986152326375, + "learning_rate": 7.785297845213768e-05, + "loss": 0.9243, + "step": 2143 + }, + { + "epoch": 0.971894832275612, + "grad_norm": 2.000603034835783, + "learning_rate": 7.784888500849289e-05, + "loss": 0.9342, + "step": 2144 + }, + { + "epoch": 0.972348141432457, + "grad_norm": 1.5308427617419182, + "learning_rate": 7.784478777417102e-05, + "loss": 0.9183, + "step": 2145 + }, + { + "epoch": 0.9728014505893019, + "grad_norm": 1.7363192544268813, + "learning_rate": 7.784068674958243e-05, + "loss": 0.9118, + "step": 2146 + }, + { + "epoch": 0.9732547597461468, + "grad_norm": 1.3677834108002378, + "learning_rate": 7.783658193513785e-05, + "loss": 0.9328, + "step": 2147 + }, + { + "epoch": 0.9737080689029919, + "grad_norm": 1.7186860883724848, + "learning_rate": 7.783247333124838e-05, + "loss": 0.9149, + "step": 2148 + }, + { + "epoch": 0.9741613780598368, + "grad_norm": 1.2238881205558976, + "learning_rate": 7.782836093832552e-05, + "loss": 0.9365, + "step": 2149 + }, + { + "epoch": 0.9746146872166818, + "grad_norm": 1.3534348287786235, + "learning_rate": 7.782424475678115e-05, + "loss": 0.9451, + "step": 2150 + }, + { + "epoch": 0.9750679963735267, + "grad_norm": 1.1052672191599187, + "learning_rate": 7.782012478702751e-05, + "loss": 0.9317, + "step": 2151 + }, + { + "epoch": 0.9755213055303718, + "grad_norm": 1.2303786076626344, + "learning_rate": 7.781600102947722e-05, + "loss": 0.9209, + "step": 2152 + }, + { + "epoch": 0.9759746146872167, + "grad_norm": 0.9382553039596676, + "learning_rate": 7.781187348454327e-05, + "loss": 0.9036, + "step": 2153 + }, + { + "epoch": 0.9764279238440616, + "grad_norm": 1.2953614727494012, + "learning_rate": 7.780774215263908e-05, + "loss": 0.9219, + "step": 2154 + }, + { + "epoch": 0.9768812330009066, + "grad_norm": 1.350788658773205, + "learning_rate": 7.780360703417839e-05, + "loss": 0.943, + "step": 2155 + }, + { + "epoch": 0.9773345421577516, + "grad_norm": 0.9159495396952788, + "learning_rate": 7.779946812957534e-05, + "loss": 0.9061, + "step": 2156 + }, + { + "epoch": 0.9777878513145966, + "grad_norm": 1.7956150390992585, + "learning_rate": 7.779532543924448e-05, + "loss": 0.9088, + "step": 2157 + }, + { + "epoch": 0.9782411604714415, + "grad_norm": 1.134703665517406, + "learning_rate": 7.77911789636007e-05, + "loss": 0.912, + "step": 2158 + }, + { + "epoch": 0.9786944696282865, + "grad_norm": 1.6927003256802784, + "learning_rate": 7.778702870305928e-05, + "loss": 0.93, + "step": 2159 + }, + { + "epoch": 0.9791477787851315, + "grad_norm": 1.4289654003874595, + "learning_rate": 7.778287465803587e-05, + "loss": 0.9139, + "step": 2160 + }, + { + "epoch": 0.9796010879419764, + "grad_norm": 1.5734075035384878, + "learning_rate": 7.777871682894654e-05, + "loss": 0.8953, + "step": 2161 + }, + { + "epoch": 0.9800543970988214, + "grad_norm": 1.506554520846465, + "learning_rate": 7.777455521620769e-05, + "loss": 0.8895, + "step": 2162 + }, + { + "epoch": 0.9805077062556664, + "grad_norm": 1.385587339502483, + "learning_rate": 7.77703898202361e-05, + "loss": 0.9061, + "step": 2163 + }, + { + "epoch": 0.9809610154125114, + "grad_norm": 1.111357073090794, + "learning_rate": 7.776622064144897e-05, + "loss": 0.901, + "step": 2164 + }, + { + "epoch": 0.9814143245693563, + "grad_norm": 1.6120495691512586, + "learning_rate": 7.776204768026385e-05, + "loss": 0.9304, + "step": 2165 + }, + { + "epoch": 0.9818676337262012, + "grad_norm": 0.9865939177380262, + "learning_rate": 7.775787093709868e-05, + "loss": 0.8954, + "step": 2166 + }, + { + "epoch": 0.9823209428830463, + "grad_norm": 1.6466174080538547, + "learning_rate": 7.775369041237176e-05, + "loss": 0.9184, + "step": 2167 + }, + { + "epoch": 0.9827742520398912, + "grad_norm": 1.1676816688905982, + "learning_rate": 7.77495061065018e-05, + "loss": 0.8997, + "step": 2168 + }, + { + "epoch": 0.9832275611967362, + "grad_norm": 1.435193221174988, + "learning_rate": 7.774531801990785e-05, + "loss": 0.9363, + "step": 2169 + }, + { + "epoch": 0.9836808703535811, + "grad_norm": 1.4249992507220068, + "learning_rate": 7.774112615300937e-05, + "loss": 0.9154, + "step": 2170 + }, + { + "epoch": 0.9841341795104261, + "grad_norm": 1.1664267552168033, + "learning_rate": 7.773693050622619e-05, + "loss": 0.9114, + "step": 2171 + }, + { + "epoch": 0.9845874886672711, + "grad_norm": 1.2961778449704024, + "learning_rate": 7.773273107997852e-05, + "loss": 0.9009, + "step": 2172 + }, + { + "epoch": 0.985040797824116, + "grad_norm": 1.1089153579771993, + "learning_rate": 7.772852787468693e-05, + "loss": 0.9071, + "step": 2173 + }, + { + "epoch": 0.985494106980961, + "grad_norm": 0.9737238839715389, + "learning_rate": 7.772432089077239e-05, + "loss": 0.9358, + "step": 2174 + }, + { + "epoch": 0.985947416137806, + "grad_norm": 1.3153996805282484, + "learning_rate": 7.772011012865624e-05, + "loss": 0.9012, + "step": 2175 + }, + { + "epoch": 0.986400725294651, + "grad_norm": 1.1486490387850294, + "learning_rate": 7.77158955887602e-05, + "loss": 0.9189, + "step": 2176 + }, + { + "epoch": 0.9868540344514959, + "grad_norm": 1.0995814570715765, + "learning_rate": 7.771167727150639e-05, + "loss": 0.9073, + "step": 2177 + }, + { + "epoch": 0.9873073436083409, + "grad_norm": 1.6598417496182507, + "learning_rate": 7.770745517731726e-05, + "loss": 0.9165, + "step": 2178 + }, + { + "epoch": 0.9877606527651859, + "grad_norm": 0.7824188656677149, + "learning_rate": 7.770322930661566e-05, + "loss": 0.8973, + "step": 2179 + }, + { + "epoch": 0.9882139619220308, + "grad_norm": 1.5409921615137672, + "learning_rate": 7.769899965982486e-05, + "loss": 0.9192, + "step": 2180 + }, + { + "epoch": 0.9886672710788758, + "grad_norm": 1.0604310416736975, + "learning_rate": 7.769476623736843e-05, + "loss": 0.9203, + "step": 2181 + }, + { + "epoch": 0.9891205802357208, + "grad_norm": 1.5093408369707018, + "learning_rate": 7.769052903967039e-05, + "loss": 0.898, + "step": 2182 + }, + { + "epoch": 0.9895738893925657, + "grad_norm": 1.0961621594675746, + "learning_rate": 7.768628806715509e-05, + "loss": 0.8935, + "step": 2183 + }, + { + "epoch": 0.9900271985494107, + "grad_norm": 1.0916652263890707, + "learning_rate": 7.76820433202473e-05, + "loss": 0.9252, + "step": 2184 + }, + { + "epoch": 0.9904805077062556, + "grad_norm": 1.2429591267128823, + "learning_rate": 7.767779479937209e-05, + "loss": 0.9137, + "step": 2185 + }, + { + "epoch": 0.9909338168631007, + "grad_norm": 1.3963081484600093, + "learning_rate": 7.767354250495503e-05, + "loss": 0.9442, + "step": 2186 + }, + { + "epoch": 0.9913871260199456, + "grad_norm": 0.9061475553676106, + "learning_rate": 7.766928643742195e-05, + "loss": 0.9147, + "step": 2187 + }, + { + "epoch": 0.9918404351767905, + "grad_norm": 1.0183968396184193, + "learning_rate": 7.766502659719914e-05, + "loss": 0.9313, + "step": 2188 + }, + { + "epoch": 0.9922937443336355, + "grad_norm": 1.0487807724081928, + "learning_rate": 7.766076298471321e-05, + "loss": 0.8972, + "step": 2189 + }, + { + "epoch": 0.9927470534904805, + "grad_norm": 1.1055296900394975, + "learning_rate": 7.765649560039119e-05, + "loss": 0.915, + "step": 2190 + }, + { + "epoch": 0.9932003626473255, + "grad_norm": 0.9652677239035385, + "learning_rate": 7.765222444466046e-05, + "loss": 0.9201, + "step": 2191 + }, + { + "epoch": 0.9936536718041704, + "grad_norm": 1.1788439268482402, + "learning_rate": 7.764794951794879e-05, + "loss": 0.9048, + "step": 2192 + }, + { + "epoch": 0.9941069809610155, + "grad_norm": 0.9870610084492013, + "learning_rate": 7.764367082068435e-05, + "loss": 0.9188, + "step": 2193 + }, + { + "epoch": 0.9945602901178604, + "grad_norm": 0.8529555795352023, + "learning_rate": 7.763938835329563e-05, + "loss": 0.9053, + "step": 2194 + }, + { + "epoch": 0.9950135992747053, + "grad_norm": 1.1980789220927577, + "learning_rate": 7.763510211621153e-05, + "loss": 0.9065, + "step": 2195 + }, + { + "epoch": 0.9954669084315503, + "grad_norm": 1.0324930518255682, + "learning_rate": 7.763081210986137e-05, + "loss": 0.9267, + "step": 2196 + }, + { + "epoch": 0.9959202175883953, + "grad_norm": 1.6759204226838986, + "learning_rate": 7.762651833467477e-05, + "loss": 0.9087, + "step": 2197 + }, + { + "epoch": 0.9963735267452403, + "grad_norm": 0.9363873854731483, + "learning_rate": 7.762222079108178e-05, + "loss": 0.9154, + "step": 2198 + }, + { + "epoch": 0.9968268359020852, + "grad_norm": 0.9101739183691779, + "learning_rate": 7.761791947951282e-05, + "loss": 0.894, + "step": 2199 + }, + { + "epoch": 0.9972801450589301, + "grad_norm": 1.1971256271876236, + "learning_rate": 7.761361440039866e-05, + "loss": 0.9173, + "step": 2200 + }, + { + "epoch": 0.9977334542157752, + "grad_norm": 1.441666744955298, + "learning_rate": 7.760930555417046e-05, + "loss": 0.9192, + "step": 2201 + }, + { + "epoch": 0.9981867633726201, + "grad_norm": 1.4124953002832787, + "learning_rate": 7.760499294125978e-05, + "loss": 0.922, + "step": 2202 + }, + { + "epoch": 0.9986400725294651, + "grad_norm": 0.6924001270512046, + "learning_rate": 7.760067656209856e-05, + "loss": 0.91, + "step": 2203 + }, + { + "epoch": 0.99909338168631, + "grad_norm": 1.0878264239184325, + "learning_rate": 7.759635641711905e-05, + "loss": 0.8992, + "step": 2204 + }, + { + "epoch": 0.9995466908431551, + "grad_norm": 1.7918154205156307, + "learning_rate": 7.759203250675397e-05, + "loss": 0.9229, + "step": 2205 + }, + { + "epoch": 1.0, + "grad_norm": 0.8001246597792144, + "learning_rate": 7.758770483143634e-05, + "loss": 0.9126, + "step": 2206 + }, + { + "epoch": 1.000453309156845, + "grad_norm": 1.5185579693502667, + "learning_rate": 7.758337339159961e-05, + "loss": 0.8875, + "step": 2207 + }, + { + "epoch": 1.0009066183136899, + "grad_norm": 1.0933507833824967, + "learning_rate": 7.757903818767759e-05, + "loss": 0.8931, + "step": 2208 + }, + { + "epoch": 1.001359927470535, + "grad_norm": 1.8863710760456849, + "learning_rate": 7.757469922010442e-05, + "loss": 0.8899, + "step": 2209 + }, + { + "epoch": 1.00181323662738, + "grad_norm": 0.9833619747869214, + "learning_rate": 7.757035648931473e-05, + "loss": 0.9037, + "step": 2210 + }, + { + "epoch": 1.0022665457842248, + "grad_norm": 2.1399333316083426, + "learning_rate": 7.756600999574339e-05, + "loss": 0.9109, + "step": 2211 + }, + { + "epoch": 1.0027198549410699, + "grad_norm": 2.0005771211578534, + "learning_rate": 7.756165973982576e-05, + "loss": 0.9107, + "step": 2212 + }, + { + "epoch": 1.0031731640979147, + "grad_norm": 1.5542662023629812, + "learning_rate": 7.755730572199753e-05, + "loss": 0.9068, + "step": 2213 + }, + { + "epoch": 1.0036264732547597, + "grad_norm": 2.175806139743037, + "learning_rate": 7.755294794269474e-05, + "loss": 0.8979, + "step": 2214 + }, + { + "epoch": 1.0040797824116048, + "grad_norm": 1.7977508796479036, + "learning_rate": 7.754858640235384e-05, + "loss": 0.9194, + "step": 2215 + }, + { + "epoch": 1.0045330915684496, + "grad_norm": 2.210366813675899, + "learning_rate": 7.754422110141165e-05, + "loss": 0.924, + "step": 2216 + }, + { + "epoch": 1.0049864007252947, + "grad_norm": 1.853332220541622, + "learning_rate": 7.75398520403054e-05, + "loss": 0.8976, + "step": 2217 + }, + { + "epoch": 1.0054397098821397, + "grad_norm": 2.2972341605743694, + "learning_rate": 7.753547921947263e-05, + "loss": 0.9, + "step": 2218 + }, + { + "epoch": 1.0058930190389845, + "grad_norm": 1.633886802763704, + "learning_rate": 7.75311026393513e-05, + "loss": 0.8844, + "step": 2219 + }, + { + "epoch": 1.0063463281958296, + "grad_norm": 2.524998367041391, + "learning_rate": 7.752672230037973e-05, + "loss": 0.8821, + "step": 2220 + }, + { + "epoch": 1.0067996373526744, + "grad_norm": 2.0718755194235468, + "learning_rate": 7.752233820299665e-05, + "loss": 0.9025, + "step": 2221 + }, + { + "epoch": 1.0072529465095195, + "grad_norm": 2.2298908250180824, + "learning_rate": 7.751795034764111e-05, + "loss": 0.8973, + "step": 2222 + }, + { + "epoch": 1.0077062556663645, + "grad_norm": 1.9356365820254164, + "learning_rate": 7.75135587347526e-05, + "loss": 0.8844, + "step": 2223 + }, + { + "epoch": 1.0081595648232093, + "grad_norm": 2.3273615763303876, + "learning_rate": 7.750916336477091e-05, + "loss": 0.9057, + "step": 2224 + }, + { + "epoch": 1.0086128739800544, + "grad_norm": 1.9799922306252278, + "learning_rate": 7.750476423813627e-05, + "loss": 0.9001, + "step": 2225 + }, + { + "epoch": 1.0090661831368994, + "grad_norm": 2.0869692332820353, + "learning_rate": 7.750036135528928e-05, + "loss": 0.8881, + "step": 2226 + }, + { + "epoch": 1.0095194922937443, + "grad_norm": 1.6856188548995934, + "learning_rate": 7.749595471667089e-05, + "loss": 0.9074, + "step": 2227 + }, + { + "epoch": 1.0099728014505893, + "grad_norm": 2.4435857441160196, + "learning_rate": 7.749154432272243e-05, + "loss": 0.9041, + "step": 2228 + }, + { + "epoch": 1.0104261106074344, + "grad_norm": 2.124321890291241, + "learning_rate": 7.748713017388561e-05, + "loss": 0.8972, + "step": 2229 + }, + { + "epoch": 1.0108794197642792, + "grad_norm": 2.1068105077788317, + "learning_rate": 7.748271227060255e-05, + "loss": 0.9126, + "step": 2230 + }, + { + "epoch": 1.0113327289211242, + "grad_norm": 2.066471803816435, + "learning_rate": 7.747829061331569e-05, + "loss": 0.9293, + "step": 2231 + }, + { + "epoch": 1.011786038077969, + "grad_norm": 1.8039954906125946, + "learning_rate": 7.747386520246788e-05, + "loss": 0.9032, + "step": 2232 + }, + { + "epoch": 1.0122393472348141, + "grad_norm": 1.5807654626841818, + "learning_rate": 7.746943603850233e-05, + "loss": 0.9019, + "step": 2233 + }, + { + "epoch": 1.0126926563916592, + "grad_norm": 2.3804846364971013, + "learning_rate": 7.746500312186265e-05, + "loss": 0.903, + "step": 2234 + }, + { + "epoch": 1.013145965548504, + "grad_norm": 2.1307147315297237, + "learning_rate": 7.74605664529928e-05, + "loss": 0.8944, + "step": 2235 + }, + { + "epoch": 1.013599274705349, + "grad_norm": 1.8019974699856864, + "learning_rate": 7.74561260323371e-05, + "loss": 0.9054, + "step": 2236 + }, + { + "epoch": 1.014052583862194, + "grad_norm": 1.5792102590987154, + "learning_rate": 7.745168186034031e-05, + "loss": 0.909, + "step": 2237 + }, + { + "epoch": 1.014505893019039, + "grad_norm": 2.4276543434651425, + "learning_rate": 7.744723393744753e-05, + "loss": 0.9168, + "step": 2238 + }, + { + "epoch": 1.014959202175884, + "grad_norm": 2.1863054099445702, + "learning_rate": 7.744278226410421e-05, + "loss": 0.9102, + "step": 2239 + }, + { + "epoch": 1.015412511332729, + "grad_norm": 1.6551878936554276, + "learning_rate": 7.743832684075619e-05, + "loss": 0.894, + "step": 2240 + }, + { + "epoch": 1.0158658204895739, + "grad_norm": 1.4720541685309292, + "learning_rate": 7.743386766784971e-05, + "loss": 0.904, + "step": 2241 + }, + { + "epoch": 1.016319129646419, + "grad_norm": 2.3303368847634838, + "learning_rate": 7.742940474583138e-05, + "loss": 0.8956, + "step": 2242 + }, + { + "epoch": 1.0167724388032637, + "grad_norm": 2.114358412758921, + "learning_rate": 7.742493807514816e-05, + "loss": 0.9016, + "step": 2243 + }, + { + "epoch": 1.0172257479601088, + "grad_norm": 1.711440800557558, + "learning_rate": 7.74204676562474e-05, + "loss": 0.9022, + "step": 2244 + }, + { + "epoch": 1.0176790571169538, + "grad_norm": 1.4505947171433313, + "learning_rate": 7.741599348957682e-05, + "loss": 0.8938, + "step": 2245 + }, + { + "epoch": 1.0181323662737987, + "grad_norm": 2.299696009950017, + "learning_rate": 7.741151557558453e-05, + "loss": 0.9039, + "step": 2246 + }, + { + "epoch": 1.0185856754306437, + "grad_norm": 1.9991585483115963, + "learning_rate": 7.740703391471901e-05, + "loss": 0.8908, + "step": 2247 + }, + { + "epoch": 1.0190389845874888, + "grad_norm": 1.7471115864747764, + "learning_rate": 7.74025485074291e-05, + "loss": 0.9081, + "step": 2248 + }, + { + "epoch": 1.0194922937443336, + "grad_norm": 1.5589193681773093, + "learning_rate": 7.739805935416403e-05, + "loss": 0.8848, + "step": 2249 + }, + { + "epoch": 1.0199456029011786, + "grad_norm": 2.1584689226691904, + "learning_rate": 7.739356645537341e-05, + "loss": 0.8798, + "step": 2250 + }, + { + "epoch": 1.0203989120580235, + "grad_norm": 1.9412375402706006, + "learning_rate": 7.738906981150722e-05, + "loss": 0.8937, + "step": 2251 + }, + { + "epoch": 1.0208522212148685, + "grad_norm": 1.8215949759328434, + "learning_rate": 7.73845694230158e-05, + "loss": 0.8989, + "step": 2252 + }, + { + "epoch": 1.0213055303717136, + "grad_norm": 1.5748510760355887, + "learning_rate": 7.738006529034988e-05, + "loss": 0.9251, + "step": 2253 + }, + { + "epoch": 1.0217588395285584, + "grad_norm": 2.3294748424644185, + "learning_rate": 7.737555741396055e-05, + "loss": 0.9614, + "step": 2254 + }, + { + "epoch": 1.0222121486854034, + "grad_norm": 2.216438206850962, + "learning_rate": 7.73710457942993e-05, + "loss": 0.9025, + "step": 2255 + }, + { + "epoch": 1.0226654578422485, + "grad_norm": 1.3267813490440317, + "learning_rate": 7.736653043181801e-05, + "loss": 0.9056, + "step": 2256 + }, + { + "epoch": 1.0231187669990933, + "grad_norm": 1.0981707099163063, + "learning_rate": 7.736201132696886e-05, + "loss": 0.8839, + "step": 2257 + }, + { + "epoch": 1.0235720761559384, + "grad_norm": 2.478351679818519, + "learning_rate": 7.735748848020447e-05, + "loss": 0.9099, + "step": 2258 + }, + { + "epoch": 1.0240253853127834, + "grad_norm": 2.231708844544931, + "learning_rate": 7.735296189197781e-05, + "loss": 0.9035, + "step": 2259 + }, + { + "epoch": 1.0244786944696282, + "grad_norm": 1.7311914536654498, + "learning_rate": 7.734843156274225e-05, + "loss": 0.9061, + "step": 2260 + }, + { + "epoch": 1.0249320036264733, + "grad_norm": 1.7117349612462849, + "learning_rate": 7.73438974929515e-05, + "loss": 0.9019, + "step": 2261 + }, + { + "epoch": 1.0253853127833181, + "grad_norm": 1.6648973396767386, + "learning_rate": 7.733935968305968e-05, + "loss": 0.8828, + "step": 2262 + }, + { + "epoch": 1.0258386219401632, + "grad_norm": 1.2387574746255279, + "learning_rate": 7.733481813352123e-05, + "loss": 0.9062, + "step": 2263 + }, + { + "epoch": 1.0262919310970082, + "grad_norm": 2.4416915668388617, + "learning_rate": 7.733027284479102e-05, + "loss": 0.9199, + "step": 2264 + }, + { + "epoch": 1.026745240253853, + "grad_norm": 2.232178516225162, + "learning_rate": 7.732572381732428e-05, + "loss": 0.9186, + "step": 2265 + }, + { + "epoch": 1.027198549410698, + "grad_norm": 1.5151712264483101, + "learning_rate": 7.73211710515766e-05, + "loss": 0.8894, + "step": 2266 + }, + { + "epoch": 1.0276518585675432, + "grad_norm": 1.709864962692262, + "learning_rate": 7.731661454800396e-05, + "loss": 0.911, + "step": 2267 + }, + { + "epoch": 1.028105167724388, + "grad_norm": 1.5576594195788114, + "learning_rate": 7.731205430706269e-05, + "loss": 0.8991, + "step": 2268 + }, + { + "epoch": 1.028558476881233, + "grad_norm": 1.3548723227923902, + "learning_rate": 7.730749032920954e-05, + "loss": 0.9018, + "step": 2269 + }, + { + "epoch": 1.0290117860380779, + "grad_norm": 2094.400847675909, + "learning_rate": 7.730292261490156e-05, + "loss": 5.1744, + "step": 2270 + }, + { + "epoch": 1.029465095194923, + "grad_norm": 2347.1983543794768, + "learning_rate": 7.729835116459628e-05, + "loss": 19.7327, + "step": 2271 + }, + { + "epoch": 1.029918404351768, + "grad_norm": 484.90331737775756, + "learning_rate": 7.729377597875149e-05, + "loss": 13.7732, + "step": 2272 + }, + { + "epoch": 1.0303717135086128, + "grad_norm": 876.832268778176, + "learning_rate": 7.728919705782543e-05, + "loss": 8.9639, + "step": 2273 + }, + { + "epoch": 1.0308250226654578, + "grad_norm": 99.49642660724203, + "learning_rate": 7.728461440227671e-05, + "loss": 9.9606, + "step": 2274 + }, + { + "epoch": 1.0312783318223029, + "grad_norm": 85.27771864859518, + "learning_rate": 7.728002801256428e-05, + "loss": 8.7004, + "step": 2275 + }, + { + "epoch": 1.0317316409791477, + "grad_norm": 78.2805370924912, + "learning_rate": 7.727543788914748e-05, + "loss": 10.0065, + "step": 2276 + }, + { + "epoch": 1.0321849501359928, + "grad_norm": 23.41223561039528, + "learning_rate": 7.7270844032486e-05, + "loss": 8.0816, + "step": 2277 + }, + { + "epoch": 1.0326382592928378, + "grad_norm": 20.175729082986933, + "learning_rate": 7.726624644303998e-05, + "loss": 7.3153, + "step": 2278 + }, + { + "epoch": 1.0330915684496826, + "grad_norm": 9.75742066020185, + "learning_rate": 7.726164512126986e-05, + "loss": 6.7263, + "step": 2279 + }, + { + "epoch": 1.0335448776065277, + "grad_norm": 17.648493843880793, + "learning_rate": 7.725704006763645e-05, + "loss": 6.8283, + "step": 2280 + }, + { + "epoch": 1.0339981867633725, + "grad_norm": 26.638731568393787, + "learning_rate": 7.7252431282601e-05, + "loss": 6.7527, + "step": 2281 + }, + { + "epoch": 1.0344514959202176, + "grad_norm": 23.206981693976875, + "learning_rate": 7.724781876662506e-05, + "loss": 7.704, + "step": 2282 + }, + { + "epoch": 1.0349048050770626, + "grad_norm": 12.788331990931729, + "learning_rate": 7.72432025201706e-05, + "loss": 6.7132, + "step": 2283 + }, + { + "epoch": 1.0353581142339074, + "grad_norm": 39.03643047084416, + "learning_rate": 7.723858254369996e-05, + "loss": 7.7887, + "step": 2284 + }, + { + "epoch": 1.0358114233907525, + "grad_norm": 32.76241818914737, + "learning_rate": 7.723395883767584e-05, + "loss": 7.7046, + "step": 2285 + }, + { + "epoch": 1.0362647325475975, + "grad_norm": 24.36971825459457, + "learning_rate": 7.72293314025613e-05, + "loss": 6.8577, + "step": 2286 + }, + { + "epoch": 1.0367180417044424, + "grad_norm": 10.303752047838843, + "learning_rate": 7.722470023881981e-05, + "loss": 6.682, + "step": 2287 + }, + { + "epoch": 1.0371713508612874, + "grad_norm": 12.555851035553703, + "learning_rate": 7.72200653469152e-05, + "loss": 6.6666, + "step": 2288 + }, + { + "epoch": 1.0376246600181325, + "grad_norm": 8.807971772600515, + "learning_rate": 7.721542672731165e-05, + "loss": 6.5208, + "step": 2289 + }, + { + "epoch": 1.0380779691749773, + "grad_norm": 22.32535207919814, + "learning_rate": 7.721078438047374e-05, + "loss": 6.6746, + "step": 2290 + }, + { + "epoch": 1.0385312783318223, + "grad_norm": 14.38124906748772, + "learning_rate": 7.72061383068664e-05, + "loss": 6.4662, + "step": 2291 + }, + { + "epoch": 1.0389845874886672, + "grad_norm": 7.568909580713177, + "learning_rate": 7.720148850695499e-05, + "loss": 6.4102, + "step": 2292 + }, + { + "epoch": 1.0394378966455122, + "grad_norm": 7.226379433606147, + "learning_rate": 7.719683498120515e-05, + "loss": 6.4042, + "step": 2293 + }, + { + "epoch": 1.0398912058023573, + "grad_norm": 3.2910504108242793, + "learning_rate": 7.719217773008297e-05, + "loss": 6.2012, + "step": 2294 + }, + { + "epoch": 1.040344514959202, + "grad_norm": 21.57453009949906, + "learning_rate": 7.71875167540549e-05, + "loss": 6.6328, + "step": 2295 + }, + { + "epoch": 1.0407978241160472, + "grad_norm": 22.385754574956085, + "learning_rate": 7.718285205358773e-05, + "loss": 6.6894, + "step": 2296 + }, + { + "epoch": 1.0412511332728922, + "grad_norm": 5.945513098375774, + "learning_rate": 7.717818362914863e-05, + "loss": 6.2616, + "step": 2297 + }, + { + "epoch": 1.041704442429737, + "grad_norm": 8.173923801414059, + "learning_rate": 7.717351148120518e-05, + "loss": 6.2885, + "step": 2298 + }, + { + "epoch": 1.042157751586582, + "grad_norm": 9.016759697090615, + "learning_rate": 7.716883561022532e-05, + "loss": 6.2979, + "step": 2299 + }, + { + "epoch": 1.0426110607434271, + "grad_norm": 9.266474932908805, + "learning_rate": 7.716415601667733e-05, + "loss": 6.2237, + "step": 2300 + }, + { + "epoch": 1.043064369900272, + "grad_norm": 6.767329605633009, + "learning_rate": 7.715947270102989e-05, + "loss": 6.1056, + "step": 2301 + }, + { + "epoch": 1.043517679057117, + "grad_norm": 5.246256794767085, + "learning_rate": 7.715478566375205e-05, + "loss": 6.0562, + "step": 2302 + }, + { + "epoch": 1.0439709882139618, + "grad_norm": 7.186198752310072, + "learning_rate": 7.715009490531323e-05, + "loss": 6.0726, + "step": 2303 + }, + { + "epoch": 1.0444242973708069, + "grad_norm": 5.689662293806424, + "learning_rate": 7.714540042618321e-05, + "loss": 5.994, + "step": 2304 + }, + { + "epoch": 1.044877606527652, + "grad_norm": 6.308013738728765, + "learning_rate": 7.714070222683218e-05, + "loss": 5.9721, + "step": 2305 + }, + { + "epoch": 1.0453309156844968, + "grad_norm": 6.862451968431989, + "learning_rate": 7.713600030773068e-05, + "loss": 6.0071, + "step": 2306 + }, + { + "epoch": 1.0457842248413418, + "grad_norm": 4.8252196554993, + "learning_rate": 7.71312946693496e-05, + "loss": 5.9203, + "step": 2307 + }, + { + "epoch": 1.0462375339981869, + "grad_norm": 3.9848829782143635, + "learning_rate": 7.712658531216022e-05, + "loss": 5.9275, + "step": 2308 + }, + { + "epoch": 1.0466908431550317, + "grad_norm": 4.057493290774481, + "learning_rate": 7.71218722366342e-05, + "loss": 5.8273, + "step": 2309 + }, + { + "epoch": 1.0471441523118767, + "grad_norm": 3.0565916948383385, + "learning_rate": 7.71171554432436e-05, + "loss": 5.8217, + "step": 2310 + }, + { + "epoch": 1.0475974614687216, + "grad_norm": 4.353616486361989, + "learning_rate": 7.71124349324608e-05, + "loss": 5.766, + "step": 2311 + }, + { + "epoch": 1.0480507706255666, + "grad_norm": 3.9955243508303635, + "learning_rate": 7.710771070475856e-05, + "loss": 5.7621, + "step": 2312 + }, + { + "epoch": 1.0485040797824117, + "grad_norm": 4.112359729899903, + "learning_rate": 7.710298276061004e-05, + "loss": 5.7338, + "step": 2313 + }, + { + "epoch": 1.0489573889392565, + "grad_norm": 3.23379318534331, + "learning_rate": 7.709825110048874e-05, + "loss": 5.6867, + "step": 2314 + }, + { + "epoch": 1.0494106980961015, + "grad_norm": 2.810271897670167, + "learning_rate": 7.709351572486857e-05, + "loss": 5.6649, + "step": 2315 + }, + { + "epoch": 1.0498640072529466, + "grad_norm": 3.276479968147849, + "learning_rate": 7.708877663422379e-05, + "loss": 5.6716, + "step": 2316 + }, + { + "epoch": 1.0503173164097914, + "grad_norm": 3.865306941823564, + "learning_rate": 7.708403382902902e-05, + "loss": 5.6195, + "step": 2317 + }, + { + "epoch": 1.0507706255666365, + "grad_norm": 4.386260701428785, + "learning_rate": 7.707928730975929e-05, + "loss": 5.6135, + "step": 2318 + }, + { + "epoch": 1.0512239347234815, + "grad_norm": 4.133733879646934, + "learning_rate": 7.707453707688993e-05, + "loss": 5.6432, + "step": 2319 + }, + { + "epoch": 1.0516772438803264, + "grad_norm": 6.517596080729875, + "learning_rate": 7.706978313089675e-05, + "loss": 5.6264, + "step": 2320 + }, + { + "epoch": 1.0521305530371714, + "grad_norm": 4.687582088693947, + "learning_rate": 7.706502547225584e-05, + "loss": 5.5784, + "step": 2321 + }, + { + "epoch": 1.0525838621940162, + "grad_norm": 3.181854883766817, + "learning_rate": 7.706026410144369e-05, + "loss": 5.523, + "step": 2322 + }, + { + "epoch": 1.0530371713508613, + "grad_norm": 5.174179744759722, + "learning_rate": 7.705549901893717e-05, + "loss": 5.5056, + "step": 2323 + }, + { + "epoch": 1.0534904805077063, + "grad_norm": 1.9160133338952412, + "learning_rate": 7.705073022521352e-05, + "loss": 5.4102, + "step": 2324 + }, + { + "epoch": 1.0539437896645512, + "grad_norm": 5.385273723434842, + "learning_rate": 7.704595772075035e-05, + "loss": 5.4396, + "step": 2325 + }, + { + "epoch": 1.0543970988213962, + "grad_norm": 4.002965601314066, + "learning_rate": 7.704118150602565e-05, + "loss": 5.3912, + "step": 2326 + }, + { + "epoch": 1.0548504079782413, + "grad_norm": 3.584810032026845, + "learning_rate": 7.703640158151774e-05, + "loss": 5.3343, + "step": 2327 + }, + { + "epoch": 1.055303717135086, + "grad_norm": 3.6560051107641565, + "learning_rate": 7.703161794770538e-05, + "loss": 5.3136, + "step": 2328 + }, + { + "epoch": 1.0557570262919311, + "grad_norm": 4.5830305556772215, + "learning_rate": 7.702683060506763e-05, + "loss": 5.3429, + "step": 2329 + }, + { + "epoch": 1.056210335448776, + "grad_norm": 3.842265795416284, + "learning_rate": 7.702203955408399e-05, + "loss": 5.3285, + "step": 2330 + }, + { + "epoch": 1.056663644605621, + "grad_norm": 2.8899699162486594, + "learning_rate": 7.701724479523428e-05, + "loss": 5.2961, + "step": 2331 + }, + { + "epoch": 1.057116953762466, + "grad_norm": 4.346202073452472, + "learning_rate": 7.70124463289987e-05, + "loss": 5.2977, + "step": 2332 + }, + { + "epoch": 1.0575702629193109, + "grad_norm": 3.4558992101620363, + "learning_rate": 7.700764415585786e-05, + "loss": 5.2648, + "step": 2333 + }, + { + "epoch": 1.058023572076156, + "grad_norm": 5.035945977295431, + "learning_rate": 7.700283827629269e-05, + "loss": 5.1828, + "step": 2334 + }, + { + "epoch": 1.058476881233001, + "grad_norm": 3.1414050214624245, + "learning_rate": 7.699802869078453e-05, + "loss": 5.1401, + "step": 2335 + }, + { + "epoch": 1.0589301903898458, + "grad_norm": 2.427845230213013, + "learning_rate": 7.699321539981504e-05, + "loss": 5.1136, + "step": 2336 + }, + { + "epoch": 1.0593834995466909, + "grad_norm": 6.044614966963591, + "learning_rate": 7.698839840386631e-05, + "loss": 5.2149, + "step": 2337 + }, + { + "epoch": 1.059836808703536, + "grad_norm": 4.802677519756708, + "learning_rate": 7.698357770342078e-05, + "loss": 5.099, + "step": 2338 + }, + { + "epoch": 1.0602901178603807, + "grad_norm": 7.454029054108183, + "learning_rate": 7.697875329896125e-05, + "loss": 5.203, + "step": 2339 + }, + { + "epoch": 1.0607434270172258, + "grad_norm": 4.265031459287632, + "learning_rate": 7.697392519097088e-05, + "loss": 5.0654, + "step": 2340 + }, + { + "epoch": 1.0611967361740706, + "grad_norm": 6.179294367563578, + "learning_rate": 7.696909337993326e-05, + "loss": 5.0645, + "step": 2341 + }, + { + "epoch": 1.0616500453309157, + "grad_norm": 6.38213675273815, + "learning_rate": 7.696425786633228e-05, + "loss": 5.1571, + "step": 2342 + }, + { + "epoch": 1.0621033544877607, + "grad_norm": 2.6829234602743792, + "learning_rate": 7.695941865065225e-05, + "loss": 4.9171, + "step": 2343 + }, + { + "epoch": 1.0625566636446055, + "grad_norm": 4.383582946138513, + "learning_rate": 7.695457573337781e-05, + "loss": 4.9873, + "step": 2344 + }, + { + "epoch": 1.0630099728014506, + "grad_norm": 3.385949007010201, + "learning_rate": 7.694972911499399e-05, + "loss": 4.9172, + "step": 2345 + }, + { + "epoch": 1.0634632819582956, + "grad_norm": 5.6134277605691345, + "learning_rate": 7.694487879598623e-05, + "loss": 4.9417, + "step": 2346 + }, + { + "epoch": 1.0639165911151405, + "grad_norm": 3.520010106284252, + "learning_rate": 7.694002477684025e-05, + "loss": 4.9082, + "step": 2347 + }, + { + "epoch": 1.0643699002719855, + "grad_norm": 3.034377899870042, + "learning_rate": 7.693516705804225e-05, + "loss": 4.8911, + "step": 2348 + }, + { + "epoch": 1.0648232094288304, + "grad_norm": 4.825325152016642, + "learning_rate": 7.693030564007871e-05, + "loss": 4.9004, + "step": 2349 + }, + { + "epoch": 1.0652765185856754, + "grad_norm": 4.098330200828925, + "learning_rate": 7.692544052343654e-05, + "loss": 4.8361, + "step": 2350 + }, + { + "epoch": 1.0657298277425205, + "grad_norm": 2.8501916794739115, + "learning_rate": 7.692057170860296e-05, + "loss": 4.8928, + "step": 2351 + }, + { + "epoch": 1.0661831368993653, + "grad_norm": 3.481166902992882, + "learning_rate": 7.691569919606562e-05, + "loss": 4.8165, + "step": 2352 + }, + { + "epoch": 1.0666364460562103, + "grad_norm": 4.548356729234137, + "learning_rate": 7.69108229863125e-05, + "loss": 4.801, + "step": 2353 + }, + { + "epoch": 1.0670897552130554, + "grad_norm": 2.8404612214771103, + "learning_rate": 7.6905943079832e-05, + "loss": 4.754, + "step": 2354 + }, + { + "epoch": 1.0675430643699002, + "grad_norm": 4.406614412457609, + "learning_rate": 7.690105947711284e-05, + "loss": 4.7807, + "step": 2355 + }, + { + "epoch": 1.0679963735267453, + "grad_norm": 2.975650072219128, + "learning_rate": 7.68961721786441e-05, + "loss": 4.7435, + "step": 2356 + }, + { + "epoch": 1.0684496826835903, + "grad_norm": 3.2224576890140755, + "learning_rate": 7.689128118491528e-05, + "loss": 4.689, + "step": 2357 + }, + { + "epoch": 1.0689029918404351, + "grad_norm": 4.49404226118954, + "learning_rate": 7.688638649641625e-05, + "loss": 4.7083, + "step": 2358 + }, + { + "epoch": 1.0693563009972802, + "grad_norm": 3.4028010455065383, + "learning_rate": 7.68814881136372e-05, + "loss": 4.6854, + "step": 2359 + }, + { + "epoch": 1.069809610154125, + "grad_norm": 3.417154333709478, + "learning_rate": 7.687658603706871e-05, + "loss": 4.6616, + "step": 2360 + }, + { + "epoch": 1.07026291931097, + "grad_norm": 3.109779214135758, + "learning_rate": 7.687168026720176e-05, + "loss": 4.6825, + "step": 2361 + }, + { + "epoch": 1.0707162284678151, + "grad_norm": 5.144983619192727, + "learning_rate": 7.686677080452766e-05, + "loss": 4.6557, + "step": 2362 + }, + { + "epoch": 1.07116953762466, + "grad_norm": 2.647248585452473, + "learning_rate": 7.686185764953812e-05, + "loss": 4.559, + "step": 2363 + }, + { + "epoch": 1.071622846781505, + "grad_norm": 6.056998150479192, + "learning_rate": 7.68569408027252e-05, + "loss": 4.6205, + "step": 2364 + }, + { + "epoch": 1.07207615593835, + "grad_norm": 2.9256785892477324, + "learning_rate": 7.685202026458134e-05, + "loss": 4.5948, + "step": 2365 + }, + { + "epoch": 1.0725294650951949, + "grad_norm": 2.9901784413984482, + "learning_rate": 7.684709603559935e-05, + "loss": 4.5207, + "step": 2366 + }, + { + "epoch": 1.07298277425204, + "grad_norm": 3.9674019615395966, + "learning_rate": 7.684216811627238e-05, + "loss": 4.5315, + "step": 2367 + }, + { + "epoch": 1.0734360834088847, + "grad_norm": 2.972142747724148, + "learning_rate": 7.683723650709402e-05, + "loss": 4.5386, + "step": 2368 + }, + { + "epoch": 1.0738893925657298, + "grad_norm": 3.9312147811492673, + "learning_rate": 7.683230120855815e-05, + "loss": 4.5, + "step": 2369 + }, + { + "epoch": 1.0743427017225748, + "grad_norm": 2.6374426720542585, + "learning_rate": 7.682736222115907e-05, + "loss": 4.4408, + "step": 2370 + }, + { + "epoch": 1.0747960108794197, + "grad_norm": 4.274885092364638, + "learning_rate": 7.682241954539142e-05, + "loss": 4.4518, + "step": 2371 + }, + { + "epoch": 1.0752493200362647, + "grad_norm": 4.19526951150165, + "learning_rate": 7.681747318175025e-05, + "loss": 4.3835, + "step": 2372 + }, + { + "epoch": 1.0757026291931098, + "grad_norm": 2.3921944764250997, + "learning_rate": 7.681252313073092e-05, + "loss": 4.3843, + "step": 2373 + }, + { + "epoch": 1.0761559383499546, + "grad_norm": 4.810701961157943, + "learning_rate": 7.680756939282922e-05, + "loss": 4.3948, + "step": 2374 + }, + { + "epoch": 1.0766092475067996, + "grad_norm": 3.131931733551208, + "learning_rate": 7.680261196854126e-05, + "loss": 4.3439, + "step": 2375 + }, + { + "epoch": 1.0770625566636447, + "grad_norm": 4.569001551357258, + "learning_rate": 7.679765085836356e-05, + "loss": 4.3168, + "step": 2376 + }, + { + "epoch": 1.0775158658204895, + "grad_norm": 2.9895574266777274, + "learning_rate": 7.679268606279297e-05, + "loss": 4.3037, + "step": 2377 + }, + { + "epoch": 1.0779691749773346, + "grad_norm": 3.8921834062998424, + "learning_rate": 7.678771758232675e-05, + "loss": 4.2744, + "step": 2378 + }, + { + "epoch": 1.0784224841341796, + "grad_norm": 2.6288390211254096, + "learning_rate": 7.678274541746248e-05, + "loss": 4.2429, + "step": 2379 + }, + { + "epoch": 1.0788757932910245, + "grad_norm": 3.208514566156114, + "learning_rate": 7.677776956869817e-05, + "loss": 4.2045, + "step": 2380 + }, + { + "epoch": 1.0793291024478695, + "grad_norm": 4.530669103126331, + "learning_rate": 7.677279003653214e-05, + "loss": 4.1915, + "step": 2381 + }, + { + "epoch": 1.0797824116047143, + "grad_norm": 4.665701391689266, + "learning_rate": 7.676780682146312e-05, + "loss": 4.2446, + "step": 2382 + }, + { + "epoch": 1.0802357207615594, + "grad_norm": 4.181863311691966, + "learning_rate": 7.676281992399019e-05, + "loss": 4.1811, + "step": 2383 + }, + { + "epoch": 1.0806890299184044, + "grad_norm": 4.697938501375869, + "learning_rate": 7.67578293446128e-05, + "loss": 4.1898, + "step": 2384 + }, + { + "epoch": 1.0811423390752493, + "grad_norm": 3.407601097260339, + "learning_rate": 7.675283508383077e-05, + "loss": 4.1687, + "step": 2385 + }, + { + "epoch": 1.0815956482320943, + "grad_norm": 4.246301736407681, + "learning_rate": 7.674783714214429e-05, + "loss": 4.0947, + "step": 2386 + }, + { + "epoch": 1.0820489573889394, + "grad_norm": 3.3101422462549372, + "learning_rate": 7.674283552005392e-05, + "loss": 4.0747, + "step": 2387 + }, + { + "epoch": 1.0825022665457842, + "grad_norm": 3.946113429489755, + "learning_rate": 7.673783021806061e-05, + "loss": 4.0114, + "step": 2388 + }, + { + "epoch": 1.0829555757026292, + "grad_norm": 3.4848023400151082, + "learning_rate": 7.673282123666561e-05, + "loss": 3.9321, + "step": 2389 + }, + { + "epoch": 1.083408884859474, + "grad_norm": 4.972567838511231, + "learning_rate": 7.67278085763706e-05, + "loss": 4.0086, + "step": 2390 + }, + { + "epoch": 1.0838621940163191, + "grad_norm": 3.130310871185009, + "learning_rate": 7.672279223767764e-05, + "loss": 3.9045, + "step": 2391 + }, + { + "epoch": 1.0843155031731642, + "grad_norm": 4.802960928236241, + "learning_rate": 7.67177722210891e-05, + "loss": 3.9087, + "step": 2392 + }, + { + "epoch": 1.084768812330009, + "grad_norm": 5.202314546415333, + "learning_rate": 7.671274852710776e-05, + "loss": 3.8298, + "step": 2393 + }, + { + "epoch": 1.085222121486854, + "grad_norm": 3.7508633615579163, + "learning_rate": 7.670772115623676e-05, + "loss": 3.78, + "step": 2394 + }, + { + "epoch": 1.085675430643699, + "grad_norm": 6.304773991396164, + "learning_rate": 7.67026901089796e-05, + "loss": 3.8618, + "step": 2395 + }, + { + "epoch": 1.086128739800544, + "grad_norm": 6.390770431787671, + "learning_rate": 7.669765538584016e-05, + "loss": 3.7276, + "step": 2396 + }, + { + "epoch": 1.086582048957389, + "grad_norm": 3.5866551818913925, + "learning_rate": 7.669261698732269e-05, + "loss": 3.6477, + "step": 2397 + }, + { + "epoch": 1.087035358114234, + "grad_norm": 4.538345949554096, + "learning_rate": 7.668757491393179e-05, + "loss": 3.4833, + "step": 2398 + }, + { + "epoch": 1.0874886672710788, + "grad_norm": 5.779064664963691, + "learning_rate": 7.668252916617242e-05, + "loss": 3.2762, + "step": 2399 + }, + { + "epoch": 1.087941976427924, + "grad_norm": 7.5630192933270015, + "learning_rate": 7.667747974454996e-05, + "loss": 3.1313, + "step": 2400 + }, + { + "epoch": 1.0883952855847687, + "grad_norm": 8.763467322067829, + "learning_rate": 7.667242664957011e-05, + "loss": 2.7427, + "step": 2401 + }, + { + "epoch": 1.0888485947416138, + "grad_norm": 146.8084897121597, + "learning_rate": 7.666736988173897e-05, + "loss": 3.0558, + "step": 2402 + }, + { + "epoch": 1.0893019038984588, + "grad_norm": 17.93007801953832, + "learning_rate": 7.666230944156296e-05, + "loss": 3.3578, + "step": 2403 + }, + { + "epoch": 1.0897552130553037, + "grad_norm": 21.370476910915702, + "learning_rate": 7.665724532954889e-05, + "loss": 3.5115, + "step": 2404 + }, + { + "epoch": 1.0902085222121487, + "grad_norm": 73.94842449206689, + "learning_rate": 7.665217754620399e-05, + "loss": 4.7996, + "step": 2405 + }, + { + "epoch": 1.0906618313689938, + "grad_norm": 32.224541415865374, + "learning_rate": 7.664710609203578e-05, + "loss": 4.525, + "step": 2406 + }, + { + "epoch": 1.0911151405258386, + "grad_norm": 25.470087910366463, + "learning_rate": 7.66420309675522e-05, + "loss": 3.5655, + "step": 2407 + }, + { + "epoch": 1.0915684496826836, + "grad_norm": 9.028878238788725, + "learning_rate": 7.663695217326153e-05, + "loss": 2.8089, + "step": 2408 + }, + { + "epoch": 1.0920217588395285, + "grad_norm": 10.874227060185458, + "learning_rate": 7.663186970967242e-05, + "loss": 2.3529, + "step": 2409 + }, + { + "epoch": 1.0924750679963735, + "grad_norm": 3.746355789449805, + "learning_rate": 7.662678357729389e-05, + "loss": 1.7036, + "step": 2410 + }, + { + "epoch": 1.0929283771532186, + "grad_norm": 3.0732286491000003, + "learning_rate": 7.662169377663535e-05, + "loss": 1.415, + "step": 2411 + }, + { + "epoch": 1.0933816863100634, + "grad_norm": 2.195814918145043, + "learning_rate": 7.661660030820656e-05, + "loss": 1.2601, + "step": 2412 + }, + { + "epoch": 1.0938349954669084, + "grad_norm": 1.9478671034936976, + "learning_rate": 7.661150317251762e-05, + "loss": 1.1993, + "step": 2413 + }, + { + "epoch": 1.0942883046237535, + "grad_norm": 1.391282981818816, + "learning_rate": 7.660640237007905e-05, + "loss": 1.1046, + "step": 2414 + }, + { + "epoch": 1.0947416137805983, + "grad_norm": 2.332372771745188, + "learning_rate": 7.660129790140169e-05, + "loss": 1.0679, + "step": 2415 + }, + { + "epoch": 1.0951949229374434, + "grad_norm": 3.2810543061801094, + "learning_rate": 7.659618976699678e-05, + "loss": 1.0904, + "step": 2416 + }, + { + "epoch": 1.0956482320942884, + "grad_norm": 4.783434866604042, + "learning_rate": 7.659107796737591e-05, + "loss": 1.1597, + "step": 2417 + }, + { + "epoch": 1.0961015412511332, + "grad_norm": 1.4169900057492881, + "learning_rate": 7.658596250305104e-05, + "loss": 1.0635, + "step": 2418 + }, + { + "epoch": 1.0965548504079783, + "grad_norm": 2.129068635451705, + "learning_rate": 7.658084337453449e-05, + "loss": 1.0603, + "step": 2419 + }, + { + "epoch": 1.0970081595648231, + "grad_norm": 1.452141928685363, + "learning_rate": 7.657572058233899e-05, + "loss": 1.0662, + "step": 2420 + }, + { + "epoch": 1.0974614687216682, + "grad_norm": 3.4300726820737903, + "learning_rate": 7.657059412697756e-05, + "loss": 1.1361, + "step": 2421 + }, + { + "epoch": 1.0979147778785132, + "grad_norm": 2.3886914950271874, + "learning_rate": 7.656546400896367e-05, + "loss": 1.0868, + "step": 2422 + }, + { + "epoch": 1.098368087035358, + "grad_norm": 1.092510608795054, + "learning_rate": 7.656033022881108e-05, + "loss": 1.0606, + "step": 2423 + }, + { + "epoch": 1.098821396192203, + "grad_norm": 0.9375230914908772, + "learning_rate": 7.655519278703396e-05, + "loss": 1.0357, + "step": 2424 + }, + { + "epoch": 1.0992747053490481, + "grad_norm": 1.139610822744927, + "learning_rate": 7.655005168414686e-05, + "loss": 1.0064, + "step": 2425 + }, + { + "epoch": 1.099728014505893, + "grad_norm": 0.8332695051057878, + "learning_rate": 7.654490692066467e-05, + "loss": 0.9903, + "step": 2426 + }, + { + "epoch": 1.100181323662738, + "grad_norm": 1.2814771613044904, + "learning_rate": 7.653975849710264e-05, + "loss": 1.0036, + "step": 2427 + }, + { + "epoch": 1.1006346328195828, + "grad_norm": 1.4089558691819342, + "learning_rate": 7.653460641397642e-05, + "loss": 1.0381, + "step": 2428 + }, + { + "epoch": 1.101087941976428, + "grad_norm": 1.3099162445043189, + "learning_rate": 7.652945067180199e-05, + "loss": 0.9961, + "step": 2429 + }, + { + "epoch": 1.101541251133273, + "grad_norm": 1.1267567021261842, + "learning_rate": 7.652429127109572e-05, + "loss": 1.024, + "step": 2430 + }, + { + "epoch": 1.1019945602901178, + "grad_norm": 1.5904055784862956, + "learning_rate": 7.651912821237431e-05, + "loss": 0.9913, + "step": 2431 + }, + { + "epoch": 1.1024478694469628, + "grad_norm": 1.2487516800208966, + "learning_rate": 7.65139614961549e-05, + "loss": 0.9597, + "step": 2432 + }, + { + "epoch": 1.1029011786038079, + "grad_norm": 1.959454225873341, + "learning_rate": 7.650879112295494e-05, + "loss": 0.9849, + "step": 2433 + }, + { + "epoch": 1.1033544877606527, + "grad_norm": 1.5929326700362598, + "learning_rate": 7.650361709329226e-05, + "loss": 0.975, + "step": 2434 + }, + { + "epoch": 1.1038077969174978, + "grad_norm": 1.8047963560034164, + "learning_rate": 7.649843940768503e-05, + "loss": 0.9681, + "step": 2435 + }, + { + "epoch": 1.1042611060743428, + "grad_norm": 1.5581133083443275, + "learning_rate": 7.649325806665183e-05, + "loss": 0.9457, + "step": 2436 + }, + { + "epoch": 1.1047144152311876, + "grad_norm": 1.5609699840842997, + "learning_rate": 7.648807307071158e-05, + "loss": 0.9635, + "step": 2437 + }, + { + "epoch": 1.1051677243880327, + "grad_norm": 2.0459035237511713, + "learning_rate": 7.648288442038357e-05, + "loss": 1.016, + "step": 2438 + }, + { + "epoch": 1.1056210335448775, + "grad_norm": 1.3571992925540473, + "learning_rate": 7.647769211618747e-05, + "loss": 0.9861, + "step": 2439 + }, + { + "epoch": 1.1060743427017226, + "grad_norm": 0.990632716941994, + "learning_rate": 7.647249615864331e-05, + "loss": 1.0008, + "step": 2440 + }, + { + "epoch": 1.1065276518585676, + "grad_norm": 1.4706179534010668, + "learning_rate": 7.646729654827145e-05, + "loss": 0.9701, + "step": 2441 + }, + { + "epoch": 1.1069809610154124, + "grad_norm": 0.8910500972462858, + "learning_rate": 7.646209328559268e-05, + "loss": 0.9682, + "step": 2442 + }, + { + "epoch": 1.1074342701722575, + "grad_norm": 1.4283472784253217, + "learning_rate": 7.645688637112811e-05, + "loss": 0.9548, + "step": 2443 + }, + { + "epoch": 1.1078875793291025, + "grad_norm": 1.1043942541295964, + "learning_rate": 7.645167580539922e-05, + "loss": 0.9437, + "step": 2444 + }, + { + "epoch": 1.1083408884859474, + "grad_norm": 1.8738190408695377, + "learning_rate": 7.644646158892785e-05, + "loss": 0.9316, + "step": 2445 + }, + { + "epoch": 1.1087941976427924, + "grad_norm": 1.4746900057808436, + "learning_rate": 7.644124372223626e-05, + "loss": 0.971, + "step": 2446 + }, + { + "epoch": 1.1092475067996372, + "grad_norm": 1.1493014519251057, + "learning_rate": 7.643602220584701e-05, + "loss": 0.9264, + "step": 2447 + }, + { + "epoch": 1.1097008159564823, + "grad_norm": 1.3187134610505222, + "learning_rate": 7.643079704028304e-05, + "loss": 0.9422, + "step": 2448 + }, + { + "epoch": 1.1101541251133273, + "grad_norm": 1.1095596928438682, + "learning_rate": 7.64255682260677e-05, + "loss": 0.9392, + "step": 2449 + }, + { + "epoch": 1.1106074342701722, + "grad_norm": 1.4266491678899524, + "learning_rate": 7.642033576372463e-05, + "loss": 0.9425, + "step": 2450 + }, + { + "epoch": 1.1110607434270172, + "grad_norm": 1.009834238705206, + "learning_rate": 7.64150996537779e-05, + "loss": 0.9155, + "step": 2451 + }, + { + "epoch": 1.1115140525838623, + "grad_norm": 1.433696949184942, + "learning_rate": 7.640985989675191e-05, + "loss": 0.9407, + "step": 2452 + }, + { + "epoch": 1.111967361740707, + "grad_norm": 1.0666704419903505, + "learning_rate": 7.640461649317146e-05, + "loss": 0.9336, + "step": 2453 + }, + { + "epoch": 1.1124206708975521, + "grad_norm": 1.2984760240676392, + "learning_rate": 7.639936944356168e-05, + "loss": 0.9417, + "step": 2454 + }, + { + "epoch": 1.1128739800543972, + "grad_norm": 1.1918224594598372, + "learning_rate": 7.639411874844806e-05, + "loss": 0.915, + "step": 2455 + }, + { + "epoch": 1.113327289211242, + "grad_norm": 1.1399726354631918, + "learning_rate": 7.638886440835649e-05, + "loss": 0.9269, + "step": 2456 + }, + { + "epoch": 1.113780598368087, + "grad_norm": 1.1259936713883603, + "learning_rate": 7.638360642381321e-05, + "loss": 0.9201, + "step": 2457 + }, + { + "epoch": 1.1142339075249321, + "grad_norm": 1.0193578753805663, + "learning_rate": 7.637834479534482e-05, + "loss": 0.9327, + "step": 2458 + }, + { + "epoch": 1.114687216681777, + "grad_norm": 1.1044841956360525, + "learning_rate": 7.637307952347828e-05, + "loss": 0.9497, + "step": 2459 + }, + { + "epoch": 1.115140525838622, + "grad_norm": 1.012558911050277, + "learning_rate": 7.636781060874092e-05, + "loss": 0.9082, + "step": 2460 + }, + { + "epoch": 1.1155938349954668, + "grad_norm": 1.4830525053645778, + "learning_rate": 7.636253805166045e-05, + "loss": 0.9208, + "step": 2461 + }, + { + "epoch": 1.1160471441523119, + "grad_norm": 1.146971627522293, + "learning_rate": 7.635726185276494e-05, + "loss": 0.9358, + "step": 2462 + }, + { + "epoch": 1.116500453309157, + "grad_norm": 1.023974811328896, + "learning_rate": 7.635198201258278e-05, + "loss": 0.9309, + "step": 2463 + }, + { + "epoch": 1.1169537624660018, + "grad_norm": 0.8935474185989489, + "learning_rate": 7.63466985316428e-05, + "loss": 0.9273, + "step": 2464 + }, + { + "epoch": 1.1174070716228468, + "grad_norm": 1.0958782202499184, + "learning_rate": 7.634141141047414e-05, + "loss": 0.9256, + "step": 2465 + }, + { + "epoch": 1.1178603807796919, + "grad_norm": 1.4887624159912314, + "learning_rate": 7.633612064960632e-05, + "loss": 0.9266, + "step": 2466 + }, + { + "epoch": 1.1183136899365367, + "grad_norm": 1.015838922047579, + "learning_rate": 7.633082624956922e-05, + "loss": 0.9111, + "step": 2467 + }, + { + "epoch": 1.1187669990933817, + "grad_norm": 0.8695679020538793, + "learning_rate": 7.63255282108931e-05, + "loss": 0.9355, + "step": 2468 + }, + { + "epoch": 1.1192203082502266, + "grad_norm": 0.8703763236121019, + "learning_rate": 7.632022653410857e-05, + "loss": 0.9207, + "step": 2469 + }, + { + "epoch": 1.1196736174070716, + "grad_norm": 0.812331056610596, + "learning_rate": 7.631492121974662e-05, + "loss": 0.9301, + "step": 2470 + }, + { + "epoch": 1.1201269265639167, + "grad_norm": 0.7181298956805691, + "learning_rate": 7.630961226833859e-05, + "loss": 0.8956, + "step": 2471 + }, + { + "epoch": 1.1205802357207615, + "grad_norm": 0.808461751198003, + "learning_rate": 7.630429968041615e-05, + "loss": 0.9347, + "step": 2472 + }, + { + "epoch": 1.1210335448776065, + "grad_norm": 1.007645175733487, + "learning_rate": 7.629898345651141e-05, + "loss": 0.9135, + "step": 2473 + }, + { + "epoch": 1.1214868540344516, + "grad_norm": 1.3499517384440476, + "learning_rate": 7.629366359715681e-05, + "loss": 0.9187, + "step": 2474 + }, + { + "epoch": 1.1219401631912964, + "grad_norm": 1.2773696189994, + "learning_rate": 7.628834010288513e-05, + "loss": 0.9335, + "step": 2475 + }, + { + "epoch": 1.1223934723481415, + "grad_norm": 1.165589796024637, + "learning_rate": 7.628301297422953e-05, + "loss": 0.9168, + "step": 2476 + }, + { + "epoch": 1.1228467815049865, + "grad_norm": 0.6914705711250302, + "learning_rate": 7.627768221172356e-05, + "loss": 0.9206, + "step": 2477 + }, + { + "epoch": 1.1233000906618313, + "grad_norm": 0.8911036953948623, + "learning_rate": 7.627234781590108e-05, + "loss": 0.9118, + "step": 2478 + }, + { + "epoch": 1.1237533998186764, + "grad_norm": 1.174356734574226, + "learning_rate": 7.626700978729637e-05, + "loss": 0.9165, + "step": 2479 + }, + { + "epoch": 1.1242067089755212, + "grad_norm": 1.4243606815522214, + "learning_rate": 7.626166812644406e-05, + "loss": 0.9031, + "step": 2480 + }, + { + "epoch": 1.1246600181323663, + "grad_norm": 1.0104136059650182, + "learning_rate": 7.62563228338791e-05, + "loss": 0.9371, + "step": 2481 + }, + { + "epoch": 1.1251133272892113, + "grad_norm": 1.3679607130559739, + "learning_rate": 7.625097391013686e-05, + "loss": 0.9216, + "step": 2482 + }, + { + "epoch": 1.1255666364460561, + "grad_norm": 0.7605776028218899, + "learning_rate": 7.624562135575305e-05, + "loss": 0.9193, + "step": 2483 + }, + { + "epoch": 1.1260199456029012, + "grad_norm": 1.097170344691278, + "learning_rate": 7.624026517126372e-05, + "loss": 0.9251, + "step": 2484 + }, + { + "epoch": 1.126473254759746, + "grad_norm": 1.3643178901497581, + "learning_rate": 7.623490535720533e-05, + "loss": 0.9204, + "step": 2485 + }, + { + "epoch": 1.126926563916591, + "grad_norm": 0.6805111504924223, + "learning_rate": 7.622954191411469e-05, + "loss": 0.9136, + "step": 2486 + }, + { + "epoch": 1.1273798730734361, + "grad_norm": 1.390267924217174, + "learning_rate": 7.622417484252893e-05, + "loss": 0.916, + "step": 2487 + }, + { + "epoch": 1.127833182230281, + "grad_norm": 0.6533919223161649, + "learning_rate": 7.621880414298562e-05, + "loss": 0.9226, + "step": 2488 + }, + { + "epoch": 1.128286491387126, + "grad_norm": 1.1872894974238155, + "learning_rate": 7.621342981602261e-05, + "loss": 0.9172, + "step": 2489 + }, + { + "epoch": 1.128739800543971, + "grad_norm": 0.9217151741251335, + "learning_rate": 7.620805186217818e-05, + "loss": 0.9202, + "step": 2490 + }, + { + "epoch": 1.1291931097008159, + "grad_norm": 0.9842007561608125, + "learning_rate": 7.620267028199095e-05, + "loss": 0.9219, + "step": 2491 + }, + { + "epoch": 1.129646418857661, + "grad_norm": 1.1846503608849615, + "learning_rate": 7.619728507599989e-05, + "loss": 0.9174, + "step": 2492 + }, + { + "epoch": 1.130099728014506, + "grad_norm": 1.472483558658305, + "learning_rate": 7.619189624474434e-05, + "loss": 0.9138, + "step": 2493 + }, + { + "epoch": 1.1305530371713508, + "grad_norm": 0.9306921584233746, + "learning_rate": 7.618650378876402e-05, + "loss": 0.9193, + "step": 2494 + }, + { + "epoch": 1.1310063463281959, + "grad_norm": 0.9633637319534175, + "learning_rate": 7.6181107708599e-05, + "loss": 0.907, + "step": 2495 + }, + { + "epoch": 1.131459655485041, + "grad_norm": 1.0971287559002065, + "learning_rate": 7.617570800478972e-05, + "loss": 0.9167, + "step": 2496 + }, + { + "epoch": 1.1319129646418857, + "grad_norm": 1.4526608956706664, + "learning_rate": 7.617030467787693e-05, + "loss": 0.9026, + "step": 2497 + }, + { + "epoch": 1.1323662737987308, + "grad_norm": 0.9475142097255123, + "learning_rate": 7.616489772840185e-05, + "loss": 0.912, + "step": 2498 + }, + { + "epoch": 1.1328195829555756, + "grad_norm": 0.8775327521121625, + "learning_rate": 7.615948715690597e-05, + "loss": 0.9284, + "step": 2499 + }, + { + "epoch": 1.1332728921124207, + "grad_norm": 1.0582659214031427, + "learning_rate": 7.615407296393119e-05, + "loss": 0.9184, + "step": 2500 + }, + { + "epoch": 1.1337262012692657, + "grad_norm": 1.5271819299876443, + "learning_rate": 7.614865515001974e-05, + "loss": 0.9231, + "step": 2501 + }, + { + "epoch": 1.1341795104261105, + "grad_norm": 1.0015893797072324, + "learning_rate": 7.614323371571424e-05, + "loss": 0.9146, + "step": 2502 + }, + { + "epoch": 1.1346328195829556, + "grad_norm": 1.2164207747997662, + "learning_rate": 7.613780866155764e-05, + "loss": 0.9297, + "step": 2503 + }, + { + "epoch": 1.1350861287398006, + "grad_norm": 0.7889813222080632, + "learning_rate": 7.613237998809333e-05, + "loss": 0.9235, + "step": 2504 + }, + { + "epoch": 1.1355394378966455, + "grad_norm": 0.7859768731989358, + "learning_rate": 7.612694769586494e-05, + "loss": 0.9322, + "step": 2505 + }, + { + "epoch": 1.1359927470534905, + "grad_norm": 0.9111834734628061, + "learning_rate": 7.612151178541659e-05, + "loss": 0.91, + "step": 2506 + }, + { + "epoch": 1.1364460562103353, + "grad_norm": 1.6831698668773063, + "learning_rate": 7.611607225729267e-05, + "loss": 0.9041, + "step": 2507 + }, + { + "epoch": 1.1368993653671804, + "grad_norm": 0.9278636638639448, + "learning_rate": 7.611062911203796e-05, + "loss": 0.91, + "step": 2508 + }, + { + "epoch": 1.1373526745240254, + "grad_norm": 1.1417805485928099, + "learning_rate": 7.610518235019761e-05, + "loss": 0.8866, + "step": 2509 + }, + { + "epoch": 1.1378059836808703, + "grad_norm": 0.8359143911587421, + "learning_rate": 7.609973197231717e-05, + "loss": 0.9157, + "step": 2510 + }, + { + "epoch": 1.1382592928377153, + "grad_norm": 0.9312356921696457, + "learning_rate": 7.609427797894244e-05, + "loss": 0.8979, + "step": 2511 + }, + { + "epoch": 1.1387126019945604, + "grad_norm": 1.022535736314734, + "learning_rate": 7.608882037061971e-05, + "loss": 0.9055, + "step": 2512 + }, + { + "epoch": 1.1391659111514052, + "grad_norm": 1.503163175442753, + "learning_rate": 7.608335914789556e-05, + "loss": 0.8992, + "step": 2513 + }, + { + "epoch": 1.1396192203082502, + "grad_norm": 1.4210352241046167, + "learning_rate": 7.607789431131693e-05, + "loss": 0.8897, + "step": 2514 + }, + { + "epoch": 1.1400725294650953, + "grad_norm": 0.5385180223230056, + "learning_rate": 7.607242586143116e-05, + "loss": 0.9154, + "step": 2515 + }, + { + "epoch": 1.1405258386219401, + "grad_norm": 1.1859766645777048, + "learning_rate": 7.606695379878594e-05, + "loss": 0.9441, + "step": 2516 + }, + { + "epoch": 1.1409791477787852, + "grad_norm": 1.6895843831205228, + "learning_rate": 7.606147812392927e-05, + "loss": 0.9107, + "step": 2517 + }, + { + "epoch": 1.1414324569356302, + "grad_norm": 0.8565771715782029, + "learning_rate": 7.60559988374096e-05, + "loss": 0.8997, + "step": 2518 + }, + { + "epoch": 1.141885766092475, + "grad_norm": 1.1209485071380931, + "learning_rate": 7.605051593977568e-05, + "loss": 0.8853, + "step": 2519 + }, + { + "epoch": 1.14233907524932, + "grad_norm": 1.343868933374682, + "learning_rate": 7.604502943157665e-05, + "loss": 0.908, + "step": 2520 + }, + { + "epoch": 1.142792384406165, + "grad_norm": 1.2485935137366657, + "learning_rate": 7.603953931336198e-05, + "loss": 0.9288, + "step": 2521 + }, + { + "epoch": 1.14324569356301, + "grad_norm": 1.3532240494401604, + "learning_rate": 7.603404558568151e-05, + "loss": 0.9002, + "step": 2522 + }, + { + "epoch": 1.143699002719855, + "grad_norm": 0.9149357600666658, + "learning_rate": 7.602854824908548e-05, + "loss": 0.9175, + "step": 2523 + }, + { + "epoch": 1.1441523118766999, + "grad_norm": 1.04936557555316, + "learning_rate": 7.602304730412447e-05, + "loss": 0.914, + "step": 2524 + }, + { + "epoch": 1.144605621033545, + "grad_norm": 0.9212649155702058, + "learning_rate": 7.601754275134938e-05, + "loss": 0.9126, + "step": 2525 + }, + { + "epoch": 1.1450589301903897, + "grad_norm": 0.932651325087658, + "learning_rate": 7.601203459131156e-05, + "loss": 0.8986, + "step": 2526 + }, + { + "epoch": 1.1455122393472348, + "grad_norm": 1.4385094334184956, + "learning_rate": 7.600652282456263e-05, + "loss": 0.9165, + "step": 2527 + }, + { + "epoch": 1.1459655485040798, + "grad_norm": 1.2077042690003126, + "learning_rate": 7.600100745165462e-05, + "loss": 0.9392, + "step": 2528 + }, + { + "epoch": 1.1464188576609247, + "grad_norm": 1.161081611810291, + "learning_rate": 7.599548847313989e-05, + "loss": 0.895, + "step": 2529 + }, + { + "epoch": 1.1468721668177697, + "grad_norm": 0.9915851674186564, + "learning_rate": 7.598996588957121e-05, + "loss": 0.9079, + "step": 2530 + }, + { + "epoch": 1.1473254759746148, + "grad_norm": 1.555088850165638, + "learning_rate": 7.598443970150167e-05, + "loss": 0.9129, + "step": 2531 + }, + { + "epoch": 1.1477787851314596, + "grad_norm": 0.8546076408452524, + "learning_rate": 7.597890990948475e-05, + "loss": 0.9151, + "step": 2532 + }, + { + "epoch": 1.1482320942883046, + "grad_norm": 0.8038005981336634, + "learning_rate": 7.597337651407426e-05, + "loss": 0.897, + "step": 2533 + }, + { + "epoch": 1.1486854034451497, + "grad_norm": 0.9638697279369236, + "learning_rate": 7.596783951582438e-05, + "loss": 0.9274, + "step": 2534 + }, + { + "epoch": 1.1491387126019945, + "grad_norm": 1.3712369069835817, + "learning_rate": 7.596229891528966e-05, + "loss": 0.9033, + "step": 2535 + }, + { + "epoch": 1.1495920217588396, + "grad_norm": 0.825817306897089, + "learning_rate": 7.595675471302503e-05, + "loss": 0.8914, + "step": 2536 + }, + { + "epoch": 1.1500453309156846, + "grad_norm": 1.363891096200731, + "learning_rate": 7.595120690958573e-05, + "loss": 0.8805, + "step": 2537 + }, + { + "epoch": 1.1504986400725294, + "grad_norm": 1.1947621466232174, + "learning_rate": 7.59456555055274e-05, + "loss": 0.9182, + "step": 2538 + }, + { + "epoch": 1.1509519492293745, + "grad_norm": 0.9677551862483756, + "learning_rate": 7.594010050140602e-05, + "loss": 0.8963, + "step": 2539 + }, + { + "epoch": 1.1514052583862193, + "grad_norm": 1.2571966850173217, + "learning_rate": 7.593454189777797e-05, + "loss": 0.8964, + "step": 2540 + }, + { + "epoch": 1.1518585675430644, + "grad_norm": 0.9208543285835852, + "learning_rate": 7.592897969519993e-05, + "loss": 0.8932, + "step": 2541 + }, + { + "epoch": 1.1523118766999094, + "grad_norm": 1.1032560853094908, + "learning_rate": 7.592341389422897e-05, + "loss": 0.9149, + "step": 2542 + }, + { + "epoch": 1.1527651858567542, + "grad_norm": 1.4800792449400988, + "learning_rate": 7.591784449542254e-05, + "loss": 0.8924, + "step": 2543 + }, + { + "epoch": 1.1532184950135993, + "grad_norm": 0.9366647358575119, + "learning_rate": 7.591227149933842e-05, + "loss": 0.9124, + "step": 2544 + }, + { + "epoch": 1.1536718041704441, + "grad_norm": 1.0088030415222509, + "learning_rate": 7.590669490653478e-05, + "loss": 0.9281, + "step": 2545 + }, + { + "epoch": 1.1541251133272892, + "grad_norm": 0.9028450410959744, + "learning_rate": 7.59011147175701e-05, + "loss": 0.9385, + "step": 2546 + }, + { + "epoch": 1.1545784224841342, + "grad_norm": 0.8259817147521045, + "learning_rate": 7.589553093300328e-05, + "loss": 0.9043, + "step": 2547 + }, + { + "epoch": 1.155031731640979, + "grad_norm": 1.4346055352371538, + "learning_rate": 7.588994355339353e-05, + "loss": 0.9151, + "step": 2548 + }, + { + "epoch": 1.155485040797824, + "grad_norm": 0.9892515695564202, + "learning_rate": 7.588435257930047e-05, + "loss": 0.9277, + "step": 2549 + }, + { + "epoch": 1.1559383499546692, + "grad_norm": 1.4684712179761417, + "learning_rate": 7.587875801128402e-05, + "loss": 0.9027, + "step": 2550 + }, + { + "epoch": 1.156391659111514, + "grad_norm": 0.7811831251307954, + "learning_rate": 7.587315984990452e-05, + "loss": 0.8945, + "step": 2551 + }, + { + "epoch": 1.156844968268359, + "grad_norm": 0.8768445419436297, + "learning_rate": 7.586755809572263e-05, + "loss": 0.9185, + "step": 2552 + }, + { + "epoch": 1.157298277425204, + "grad_norm": 1.177639924039331, + "learning_rate": 7.586195274929939e-05, + "loss": 0.8989, + "step": 2553 + }, + { + "epoch": 1.157751586582049, + "grad_norm": 1.26906065150842, + "learning_rate": 7.585634381119617e-05, + "loss": 0.9115, + "step": 2554 + }, + { + "epoch": 1.158204895738894, + "grad_norm": 1.2913527725248244, + "learning_rate": 7.585073128197474e-05, + "loss": 0.9202, + "step": 2555 + }, + { + "epoch": 1.158658204895739, + "grad_norm": 0.9481087338570369, + "learning_rate": 7.584511516219723e-05, + "loss": 0.9003, + "step": 2556 + }, + { + "epoch": 1.1591115140525838, + "grad_norm": 0.9717179926389582, + "learning_rate": 7.583949545242606e-05, + "loss": 0.905, + "step": 2557 + }, + { + "epoch": 1.1595648232094289, + "grad_norm": 1.0743527932004222, + "learning_rate": 7.583387215322412e-05, + "loss": 0.9173, + "step": 2558 + }, + { + "epoch": 1.1600181323662737, + "grad_norm": 1.55460288529018, + "learning_rate": 7.582824526515455e-05, + "loss": 0.8988, + "step": 2559 + }, + { + "epoch": 1.1604714415231188, + "grad_norm": 0.941172773993807, + "learning_rate": 7.582261478878093e-05, + "loss": 0.9227, + "step": 2560 + }, + { + "epoch": 1.1609247506799638, + "grad_norm": 1.1106782394955375, + "learning_rate": 7.581698072466715e-05, + "loss": 0.9024, + "step": 2561 + }, + { + "epoch": 1.1613780598368086, + "grad_norm": 1.1922328609266923, + "learning_rate": 7.581134307337748e-05, + "loss": 0.9008, + "step": 2562 + }, + { + "epoch": 1.1618313689936537, + "grad_norm": 1.511977561625631, + "learning_rate": 7.580570183547657e-05, + "loss": 0.8876, + "step": 2563 + }, + { + "epoch": 1.1622846781504985, + "grad_norm": 0.8459893617632162, + "learning_rate": 7.580005701152938e-05, + "loss": 0.897, + "step": 2564 + }, + { + "epoch": 1.1627379873073436, + "grad_norm": 1.043130199894637, + "learning_rate": 7.579440860210126e-05, + "loss": 0.8886, + "step": 2565 + }, + { + "epoch": 1.1631912964641886, + "grad_norm": 1.1660976348126435, + "learning_rate": 7.578875660775793e-05, + "loss": 0.9111, + "step": 2566 + }, + { + "epoch": 1.1636446056210334, + "grad_norm": 1.3563675283910377, + "learning_rate": 7.578310102906546e-05, + "loss": 0.9044, + "step": 2567 + }, + { + "epoch": 1.1640979147778785, + "grad_norm": 1.0404595819540663, + "learning_rate": 7.577744186659024e-05, + "loss": 0.8995, + "step": 2568 + }, + { + "epoch": 1.1645512239347235, + "grad_norm": 1.5086139584840645, + "learning_rate": 7.577177912089907e-05, + "loss": 0.928, + "step": 2569 + }, + { + "epoch": 1.1650045330915684, + "grad_norm": 0.5528351344081979, + "learning_rate": 7.576611279255909e-05, + "loss": 0.9045, + "step": 2570 + }, + { + "epoch": 1.1654578422484134, + "grad_norm": 1.3048300757444824, + "learning_rate": 7.57604428821378e-05, + "loss": 0.9234, + "step": 2571 + }, + { + "epoch": 1.1659111514052585, + "grad_norm": 1.2997522902374257, + "learning_rate": 7.575476939020304e-05, + "loss": 0.894, + "step": 2572 + }, + { + "epoch": 1.1663644605621033, + "grad_norm": 1.0592137658864458, + "learning_rate": 7.574909231732307e-05, + "loss": 0.9158, + "step": 2573 + }, + { + "epoch": 1.1668177697189483, + "grad_norm": 1.2537558784035714, + "learning_rate": 7.574341166406644e-05, + "loss": 0.8945, + "step": 2574 + }, + { + "epoch": 1.1672710788757934, + "grad_norm": 0.8674377825008158, + "learning_rate": 7.573772743100207e-05, + "loss": 0.9371, + "step": 2575 + }, + { + "epoch": 1.1677243880326382, + "grad_norm": 1.1560085041517394, + "learning_rate": 7.573203961869927e-05, + "loss": 0.8983, + "step": 2576 + }, + { + "epoch": 1.1681776971894833, + "grad_norm": 0.9042942486061998, + "learning_rate": 7.572634822772769e-05, + "loss": 0.9196, + "step": 2577 + }, + { + "epoch": 1.168631006346328, + "grad_norm": 1.3063598672672798, + "learning_rate": 7.572065325865733e-05, + "loss": 0.8997, + "step": 2578 + }, + { + "epoch": 1.1690843155031732, + "grad_norm": 0.9140098752941759, + "learning_rate": 7.571495471205858e-05, + "loss": 0.9024, + "step": 2579 + }, + { + "epoch": 1.1695376246600182, + "grad_norm": 0.9580370095668953, + "learning_rate": 7.570925258850213e-05, + "loss": 0.9232, + "step": 2580 + }, + { + "epoch": 1.169990933816863, + "grad_norm": 1.2970026515606206, + "learning_rate": 7.570354688855911e-05, + "loss": 0.9118, + "step": 2581 + }, + { + "epoch": 1.170444242973708, + "grad_norm": 1.2729236936409603, + "learning_rate": 7.569783761280093e-05, + "loss": 0.9033, + "step": 2582 + }, + { + "epoch": 1.1708975521305531, + "grad_norm": 0.915531121794545, + "learning_rate": 7.56921247617994e-05, + "loss": 0.9064, + "step": 2583 + }, + { + "epoch": 1.171350861287398, + "grad_norm": 0.7837210571598561, + "learning_rate": 7.568640833612666e-05, + "loss": 0.9091, + "step": 2584 + }, + { + "epoch": 1.171804170444243, + "grad_norm": 0.8459915595379177, + "learning_rate": 7.568068833635526e-05, + "loss": 0.9154, + "step": 2585 + }, + { + "epoch": 1.1722574796010878, + "grad_norm": 1.403026487243595, + "learning_rate": 7.567496476305806e-05, + "loss": 0.8976, + "step": 2586 + }, + { + "epoch": 1.1727107887579329, + "grad_norm": 1.0947341757908708, + "learning_rate": 7.566923761680828e-05, + "loss": 0.9216, + "step": 2587 + }, + { + "epoch": 1.173164097914778, + "grad_norm": 1.1444152651896877, + "learning_rate": 7.566350689817954e-05, + "loss": 0.9094, + "step": 2588 + }, + { + "epoch": 1.1736174070716228, + "grad_norm": 0.721531629063778, + "learning_rate": 7.565777260774576e-05, + "loss": 0.8845, + "step": 2589 + }, + { + "epoch": 1.1740707162284678, + "grad_norm": 0.47389687558665466, + "learning_rate": 7.565203474608126e-05, + "loss": 0.8979, + "step": 2590 + }, + { + "epoch": 1.1745240253853129, + "grad_norm": 0.7297903795478407, + "learning_rate": 7.56462933137607e-05, + "loss": 0.894, + "step": 2591 + }, + { + "epoch": 1.1749773345421577, + "grad_norm": 1.1368486100385988, + "learning_rate": 7.564054831135911e-05, + "loss": 0.9095, + "step": 2592 + }, + { + "epoch": 1.1754306436990027, + "grad_norm": 1.430234405634418, + "learning_rate": 7.563479973945186e-05, + "loss": 0.9066, + "step": 2593 + }, + { + "epoch": 1.1758839528558478, + "grad_norm": 0.8941898739479598, + "learning_rate": 7.562904759861467e-05, + "loss": 0.9166, + "step": 2594 + }, + { + "epoch": 1.1763372620126926, + "grad_norm": 1.0823467345578222, + "learning_rate": 7.562329188942366e-05, + "loss": 0.9221, + "step": 2595 + }, + { + "epoch": 1.1767905711695377, + "grad_norm": 1.0339334985644784, + "learning_rate": 7.561753261245528e-05, + "loss": 0.9211, + "step": 2596 + }, + { + "epoch": 1.1772438803263825, + "grad_norm": 1.4217404586365392, + "learning_rate": 7.561176976828632e-05, + "loss": 0.8999, + "step": 2597 + }, + { + "epoch": 1.1776971894832275, + "grad_norm": 0.8994501375759059, + "learning_rate": 7.560600335749398e-05, + "loss": 0.9206, + "step": 2598 + }, + { + "epoch": 1.1781504986400726, + "grad_norm": 1.1773280373775339, + "learning_rate": 7.560023338065574e-05, + "loss": 0.896, + "step": 2599 + }, + { + "epoch": 1.1786038077969174, + "grad_norm": 0.883565403360514, + "learning_rate": 7.559445983834951e-05, + "loss": 0.9033, + "step": 2600 + }, + { + "epoch": 1.1790571169537625, + "grad_norm": 1.200158202234469, + "learning_rate": 7.55886827311535e-05, + "loss": 0.8924, + "step": 2601 + }, + { + "epoch": 1.1795104261106075, + "grad_norm": 1.2745227279743483, + "learning_rate": 7.558290205964632e-05, + "loss": 0.9089, + "step": 2602 + }, + { + "epoch": 1.1799637352674524, + "grad_norm": 0.9728126168860493, + "learning_rate": 7.557711782440694e-05, + "loss": 0.9029, + "step": 2603 + }, + { + "epoch": 1.1804170444242974, + "grad_norm": 1.4680459273216226, + "learning_rate": 7.557133002601465e-05, + "loss": 0.8999, + "step": 2604 + }, + { + "epoch": 1.1808703535811422, + "grad_norm": 0.5381806992809064, + "learning_rate": 7.55655386650491e-05, + "loss": 0.8952, + "step": 2605 + }, + { + "epoch": 1.1813236627379873, + "grad_norm": 1.1357832029562371, + "learning_rate": 7.555974374209035e-05, + "loss": 0.8922, + "step": 2606 + }, + { + "epoch": 1.1817769718948323, + "grad_norm": 1.403966335894116, + "learning_rate": 7.555394525771874e-05, + "loss": 0.9089, + "step": 2607 + }, + { + "epoch": 1.1822302810516772, + "grad_norm": 0.8462602182312048, + "learning_rate": 7.554814321251502e-05, + "loss": 0.9206, + "step": 2608 + }, + { + "epoch": 1.1826835902085222, + "grad_norm": 1.6106037251912577, + "learning_rate": 7.554233760706028e-05, + "loss": 0.8978, + "step": 2609 + }, + { + "epoch": 1.1831368993653673, + "grad_norm": 0.8502276678738561, + "learning_rate": 7.553652844193599e-05, + "loss": 0.8999, + "step": 2610 + }, + { + "epoch": 1.183590208522212, + "grad_norm": 1.1351174253171614, + "learning_rate": 7.553071571772393e-05, + "loss": 0.9047, + "step": 2611 + }, + { + "epoch": 1.1840435176790571, + "grad_norm": 1.3217108594022002, + "learning_rate": 7.552489943500626e-05, + "loss": 0.926, + "step": 2612 + }, + { + "epoch": 1.1844968268359022, + "grad_norm": 1.4748910644756517, + "learning_rate": 7.55190795943655e-05, + "loss": 0.8684, + "step": 2613 + }, + { + "epoch": 1.184950135992747, + "grad_norm": 0.8434609778443781, + "learning_rate": 7.551325619638455e-05, + "loss": 0.9156, + "step": 2614 + }, + { + "epoch": 1.185403445149592, + "grad_norm": 1.673707476901838, + "learning_rate": 7.55074292416466e-05, + "loss": 0.9196, + "step": 2615 + }, + { + "epoch": 1.185856754306437, + "grad_norm": 0.7427058456753135, + "learning_rate": 7.550159873073527e-05, + "loss": 0.9253, + "step": 2616 + }, + { + "epoch": 1.186310063463282, + "grad_norm": 1.8470255622612186, + "learning_rate": 7.549576466423449e-05, + "loss": 0.9236, + "step": 2617 + }, + { + "epoch": 1.186763372620127, + "grad_norm": 1.0418666383818267, + "learning_rate": 7.548992704272856e-05, + "loss": 0.9145, + "step": 2618 + }, + { + "epoch": 1.1872166817769718, + "grad_norm": 2.349151745715469, + "learning_rate": 7.548408586680212e-05, + "loss": 0.8904, + "step": 2619 + }, + { + "epoch": 1.1876699909338169, + "grad_norm": 2.1347961178307138, + "learning_rate": 7.547824113704021e-05, + "loss": 0.9218, + "step": 2620 + }, + { + "epoch": 1.188123300090662, + "grad_norm": 1.2925493224659417, + "learning_rate": 7.547239285402818e-05, + "loss": 0.9318, + "step": 2621 + }, + { + "epoch": 1.1885766092475067, + "grad_norm": 1.5854116788266157, + "learning_rate": 7.546654101835174e-05, + "loss": 0.9335, + "step": 2622 + }, + { + "epoch": 1.1890299184043518, + "grad_norm": 1.2864312829118831, + "learning_rate": 7.5460685630597e-05, + "loss": 0.9008, + "step": 2623 + }, + { + "epoch": 1.1894832275611966, + "grad_norm": 1.161844792825842, + "learning_rate": 7.545482669135037e-05, + "loss": 0.9094, + "step": 2624 + }, + { + "epoch": 1.1899365367180417, + "grad_norm": 1.0650017475228541, + "learning_rate": 7.544896420119865e-05, + "loss": 0.9242, + "step": 2625 + }, + { + "epoch": 1.1903898458748867, + "grad_norm": 1.276221157246264, + "learning_rate": 7.544309816072898e-05, + "loss": 0.9086, + "step": 2626 + }, + { + "epoch": 1.1908431550317315, + "grad_norm": 0.9674619073456585, + "learning_rate": 7.543722857052885e-05, + "loss": 0.9031, + "step": 2627 + }, + { + "epoch": 1.1912964641885766, + "grad_norm": 1.1380973769767317, + "learning_rate": 7.543135543118615e-05, + "loss": 0.9094, + "step": 2628 + }, + { + "epoch": 1.1917497733454216, + "grad_norm": 0.9299169596948971, + "learning_rate": 7.542547874328906e-05, + "loss": 0.8919, + "step": 2629 + }, + { + "epoch": 1.1922030825022665, + "grad_norm": 0.9406379871441944, + "learning_rate": 7.541959850742617e-05, + "loss": 0.9068, + "step": 2630 + }, + { + "epoch": 1.1926563916591115, + "grad_norm": 0.7344804244396188, + "learning_rate": 7.54137147241864e-05, + "loss": 0.9196, + "step": 2631 + }, + { + "epoch": 1.1931097008159566, + "grad_norm": 0.8589554218091947, + "learning_rate": 7.540782739415901e-05, + "loss": 0.8818, + "step": 2632 + }, + { + "epoch": 1.1935630099728014, + "grad_norm": 0.91060012356389, + "learning_rate": 7.540193651793364e-05, + "loss": 0.9094, + "step": 2633 + }, + { + "epoch": 1.1940163191296465, + "grad_norm": 0.8699935781114383, + "learning_rate": 7.53960420961003e-05, + "loss": 0.8926, + "step": 2634 + }, + { + "epoch": 1.1944696282864915, + "grad_norm": 0.9606345747838128, + "learning_rate": 7.53901441292493e-05, + "loss": 0.9056, + "step": 2635 + }, + { + "epoch": 1.1949229374433363, + "grad_norm": 1.357214886618566, + "learning_rate": 7.538424261797136e-05, + "loss": 0.9164, + "step": 2636 + }, + { + "epoch": 1.1953762466001814, + "grad_norm": 1.202870344900987, + "learning_rate": 7.537833756285753e-05, + "loss": 0.8896, + "step": 2637 + }, + { + "epoch": 1.1958295557570262, + "grad_norm": 0.8172164533873729, + "learning_rate": 7.537242896449923e-05, + "loss": 0.9156, + "step": 2638 + }, + { + "epoch": 1.1962828649138713, + "grad_norm": 0.737042114223655, + "learning_rate": 7.536651682348819e-05, + "loss": 0.9094, + "step": 2639 + }, + { + "epoch": 1.1967361740707163, + "grad_norm": 0.7037588329150696, + "learning_rate": 7.536060114041656e-05, + "loss": 0.8872, + "step": 2640 + }, + { + "epoch": 1.1971894832275611, + "grad_norm": 0.9709339373083604, + "learning_rate": 7.535468191587682e-05, + "loss": 0.9066, + "step": 2641 + }, + { + "epoch": 1.1976427923844062, + "grad_norm": 1.5548033675403758, + "learning_rate": 7.534875915046176e-05, + "loss": 0.9166, + "step": 2642 + }, + { + "epoch": 1.198096101541251, + "grad_norm": 0.8345502512260377, + "learning_rate": 7.534283284476459e-05, + "loss": 0.9066, + "step": 2643 + }, + { + "epoch": 1.198549410698096, + "grad_norm": 0.9577566290064596, + "learning_rate": 7.533690299937883e-05, + "loss": 0.9124, + "step": 2644 + }, + { + "epoch": 1.1990027198549411, + "grad_norm": 1.1195786940244128, + "learning_rate": 7.53309696148984e-05, + "loss": 0.9122, + "step": 2645 + }, + { + "epoch": 1.199456029011786, + "grad_norm": 1.0391190932498615, + "learning_rate": 7.53250326919175e-05, + "loss": 0.9174, + "step": 2646 + }, + { + "epoch": 1.199909338168631, + "grad_norm": 1.57935944606239, + "learning_rate": 7.531909223103078e-05, + "loss": 0.91, + "step": 2647 + }, + { + "epoch": 1.200362647325476, + "grad_norm": 0.9424544460333251, + "learning_rate": 7.531314823283317e-05, + "loss": 0.9006, + "step": 2648 + }, + { + "epoch": 1.2008159564823209, + "grad_norm": 1.1632747270687085, + "learning_rate": 7.530720069791997e-05, + "loss": 0.8896, + "step": 2649 + }, + { + "epoch": 1.201269265639166, + "grad_norm": 1.027014364193908, + "learning_rate": 7.530124962688686e-05, + "loss": 0.9154, + "step": 2650 + }, + { + "epoch": 1.201722574796011, + "grad_norm": 1.8653499734524381, + "learning_rate": 7.529529502032985e-05, + "loss": 0.8892, + "step": 2651 + }, + { + "epoch": 1.2021758839528558, + "grad_norm": 0.7738775264147147, + "learning_rate": 7.528933687884533e-05, + "loss": 0.9183, + "step": 2652 + }, + { + "epoch": 1.2026291931097008, + "grad_norm": 2.2101558736675804, + "learning_rate": 7.528337520302999e-05, + "loss": 0.8964, + "step": 2653 + }, + { + "epoch": 1.203082502266546, + "grad_norm": 1.1581750053482687, + "learning_rate": 7.527740999348093e-05, + "loss": 0.9197, + "step": 2654 + }, + { + "epoch": 1.2035358114233907, + "grad_norm": 2.564208346445842, + "learning_rate": 7.527144125079558e-05, + "loss": 0.9079, + "step": 2655 + }, + { + "epoch": 1.2039891205802358, + "grad_norm": 2.1193363294595513, + "learning_rate": 7.526546897557173e-05, + "loss": 0.9098, + "step": 2656 + }, + { + "epoch": 1.2044424297370806, + "grad_norm": 2.110680313267792, + "learning_rate": 7.525949316840753e-05, + "loss": 0.901, + "step": 2657 + }, + { + "epoch": 1.2048957388939256, + "grad_norm": 1.861241694086222, + "learning_rate": 7.525351382990144e-05, + "loss": 0.9169, + "step": 2658 + }, + { + "epoch": 1.2053490480507707, + "grad_norm": 2.043669767995286, + "learning_rate": 7.524753096065235e-05, + "loss": 0.902, + "step": 2659 + }, + { + "epoch": 1.2058023572076155, + "grad_norm": 1.758584558250955, + "learning_rate": 7.524154456125943e-05, + "loss": 0.9125, + "step": 2660 + }, + { + "epoch": 1.2062556663644606, + "grad_norm": 1.8708228663301656, + "learning_rate": 7.523555463232227e-05, + "loss": 0.9066, + "step": 2661 + }, + { + "epoch": 1.2067089755213056, + "grad_norm": 1.623507384699055, + "learning_rate": 7.522956117444073e-05, + "loss": 0.9078, + "step": 2662 + }, + { + "epoch": 1.2071622846781505, + "grad_norm": 1.7582875454261735, + "learning_rate": 7.522356418821512e-05, + "loss": 0.9104, + "step": 2663 + }, + { + "epoch": 1.2076155938349955, + "grad_norm": 1.5142523014521052, + "learning_rate": 7.521756367424603e-05, + "loss": 0.9141, + "step": 2664 + }, + { + "epoch": 1.2080689029918403, + "grad_norm": 1.7421184987618896, + "learning_rate": 7.521155963313444e-05, + "loss": 0.8948, + "step": 2665 + }, + { + "epoch": 1.2085222121486854, + "grad_norm": 1.3882994635428016, + "learning_rate": 7.520555206548166e-05, + "loss": 0.8936, + "step": 2666 + }, + { + "epoch": 1.2089755213055304, + "grad_norm": 1.3250084633631882, + "learning_rate": 7.519954097188939e-05, + "loss": 0.919, + "step": 2667 + }, + { + "epoch": 1.2094288304623753, + "grad_norm": 1.5596474234799707, + "learning_rate": 7.519352635295963e-05, + "loss": 0.9017, + "step": 2668 + }, + { + "epoch": 1.2098821396192203, + "grad_norm": 1.015325568934714, + "learning_rate": 7.518750820929477e-05, + "loss": 0.8914, + "step": 2669 + }, + { + "epoch": 1.2103354487760654, + "grad_norm": 1.4806305040229892, + "learning_rate": 7.518148654149756e-05, + "loss": 0.9034, + "step": 2670 + }, + { + "epoch": 1.2107887579329102, + "grad_norm": 1.4584846208882558, + "learning_rate": 7.517546135017106e-05, + "loss": 0.9031, + "step": 2671 + }, + { + "epoch": 1.2112420670897552, + "grad_norm": 0.9064330457912367, + "learning_rate": 7.516943263591873e-05, + "loss": 0.9017, + "step": 2672 + }, + { + "epoch": 1.2116953762466003, + "grad_norm": 2.1037956380914293, + "learning_rate": 7.516340039934438e-05, + "loss": 0.8673, + "step": 2673 + }, + { + "epoch": 1.2121486854034451, + "grad_norm": 1.3498910115644396, + "learning_rate": 7.515736464105212e-05, + "loss": 0.8995, + "step": 2674 + }, + { + "epoch": 1.2126019945602902, + "grad_norm": 2.6960629357798864, + "learning_rate": 7.515132536164646e-05, + "loss": 0.8998, + "step": 2675 + }, + { + "epoch": 1.213055303717135, + "grad_norm": 2.4719238049118837, + "learning_rate": 7.514528256173227e-05, + "loss": 0.9281, + "step": 2676 + }, + { + "epoch": 1.21350861287398, + "grad_norm": 1.7857512022848434, + "learning_rate": 7.513923624191474e-05, + "loss": 0.8952, + "step": 2677 + }, + { + "epoch": 1.213961922030825, + "grad_norm": 1.559938035256402, + "learning_rate": 7.513318640279943e-05, + "loss": 0.9039, + "step": 2678 + }, + { + "epoch": 1.21441523118767, + "grad_norm": 1.8757903349320926, + "learning_rate": 7.512713304499225e-05, + "loss": 0.914, + "step": 2679 + }, + { + "epoch": 1.214868540344515, + "grad_norm": 1.368938234929664, + "learning_rate": 7.512107616909944e-05, + "loss": 0.9113, + "step": 2680 + }, + { + "epoch": 1.21532184950136, + "grad_norm": 2.2012963769539606, + "learning_rate": 7.511501577572765e-05, + "loss": 0.902, + "step": 2681 + }, + { + "epoch": 1.2157751586582048, + "grad_norm": 1.8316947084964343, + "learning_rate": 7.510895186548383e-05, + "loss": 0.9144, + "step": 2682 + }, + { + "epoch": 1.21622846781505, + "grad_norm": 1.9830067262733106, + "learning_rate": 7.510288443897528e-05, + "loss": 0.8921, + "step": 2683 + }, + { + "epoch": 1.2166817769718947, + "grad_norm": 1.7665818930151558, + "learning_rate": 7.509681349680971e-05, + "loss": 0.9314, + "step": 2684 + }, + { + "epoch": 1.2171350861287398, + "grad_norm": 2.015599181884209, + "learning_rate": 7.50907390395951e-05, + "loss": 0.8986, + "step": 2685 + }, + { + "epoch": 1.2175883952855848, + "grad_norm": 1.7375462144822016, + "learning_rate": 7.508466106793987e-05, + "loss": 0.8991, + "step": 2686 + }, + { + "epoch": 1.2180417044424297, + "grad_norm": 1.945211335903417, + "learning_rate": 7.50785795824527e-05, + "loss": 0.9024, + "step": 2687 + }, + { + "epoch": 1.2184950135992747, + "grad_norm": 1.6347855504544733, + "learning_rate": 7.507249458374271e-05, + "loss": 0.9012, + "step": 2688 + }, + { + "epoch": 1.2189483227561198, + "grad_norm": 2.1300802502252267, + "learning_rate": 7.506640607241929e-05, + "loss": 0.9136, + "step": 2689 + }, + { + "epoch": 1.2194016319129646, + "grad_norm": 1.7909918714733484, + "learning_rate": 7.506031404909225e-05, + "loss": 0.8872, + "step": 2690 + }, + { + "epoch": 1.2198549410698096, + "grad_norm": 2.03219494262841, + "learning_rate": 7.505421851437172e-05, + "loss": 0.9305, + "step": 2691 + }, + { + "epoch": 1.2203082502266547, + "grad_norm": 1.7963836740328543, + "learning_rate": 7.504811946886819e-05, + "loss": 0.9068, + "step": 2692 + }, + { + "epoch": 1.2207615593834995, + "grad_norm": 1.9178597756301416, + "learning_rate": 7.504201691319248e-05, + "loss": 0.9008, + "step": 2693 + }, + { + "epoch": 1.2212148685403446, + "grad_norm": 1.742025602530271, + "learning_rate": 7.503591084795579e-05, + "loss": 0.909, + "step": 2694 + }, + { + "epoch": 1.2216681776971896, + "grad_norm": 1.8654612626709648, + "learning_rate": 7.502980127376967e-05, + "loss": 0.9031, + "step": 2695 + }, + { + "epoch": 1.2221214868540344, + "grad_norm": 1.597133069533321, + "learning_rate": 7.5023688191246e-05, + "loss": 0.885, + "step": 2696 + }, + { + "epoch": 1.2225747960108795, + "grad_norm": 2.1026772602110246, + "learning_rate": 7.501757160099702e-05, + "loss": 0.8845, + "step": 2697 + }, + { + "epoch": 1.2230281051677243, + "grad_norm": 1.8708771698199107, + "learning_rate": 7.501145150363533e-05, + "loss": 0.8904, + "step": 2698 + }, + { + "epoch": 1.2234814143245694, + "grad_norm": 1.8735471218277764, + "learning_rate": 7.500532789977387e-05, + "loss": 0.9136, + "step": 2699 + }, + { + "epoch": 1.2239347234814144, + "grad_norm": 1.809683873043487, + "learning_rate": 7.499920079002595e-05, + "loss": 0.9207, + "step": 2700 + }, + { + "epoch": 1.2243880326382592, + "grad_norm": 1.7742185709976002, + "learning_rate": 7.49930701750052e-05, + "loss": 0.9259, + "step": 2701 + }, + { + "epoch": 1.2248413417951043, + "grad_norm": 1.4659117821896706, + "learning_rate": 7.498693605532565e-05, + "loss": 0.9013, + "step": 2702 + }, + { + "epoch": 1.2252946509519491, + "grad_norm": 2.0729186372296056, + "learning_rate": 7.498079843160161e-05, + "loss": 0.9035, + "step": 2703 + }, + { + "epoch": 1.2257479601087942, + "grad_norm": 1.8879889127138745, + "learning_rate": 7.497465730444781e-05, + "loss": 0.9067, + "step": 2704 + }, + { + "epoch": 1.2262012692656392, + "grad_norm": 1.813488353166049, + "learning_rate": 7.496851267447929e-05, + "loss": 0.8882, + "step": 2705 + }, + { + "epoch": 1.226654578422484, + "grad_norm": 1.5838815716262524, + "learning_rate": 7.496236454231145e-05, + "loss": 0.9, + "step": 2706 + }, + { + "epoch": 1.227107887579329, + "grad_norm": 1.90146878547097, + "learning_rate": 7.495621290856006e-05, + "loss": 0.8831, + "step": 2707 + }, + { + "epoch": 1.2275611967361741, + "grad_norm": 1.6420619755310977, + "learning_rate": 7.49500577738412e-05, + "loss": 0.8989, + "step": 2708 + }, + { + "epoch": 1.228014505893019, + "grad_norm": 2.082441781138093, + "learning_rate": 7.494389913877135e-05, + "loss": 0.9048, + "step": 2709 + }, + { + "epoch": 1.228467815049864, + "grad_norm": 1.9026520482755052, + "learning_rate": 7.493773700396729e-05, + "loss": 0.8861, + "step": 2710 + }, + { + "epoch": 1.228921124206709, + "grad_norm": 1.5772012820829349, + "learning_rate": 7.49315713700462e-05, + "loss": 0.8958, + "step": 2711 + }, + { + "epoch": 1.229374433363554, + "grad_norm": 1.3780131485779892, + "learning_rate": 7.492540223762558e-05, + "loss": 0.9041, + "step": 2712 + }, + { + "epoch": 1.229827742520399, + "grad_norm": 2.1536790237191137, + "learning_rate": 7.491922960732327e-05, + "loss": 0.8976, + "step": 2713 + }, + { + "epoch": 1.230281051677244, + "grad_norm": 1.9009457784483477, + "learning_rate": 7.49130534797575e-05, + "loss": 0.9116, + "step": 2714 + }, + { + "epoch": 1.2307343608340888, + "grad_norm": 1.7520131747757777, + "learning_rate": 7.490687385554679e-05, + "loss": 0.9087, + "step": 2715 + }, + { + "epoch": 1.2311876699909339, + "grad_norm": 1.5823769286929505, + "learning_rate": 7.49006907353101e-05, + "loss": 0.8909, + "step": 2716 + }, + { + "epoch": 1.2316409791477787, + "grad_norm": 1.8905399898665851, + "learning_rate": 7.489450411966664e-05, + "loss": 0.903, + "step": 2717 + }, + { + "epoch": 1.2320942883046238, + "grad_norm": 1.6931906725993902, + "learning_rate": 7.488831400923606e-05, + "loss": 0.8816, + "step": 2718 + }, + { + "epoch": 1.2325475974614688, + "grad_norm": 1.9553616212446379, + "learning_rate": 7.488212040463829e-05, + "loss": 0.9182, + "step": 2719 + }, + { + "epoch": 1.2330009066183136, + "grad_norm": 1.7159734030777902, + "learning_rate": 7.487592330649364e-05, + "loss": 0.8935, + "step": 2720 + }, + { + "epoch": 1.2334542157751587, + "grad_norm": 1.8015132653417187, + "learning_rate": 7.486972271542279e-05, + "loss": 0.9022, + "step": 2721 + }, + { + "epoch": 1.2339075249320035, + "grad_norm": 1.7243408159729097, + "learning_rate": 7.486351863204671e-05, + "loss": 0.9232, + "step": 2722 + }, + { + "epoch": 1.2343608340888486, + "grad_norm": 1.8261865516342242, + "learning_rate": 7.485731105698679e-05, + "loss": 0.8873, + "step": 2723 + }, + { + "epoch": 1.2348141432456936, + "grad_norm": 1.5991734998721627, + "learning_rate": 7.485109999086471e-05, + "loss": 0.9246, + "step": 2724 + }, + { + "epoch": 1.2352674524025384, + "grad_norm": 1.8891654690467674, + "learning_rate": 7.484488543430256e-05, + "loss": 0.9029, + "step": 2725 + }, + { + "epoch": 1.2357207615593835, + "grad_norm": 1.683855970941086, + "learning_rate": 7.483866738792271e-05, + "loss": 0.8914, + "step": 2726 + }, + { + "epoch": 1.2361740707162285, + "grad_norm": 1.871828144972644, + "learning_rate": 7.483244585234794e-05, + "loss": 0.8963, + "step": 2727 + }, + { + "epoch": 1.2366273798730734, + "grad_norm": 1.6746149335558325, + "learning_rate": 7.482622082820135e-05, + "loss": 0.8954, + "step": 2728 + }, + { + "epoch": 1.2370806890299184, + "grad_norm": 1.7884878135942115, + "learning_rate": 7.481999231610638e-05, + "loss": 0.915, + "step": 2729 + }, + { + "epoch": 1.2375339981867635, + "grad_norm": 1.6936682925557023, + "learning_rate": 7.481376031668685e-05, + "loss": 0.9166, + "step": 2730 + }, + { + "epoch": 1.2379873073436083, + "grad_norm": 1.6727218200667755, + "learning_rate": 7.480752483056691e-05, + "loss": 0.8813, + "step": 2731 + }, + { + "epoch": 1.2384406165004533, + "grad_norm": 1.4037228894457867, + "learning_rate": 7.480128585837106e-05, + "loss": 0.8989, + "step": 2732 + }, + { + "epoch": 1.2388939256572984, + "grad_norm": 2.0601891950161106, + "learning_rate": 7.479504340072415e-05, + "loss": 0.9031, + "step": 2733 + }, + { + "epoch": 1.2393472348141432, + "grad_norm": 1.8747959624147512, + "learning_rate": 7.478879745825139e-05, + "loss": 0.8839, + "step": 2734 + }, + { + "epoch": 1.2398005439709883, + "grad_norm": 1.5659408410021274, + "learning_rate": 7.478254803157832e-05, + "loss": 0.9104, + "step": 2735 + }, + { + "epoch": 1.240253853127833, + "grad_norm": 1.3583009423552534, + "learning_rate": 7.477629512133083e-05, + "loss": 0.9003, + "step": 2736 + }, + { + "epoch": 1.2407071622846781, + "grad_norm": 2.008508425319694, + "learning_rate": 7.477003872813519e-05, + "loss": 0.8869, + "step": 2737 + }, + { + "epoch": 1.2411604714415232, + "grad_norm": 1.7722915531831243, + "learning_rate": 7.476377885261798e-05, + "loss": 0.932, + "step": 2738 + }, + { + "epoch": 1.241613780598368, + "grad_norm": 1.7252519321152524, + "learning_rate": 7.475751549540616e-05, + "loss": 0.8839, + "step": 2739 + }, + { + "epoch": 1.242067089755213, + "grad_norm": 1.5274243511320271, + "learning_rate": 7.4751248657127e-05, + "loss": 0.8876, + "step": 2740 + }, + { + "epoch": 1.242520398912058, + "grad_norm": 1.8420161707654008, + "learning_rate": 7.474497833840816e-05, + "loss": 0.9169, + "step": 2741 + }, + { + "epoch": 1.242973708068903, + "grad_norm": 1.617215497924602, + "learning_rate": 7.473870453987761e-05, + "loss": 0.9132, + "step": 2742 + }, + { + "epoch": 1.243427017225748, + "grad_norm": 1.6947892568151723, + "learning_rate": 7.473242726216372e-05, + "loss": 0.8973, + "step": 2743 + }, + { + "epoch": 1.2438803263825928, + "grad_norm": 1.4996261694741249, + "learning_rate": 7.472614650589517e-05, + "loss": 0.9113, + "step": 2744 + }, + { + "epoch": 1.2443336355394379, + "grad_norm": 1.9355126803283957, + "learning_rate": 7.471986227170098e-05, + "loss": 0.9103, + "step": 2745 + }, + { + "epoch": 1.244786944696283, + "grad_norm": 1.7400198163351488, + "learning_rate": 7.471357456021054e-05, + "loss": 0.9016, + "step": 2746 + }, + { + "epoch": 1.2452402538531278, + "grad_norm": 1.588577524635543, + "learning_rate": 7.47072833720536e-05, + "loss": 0.9188, + "step": 2747 + }, + { + "epoch": 1.2456935630099728, + "grad_norm": 1.3395797782395478, + "learning_rate": 7.470098870786021e-05, + "loss": 0.9071, + "step": 2748 + }, + { + "epoch": 1.2461468721668179, + "grad_norm": 2.0503788828560134, + "learning_rate": 7.469469056826082e-05, + "loss": 0.9158, + "step": 2749 + }, + { + "epoch": 1.2466001813236627, + "grad_norm": 1.8284025029872917, + "learning_rate": 7.468838895388622e-05, + "loss": 0.9097, + "step": 2750 + }, + { + "epoch": 1.2470534904805077, + "grad_norm": 1.5599825751058434, + "learning_rate": 7.46820838653675e-05, + "loss": 0.8955, + "step": 2751 + }, + { + "epoch": 1.2475067996373528, + "grad_norm": 1.398240013161401, + "learning_rate": 7.467577530333617e-05, + "loss": 0.9033, + "step": 2752 + }, + { + "epoch": 1.2479601087941976, + "grad_norm": 1.9367357360035298, + "learning_rate": 7.466946326842402e-05, + "loss": 0.9004, + "step": 2753 + }, + { + "epoch": 1.2484134179510427, + "grad_norm": 1.6601739193168126, + "learning_rate": 7.466314776126323e-05, + "loss": 0.8973, + "step": 2754 + }, + { + "epoch": 1.2488667271078875, + "grad_norm": 1.6498426152877235, + "learning_rate": 7.465682878248632e-05, + "loss": 0.8768, + "step": 2755 + }, + { + "epoch": 1.2493200362647325, + "grad_norm": 1.4524108849897486, + "learning_rate": 7.465050633272617e-05, + "loss": 0.8952, + "step": 2756 + }, + { + "epoch": 1.2497733454215776, + "grad_norm": 1.6907219219133094, + "learning_rate": 7.464418041261594e-05, + "loss": 0.906, + "step": 2757 + }, + { + "epoch": 1.2502266545784224, + "grad_norm": 1.3537745282104527, + "learning_rate": 7.463785102278925e-05, + "loss": 0.8807, + "step": 2758 + }, + { + "epoch": 1.2506799637352675, + "grad_norm": 1.9229887472129625, + "learning_rate": 7.463151816387998e-05, + "loss": 0.8928, + "step": 2759 + }, + { + "epoch": 1.2511332728921123, + "grad_norm": 1.7400181994496158, + "learning_rate": 7.462518183652239e-05, + "loss": 0.9143, + "step": 2760 + }, + { + "epoch": 1.2515865820489573, + "grad_norm": 1.46511799618278, + "learning_rate": 7.461884204135107e-05, + "loss": 0.9046, + "step": 2761 + }, + { + "epoch": 1.2520398912058024, + "grad_norm": 1.254218207830477, + "learning_rate": 7.461249877900097e-05, + "loss": 0.9005, + "step": 2762 + }, + { + "epoch": 1.2524932003626472, + "grad_norm": 1.7215257944744704, + "learning_rate": 7.46061520501074e-05, + "loss": 0.8947, + "step": 2763 + }, + { + "epoch": 1.2529465095194923, + "grad_norm": 1.2961843150936996, + "learning_rate": 7.4599801855306e-05, + "loss": 0.9059, + "step": 2764 + }, + { + "epoch": 1.2533998186763373, + "grad_norm": 1.875096489384654, + "learning_rate": 7.459344819523274e-05, + "loss": 0.9015, + "step": 2765 + }, + { + "epoch": 1.2538531278331821, + "grad_norm": 1.662992670855282, + "learning_rate": 7.4587091070524e-05, + "loss": 0.9174, + "step": 2766 + }, + { + "epoch": 1.2543064369900272, + "grad_norm": 1.4680303606350409, + "learning_rate": 7.458073048181643e-05, + "loss": 0.8962, + "step": 2767 + }, + { + "epoch": 1.2547597461468722, + "grad_norm": 1.2907941800360798, + "learning_rate": 7.457436642974707e-05, + "loss": 0.9126, + "step": 2768 + }, + { + "epoch": 1.255213055303717, + "grad_norm": 1.3213262720109202, + "learning_rate": 7.45679989149533e-05, + "loss": 0.9028, + "step": 2769 + }, + { + "epoch": 1.2556663644605621, + "grad_norm": 1.041684042127033, + "learning_rate": 7.456162793807284e-05, + "loss": 0.9265, + "step": 2770 + }, + { + "epoch": 1.2561196736174072, + "grad_norm": 1.3429189637957157, + "learning_rate": 7.455525349974377e-05, + "loss": 0.8939, + "step": 2771 + }, + { + "epoch": 1.256572982774252, + "grad_norm": 0.8713616023441197, + "learning_rate": 7.454887560060452e-05, + "loss": 0.9153, + "step": 2772 + }, + { + "epoch": 1.257026291931097, + "grad_norm": 1.594961744647598, + "learning_rate": 7.454249424129383e-05, + "loss": 0.8986, + "step": 2773 + }, + { + "epoch": 1.257479601087942, + "grad_norm": 1.2351292255340587, + "learning_rate": 7.453610942245082e-05, + "loss": 0.9138, + "step": 2774 + }, + { + "epoch": 1.257932910244787, + "grad_norm": 1.48091807514733, + "learning_rate": 7.452972114471495e-05, + "loss": 0.9075, + "step": 2775 + }, + { + "epoch": 1.258386219401632, + "grad_norm": 1.4994931745104434, + "learning_rate": 7.452332940872604e-05, + "loss": 0.9066, + "step": 2776 + }, + { + "epoch": 1.2588395285584768, + "grad_norm": 0.972044662773447, + "learning_rate": 7.451693421512421e-05, + "loss": 0.8761, + "step": 2777 + }, + { + "epoch": 1.2592928377153219, + "grad_norm": 1.2024799628999079, + "learning_rate": 7.451053556454999e-05, + "loss": 0.8998, + "step": 2778 + }, + { + "epoch": 1.2597461468721667, + "grad_norm": 1.0869982532299058, + "learning_rate": 7.450413345764419e-05, + "loss": 0.9263, + "step": 2779 + }, + { + "epoch": 1.2601994560290117, + "grad_norm": 0.9441442222229482, + "learning_rate": 7.449772789504803e-05, + "loss": 0.9256, + "step": 2780 + }, + { + "epoch": 1.2606527651858568, + "grad_norm": 0.9780116663416278, + "learning_rate": 7.449131887740304e-05, + "loss": 0.9187, + "step": 2781 + }, + { + "epoch": 1.2611060743427016, + "grad_norm": 1.0358536243900056, + "learning_rate": 7.448490640535109e-05, + "loss": 0.9036, + "step": 2782 + }, + { + "epoch": 1.2615593834995467, + "grad_norm": 0.9456460198915644, + "learning_rate": 7.44784904795344e-05, + "loss": 0.9135, + "step": 2783 + }, + { + "epoch": 1.2620126926563917, + "grad_norm": 0.8749309871844312, + "learning_rate": 7.447207110059559e-05, + "loss": 0.9035, + "step": 2784 + }, + { + "epoch": 1.2624660018132365, + "grad_norm": 0.9901683114187291, + "learning_rate": 7.446564826917753e-05, + "loss": 0.9215, + "step": 2785 + }, + { + "epoch": 1.2629193109700816, + "grad_norm": 1.1614513822593815, + "learning_rate": 7.44592219859235e-05, + "loss": 0.9022, + "step": 2786 + }, + { + "epoch": 1.2633726201269266, + "grad_norm": 0.649150738574164, + "learning_rate": 7.445279225147712e-05, + "loss": 0.908, + "step": 2787 + }, + { + "epoch": 1.2638259292837715, + "grad_norm": 0.9139957395998561, + "learning_rate": 7.444635906648234e-05, + "loss": 0.9053, + "step": 2788 + }, + { + "epoch": 1.2642792384406165, + "grad_norm": 1.0251810874947496, + "learning_rate": 7.443992243158347e-05, + "loss": 0.9058, + "step": 2789 + }, + { + "epoch": 1.2647325475974616, + "grad_norm": 0.7458403985488183, + "learning_rate": 7.443348234742513e-05, + "loss": 0.9169, + "step": 2790 + }, + { + "epoch": 1.2651858567543064, + "grad_norm": 0.8833017746077881, + "learning_rate": 7.442703881465235e-05, + "loss": 0.9184, + "step": 2791 + }, + { + "epoch": 1.2656391659111514, + "grad_norm": 0.9033171015286059, + "learning_rate": 7.442059183391046e-05, + "loss": 0.894, + "step": 2792 + }, + { + "epoch": 1.2660924750679965, + "grad_norm": 0.7257536767082956, + "learning_rate": 7.441414140584513e-05, + "loss": 0.8793, + "step": 2793 + }, + { + "epoch": 1.2665457842248413, + "grad_norm": 0.6961662635470214, + "learning_rate": 7.44076875311024e-05, + "loss": 0.9084, + "step": 2794 + }, + { + "epoch": 1.2669990933816864, + "grad_norm": 0.8392778304244884, + "learning_rate": 7.440123021032863e-05, + "loss": 0.8897, + "step": 2795 + }, + { + "epoch": 1.2674524025385312, + "grad_norm": 0.8951828745794115, + "learning_rate": 7.439476944417056e-05, + "loss": 0.92, + "step": 2796 + }, + { + "epoch": 1.2679057116953762, + "grad_norm": 0.9013212835822012, + "learning_rate": 7.438830523327525e-05, + "loss": 0.8934, + "step": 2797 + }, + { + "epoch": 1.2683590208522213, + "grad_norm": 0.888067211175819, + "learning_rate": 7.43818375782901e-05, + "loss": 0.8829, + "step": 2798 + }, + { + "epoch": 1.2688123300090661, + "grad_norm": 1.0110713461352794, + "learning_rate": 7.437536647986287e-05, + "loss": 0.9138, + "step": 2799 + }, + { + "epoch": 1.2692656391659112, + "grad_norm": 1.3409204134814257, + "learning_rate": 7.436889193864166e-05, + "loss": 0.8924, + "step": 2800 + }, + { + "epoch": 1.269718948322756, + "grad_norm": 0.4980900470681856, + "learning_rate": 7.436241395527492e-05, + "loss": 0.9266, + "step": 2801 + }, + { + "epoch": 1.270172257479601, + "grad_norm": 0.8777564080683881, + "learning_rate": 7.435593253041143e-05, + "loss": 0.8841, + "step": 2802 + }, + { + "epoch": 1.270625566636446, + "grad_norm": 1.37943870671828, + "learning_rate": 7.434944766470032e-05, + "loss": 0.9053, + "step": 2803 + }, + { + "epoch": 1.271078875793291, + "grad_norm": 0.7554932691753187, + "learning_rate": 7.434295935879107e-05, + "loss": 0.8878, + "step": 2804 + }, + { + "epoch": 1.271532184950136, + "grad_norm": 0.7929751180988278, + "learning_rate": 7.433646761333353e-05, + "loss": 0.9014, + "step": 2805 + }, + { + "epoch": 1.271985494106981, + "grad_norm": 0.7004902441471335, + "learning_rate": 7.432997242897782e-05, + "loss": 0.8936, + "step": 2806 + }, + { + "epoch": 1.2724388032638259, + "grad_norm": 0.7810761362172799, + "learning_rate": 7.43234738063745e-05, + "loss": 0.9112, + "step": 2807 + }, + { + "epoch": 1.272892112420671, + "grad_norm": 1.0758208005212655, + "learning_rate": 7.43169717461744e-05, + "loss": 0.9044, + "step": 2808 + }, + { + "epoch": 1.273345421577516, + "grad_norm": 0.9937596121398983, + "learning_rate": 7.431046624902872e-05, + "loss": 0.8982, + "step": 2809 + }, + { + "epoch": 1.2737987307343608, + "grad_norm": 0.9949584909821253, + "learning_rate": 7.430395731558901e-05, + "loss": 0.909, + "step": 2810 + }, + { + "epoch": 1.2742520398912058, + "grad_norm": 0.9974708349463909, + "learning_rate": 7.429744494650715e-05, + "loss": 0.9124, + "step": 2811 + }, + { + "epoch": 1.2747053490480509, + "grad_norm": 1.0294334639365597, + "learning_rate": 7.429092914243538e-05, + "loss": 0.9126, + "step": 2812 + }, + { + "epoch": 1.2751586582048957, + "grad_norm": 0.9801839918991749, + "learning_rate": 7.428440990402629e-05, + "loss": 0.9091, + "step": 2813 + }, + { + "epoch": 1.2756119673617408, + "grad_norm": 0.7966394929115791, + "learning_rate": 7.427788723193277e-05, + "loss": 0.8904, + "step": 2814 + }, + { + "epoch": 1.2760652765185858, + "grad_norm": 0.6092948883307407, + "learning_rate": 7.427136112680813e-05, + "loss": 0.8933, + "step": 2815 + }, + { + "epoch": 1.2765185856754306, + "grad_norm": 0.49800574782716495, + "learning_rate": 7.426483158930593e-05, + "loss": 0.9117, + "step": 2816 + }, + { + "epoch": 1.2769718948322757, + "grad_norm": 0.46278273407850185, + "learning_rate": 7.425829862008014e-05, + "loss": 0.8868, + "step": 2817 + }, + { + "epoch": 1.2774252039891205, + "grad_norm": 0.48655073469382504, + "learning_rate": 7.425176221978507e-05, + "loss": 0.8857, + "step": 2818 + }, + { + "epoch": 1.2778785131459656, + "grad_norm": 0.7482665169370137, + "learning_rate": 7.424522238907534e-05, + "loss": 0.9163, + "step": 2819 + }, + { + "epoch": 1.2783318223028104, + "grad_norm": 1.0582179142978834, + "learning_rate": 7.423867912860596e-05, + "loss": 0.9086, + "step": 2820 + }, + { + "epoch": 1.2787851314596554, + "grad_norm": 1.086794828597155, + "learning_rate": 7.423213243903222e-05, + "loss": 0.9103, + "step": 2821 + }, + { + "epoch": 1.2792384406165005, + "grad_norm": 0.7942115464105454, + "learning_rate": 7.422558232100983e-05, + "loss": 0.8933, + "step": 2822 + }, + { + "epoch": 1.2796917497733453, + "grad_norm": 0.985008591443496, + "learning_rate": 7.421902877519477e-05, + "loss": 0.9285, + "step": 2823 + }, + { + "epoch": 1.2801450589301904, + "grad_norm": 0.5046315051217325, + "learning_rate": 7.42124718022434e-05, + "loss": 0.928, + "step": 2824 + }, + { + "epoch": 1.2805983680870354, + "grad_norm": 0.5535682697353814, + "learning_rate": 7.420591140281246e-05, + "loss": 0.8883, + "step": 2825 + }, + { + "epoch": 1.2810516772438802, + "grad_norm": 0.9130320151061047, + "learning_rate": 7.419934757755895e-05, + "loss": 0.899, + "step": 2826 + }, + { + "epoch": 1.2815049864007253, + "grad_norm": 1.1893950467824734, + "learning_rate": 7.419278032714026e-05, + "loss": 0.929, + "step": 2827 + }, + { + "epoch": 1.2819582955575703, + "grad_norm": 1.0227651199238244, + "learning_rate": 7.418620965221414e-05, + "loss": 0.9048, + "step": 2828 + }, + { + "epoch": 1.2824116047144152, + "grad_norm": 0.85244263384307, + "learning_rate": 7.417963555343865e-05, + "loss": 0.9128, + "step": 2829 + }, + { + "epoch": 1.2828649138712602, + "grad_norm": 0.7706451942033589, + "learning_rate": 7.417305803147222e-05, + "loss": 0.9087, + "step": 2830 + }, + { + "epoch": 1.2833182230281053, + "grad_norm": 0.7580065360100893, + "learning_rate": 7.416647708697357e-05, + "loss": 0.9055, + "step": 2831 + }, + { + "epoch": 1.28377153218495, + "grad_norm": 0.7352430694402456, + "learning_rate": 7.415989272060183e-05, + "loss": 0.889, + "step": 2832 + }, + { + "epoch": 1.2842248413417952, + "grad_norm": 0.5719524114791386, + "learning_rate": 7.415330493301645e-05, + "loss": 0.8995, + "step": 2833 + }, + { + "epoch": 1.2846781504986402, + "grad_norm": 0.5182972581525928, + "learning_rate": 7.414671372487722e-05, + "loss": 0.907, + "step": 2834 + }, + { + "epoch": 1.285131459655485, + "grad_norm": 1.273065220770592, + "learning_rate": 7.414011909684424e-05, + "loss": 0.8926, + "step": 2835 + }, + { + "epoch": 1.28558476881233, + "grad_norm": 0.5046523177249785, + "learning_rate": 7.4133521049578e-05, + "loss": 0.8979, + "step": 2836 + }, + { + "epoch": 1.286038077969175, + "grad_norm": 0.6296165716369754, + "learning_rate": 7.412691958373929e-05, + "loss": 0.9351, + "step": 2837 + }, + { + "epoch": 1.28649138712602, + "grad_norm": 0.6308121397511393, + "learning_rate": 7.41203146999893e-05, + "loss": 0.9162, + "step": 2838 + }, + { + "epoch": 1.2869446962828648, + "grad_norm": 0.7474328349549887, + "learning_rate": 7.411370639898951e-05, + "loss": 0.8819, + "step": 2839 + }, + { + "epoch": 1.2873980054397098, + "grad_norm": 0.9419680915471272, + "learning_rate": 7.410709468140178e-05, + "loss": 0.9161, + "step": 2840 + }, + { + "epoch": 1.2878513145965549, + "grad_norm": 0.9999596514780268, + "learning_rate": 7.410047954788828e-05, + "loss": 0.9009, + "step": 2841 + }, + { + "epoch": 1.2883046237533997, + "grad_norm": 1.1302892670679667, + "learning_rate": 7.409386099911152e-05, + "loss": 0.9063, + "step": 2842 + }, + { + "epoch": 1.2887579329102448, + "grad_norm": 0.9700571083557177, + "learning_rate": 7.408723903573439e-05, + "loss": 0.9186, + "step": 2843 + }, + { + "epoch": 1.2892112420670898, + "grad_norm": 1.002958079331834, + "learning_rate": 7.40806136584201e-05, + "loss": 0.9039, + "step": 2844 + }, + { + "epoch": 1.2896645512239346, + "grad_norm": 1.0920836187039127, + "learning_rate": 7.407398486783218e-05, + "loss": 0.9074, + "step": 2845 + }, + { + "epoch": 1.2901178603807797, + "grad_norm": 0.9394651351415685, + "learning_rate": 7.406735266463452e-05, + "loss": 0.8932, + "step": 2846 + }, + { + "epoch": 1.2905711695376247, + "grad_norm": 0.8839424163854838, + "learning_rate": 7.406071704949139e-05, + "loss": 0.9, + "step": 2847 + }, + { + "epoch": 1.2910244786944696, + "grad_norm": 1.091342857410111, + "learning_rate": 7.405407802306733e-05, + "loss": 0.9039, + "step": 2848 + }, + { + "epoch": 1.2914777878513146, + "grad_norm": 1.0000072591616014, + "learning_rate": 7.404743558602728e-05, + "loss": 0.8706, + "step": 2849 + }, + { + "epoch": 1.2919310970081597, + "grad_norm": 0.9343521101546638, + "learning_rate": 7.40407897390365e-05, + "loss": 0.8788, + "step": 2850 + }, + { + "epoch": 1.2923844061650045, + "grad_norm": 0.8964729551799999, + "learning_rate": 7.403414048276056e-05, + "loss": 0.891, + "step": 2851 + }, + { + "epoch": 1.2928377153218495, + "grad_norm": 0.8127383621882387, + "learning_rate": 7.402748781786545e-05, + "loss": 0.9078, + "step": 2852 + }, + { + "epoch": 1.2932910244786946, + "grad_norm": 0.7593207431366779, + "learning_rate": 7.402083174501742e-05, + "loss": 0.9076, + "step": 2853 + }, + { + "epoch": 1.2937443336355394, + "grad_norm": 0.782742940584997, + "learning_rate": 7.40141722648831e-05, + "loss": 0.8877, + "step": 2854 + }, + { + "epoch": 1.2941976427923845, + "grad_norm": 0.7358200835686886, + "learning_rate": 7.400750937812948e-05, + "loss": 0.9092, + "step": 2855 + }, + { + "epoch": 1.2946509519492293, + "grad_norm": 0.7007752785379677, + "learning_rate": 7.400084308542383e-05, + "loss": 0.9081, + "step": 2856 + }, + { + "epoch": 1.2951042611060744, + "grad_norm": 0.658513761499961, + "learning_rate": 7.399417338743382e-05, + "loss": 0.9165, + "step": 2857 + }, + { + "epoch": 1.2955575702629192, + "grad_norm": 0.7572329964912522, + "learning_rate": 7.398750028482745e-05, + "loss": 0.8997, + "step": 2858 + }, + { + "epoch": 1.2960108794197642, + "grad_norm": 0.9642529612663617, + "learning_rate": 7.398082377827303e-05, + "loss": 0.8911, + "step": 2859 + }, + { + "epoch": 1.2964641885766093, + "grad_norm": 1.2778413155741908, + "learning_rate": 7.397414386843925e-05, + "loss": 0.9197, + "step": 2860 + }, + { + "epoch": 1.296917497733454, + "grad_norm": 0.660699099373726, + "learning_rate": 7.39674605559951e-05, + "loss": 0.9008, + "step": 2861 + }, + { + "epoch": 1.2973708068902992, + "grad_norm": 0.5226072065553381, + "learning_rate": 7.396077384160996e-05, + "loss": 0.8971, + "step": 2862 + }, + { + "epoch": 1.2978241160471442, + "grad_norm": 0.8441193802928766, + "learning_rate": 7.39540837259535e-05, + "loss": 0.8973, + "step": 2863 + }, + { + "epoch": 1.298277425203989, + "grad_norm": 1.3174905516901871, + "learning_rate": 7.394739020969579e-05, + "loss": 0.9117, + "step": 2864 + }, + { + "epoch": 1.298730734360834, + "grad_norm": 0.7690746071873821, + "learning_rate": 7.394069329350714e-05, + "loss": 0.8953, + "step": 2865 + }, + { + "epoch": 1.2991840435176791, + "grad_norm": 0.6780358081780408, + "learning_rate": 7.393399297805834e-05, + "loss": 0.9433, + "step": 2866 + }, + { + "epoch": 1.299637352674524, + "grad_norm": 0.8345198250897263, + "learning_rate": 7.39272892640204e-05, + "loss": 0.9045, + "step": 2867 + }, + { + "epoch": 1.300090661831369, + "grad_norm": 1.0435416767681633, + "learning_rate": 7.392058215206474e-05, + "loss": 0.8984, + "step": 2868 + }, + { + "epoch": 1.300543970988214, + "grad_norm": 0.9190976974141719, + "learning_rate": 7.391387164286307e-05, + "loss": 0.9018, + "step": 2869 + }, + { + "epoch": 1.3009972801450589, + "grad_norm": 0.9193921432049867, + "learning_rate": 7.390715773708749e-05, + "loss": 0.8909, + "step": 2870 + }, + { + "epoch": 1.301450589301904, + "grad_norm": 1.2534185142125913, + "learning_rate": 7.390044043541041e-05, + "loss": 0.9154, + "step": 2871 + }, + { + "epoch": 1.301903898458749, + "grad_norm": 0.8798656777462052, + "learning_rate": 7.38937197385046e-05, + "loss": 0.8975, + "step": 2872 + }, + { + "epoch": 1.3023572076155938, + "grad_norm": 0.7407851546296778, + "learning_rate": 7.388699564704313e-05, + "loss": 0.8941, + "step": 2873 + }, + { + "epoch": 1.3028105167724389, + "grad_norm": 0.6009156886899882, + "learning_rate": 7.388026816169947e-05, + "loss": 0.9057, + "step": 2874 + }, + { + "epoch": 1.3032638259292837, + "grad_norm": 0.7959769362547559, + "learning_rate": 7.387353728314738e-05, + "loss": 0.8883, + "step": 2875 + }, + { + "epoch": 1.3037171350861287, + "grad_norm": 1.205873502867238, + "learning_rate": 7.386680301206097e-05, + "loss": 0.8913, + "step": 2876 + }, + { + "epoch": 1.3041704442429736, + "grad_norm": 0.9440603717321702, + "learning_rate": 7.386006534911471e-05, + "loss": 0.898, + "step": 2877 + }, + { + "epoch": 1.3046237533998186, + "grad_norm": 0.8674253529802938, + "learning_rate": 7.385332429498339e-05, + "loss": 0.8946, + "step": 2878 + }, + { + "epoch": 1.3050770625566637, + "grad_norm": 0.8661327326303528, + "learning_rate": 7.384657985034215e-05, + "loss": 0.8944, + "step": 2879 + }, + { + "epoch": 1.3055303717135085, + "grad_norm": 0.8814598280571068, + "learning_rate": 7.383983201586645e-05, + "loss": 0.9042, + "step": 2880 + }, + { + "epoch": 1.3059836808703535, + "grad_norm": 1.0659377707934474, + "learning_rate": 7.383308079223215e-05, + "loss": 0.92, + "step": 2881 + }, + { + "epoch": 1.3064369900271986, + "grad_norm": 1.0347338328608489, + "learning_rate": 7.382632618011536e-05, + "loss": 0.9106, + "step": 2882 + }, + { + "epoch": 1.3068902991840434, + "grad_norm": 0.8681158264095176, + "learning_rate": 7.381956818019258e-05, + "loss": 0.9102, + "step": 2883 + }, + { + "epoch": 1.3073436083408885, + "grad_norm": 0.8611081800663157, + "learning_rate": 7.381280679314067e-05, + "loss": 0.9033, + "step": 2884 + }, + { + "epoch": 1.3077969174977335, + "grad_norm": 1.0055356132593627, + "learning_rate": 7.380604201963677e-05, + "loss": 0.9098, + "step": 2885 + }, + { + "epoch": 1.3082502266545784, + "grad_norm": 1.001293690927422, + "learning_rate": 7.379927386035841e-05, + "loss": 0.9017, + "step": 2886 + }, + { + "epoch": 1.3087035358114234, + "grad_norm": 1.1414233429542675, + "learning_rate": 7.379250231598346e-05, + "loss": 0.8971, + "step": 2887 + }, + { + "epoch": 1.3091568449682685, + "grad_norm": 0.9209685151217206, + "learning_rate": 7.378572738719007e-05, + "loss": 0.8903, + "step": 2888 + }, + { + "epoch": 1.3096101541251133, + "grad_norm": 0.7673543994027892, + "learning_rate": 7.377894907465679e-05, + "loss": 0.8913, + "step": 2889 + }, + { + "epoch": 1.3100634632819583, + "grad_norm": 0.6353892089702549, + "learning_rate": 7.377216737906247e-05, + "loss": 0.9096, + "step": 2890 + }, + { + "epoch": 1.3105167724388034, + "grad_norm": 0.6513162745162514, + "learning_rate": 7.376538230108636e-05, + "loss": 0.8955, + "step": 2891 + }, + { + "epoch": 1.3109700815956482, + "grad_norm": 0.6326693372597099, + "learning_rate": 7.375859384140797e-05, + "loss": 0.8794, + "step": 2892 + }, + { + "epoch": 1.3114233907524933, + "grad_norm": 0.617010622185089, + "learning_rate": 7.375180200070718e-05, + "loss": 0.9218, + "step": 2893 + }, + { + "epoch": 1.3118766999093383, + "grad_norm": 0.580381014100176, + "learning_rate": 7.374500677966424e-05, + "loss": 0.8859, + "step": 2894 + }, + { + "epoch": 1.3123300090661831, + "grad_norm": 0.4847734638933181, + "learning_rate": 7.373820817895968e-05, + "loss": 0.8932, + "step": 2895 + }, + { + "epoch": 1.3127833182230282, + "grad_norm": 0.42468212059722316, + "learning_rate": 7.373140619927444e-05, + "loss": 0.9063, + "step": 2896 + }, + { + "epoch": 1.313236627379873, + "grad_norm": 0.5587875451031711, + "learning_rate": 7.372460084128971e-05, + "loss": 0.9085, + "step": 2897 + }, + { + "epoch": 1.313689936536718, + "grad_norm": 0.5939978833879807, + "learning_rate": 7.37177921056871e-05, + "loss": 0.8956, + "step": 2898 + }, + { + "epoch": 1.314143245693563, + "grad_norm": 0.5739417007163263, + "learning_rate": 7.371097999314852e-05, + "loss": 0.9078, + "step": 2899 + }, + { + "epoch": 1.314596554850408, + "grad_norm": 0.5506685952125578, + "learning_rate": 7.370416450435622e-05, + "loss": 0.9044, + "step": 2900 + }, + { + "epoch": 1.315049864007253, + "grad_norm": 0.645396556450811, + "learning_rate": 7.369734563999278e-05, + "loss": 0.8976, + "step": 2901 + }, + { + "epoch": 1.3155031731640978, + "grad_norm": 0.8290957116952254, + "learning_rate": 7.369052340074116e-05, + "loss": 0.8871, + "step": 2902 + }, + { + "epoch": 1.3159564823209429, + "grad_norm": 0.9759915683814087, + "learning_rate": 7.368369778728458e-05, + "loss": 0.9126, + "step": 2903 + }, + { + "epoch": 1.316409791477788, + "grad_norm": 1.0707187667451525, + "learning_rate": 7.367686880030668e-05, + "loss": 0.8999, + "step": 2904 + }, + { + "epoch": 1.3168631006346327, + "grad_norm": 0.8788747545805095, + "learning_rate": 7.36700364404914e-05, + "loss": 0.8895, + "step": 2905 + }, + { + "epoch": 1.3173164097914778, + "grad_norm": 0.756677476564175, + "learning_rate": 7.366320070852302e-05, + "loss": 0.9217, + "step": 2906 + }, + { + "epoch": 1.3177697189483228, + "grad_norm": 0.779676925501923, + "learning_rate": 7.365636160508615e-05, + "loss": 0.9156, + "step": 2907 + }, + { + "epoch": 1.3182230281051677, + "grad_norm": 1.0372016233599939, + "learning_rate": 7.364951913086575e-05, + "loss": 0.9073, + "step": 2908 + }, + { + "epoch": 1.3186763372620127, + "grad_norm": 1.3512113967219046, + "learning_rate": 7.364267328654712e-05, + "loss": 0.9087, + "step": 2909 + }, + { + "epoch": 1.3191296464188578, + "grad_norm": 0.7633052012659666, + "learning_rate": 7.363582407281588e-05, + "loss": 0.8896, + "step": 2910 + }, + { + "epoch": 1.3195829555757026, + "grad_norm": 0.6735579205731137, + "learning_rate": 7.362897149035802e-05, + "loss": 0.8872, + "step": 2911 + }, + { + "epoch": 1.3200362647325476, + "grad_norm": 0.7329080712441788, + "learning_rate": 7.36221155398598e-05, + "loss": 0.9163, + "step": 2912 + }, + { + "epoch": 1.3204895738893927, + "grad_norm": 1.0357556201658245, + "learning_rate": 7.361525622200792e-05, + "loss": 0.8956, + "step": 2913 + }, + { + "epoch": 1.3209428830462375, + "grad_norm": 1.333439730341162, + "learning_rate": 7.360839353748933e-05, + "loss": 0.8958, + "step": 2914 + }, + { + "epoch": 1.3213961922030826, + "grad_norm": 0.6937496447373178, + "learning_rate": 7.360152748699136e-05, + "loss": 0.9092, + "step": 2915 + }, + { + "epoch": 1.3218495013599274, + "grad_norm": 0.9403622123520983, + "learning_rate": 7.359465807120166e-05, + "loss": 0.8837, + "step": 2916 + }, + { + "epoch": 1.3223028105167725, + "grad_norm": 1.2413947723350593, + "learning_rate": 7.358778529080821e-05, + "loss": 0.8888, + "step": 2917 + }, + { + "epoch": 1.3227561196736173, + "grad_norm": 0.9948171359956905, + "learning_rate": 7.358090914649935e-05, + "loss": 0.9021, + "step": 2918 + }, + { + "epoch": 1.3232094288304623, + "grad_norm": 1.4461414384242657, + "learning_rate": 7.357402963896375e-05, + "loss": 0.8918, + "step": 2919 + }, + { + "epoch": 1.3236627379873074, + "grad_norm": 0.5657998991292276, + "learning_rate": 7.356714676889041e-05, + "loss": 0.8926, + "step": 2920 + }, + { + "epoch": 1.3241160471441522, + "grad_norm": 1.22923428244796, + "learning_rate": 7.356026053696867e-05, + "loss": 0.914, + "step": 2921 + }, + { + "epoch": 1.3245693563009973, + "grad_norm": 1.1951682932728906, + "learning_rate": 7.355337094388821e-05, + "loss": 0.9115, + "step": 2922 + }, + { + "epoch": 1.3250226654578423, + "grad_norm": 1.0103246487817912, + "learning_rate": 7.354647799033903e-05, + "loss": 0.9163, + "step": 2923 + }, + { + "epoch": 1.3254759746146871, + "grad_norm": 0.9627538832481481, + "learning_rate": 7.35395816770115e-05, + "loss": 0.8995, + "step": 2924 + }, + { + "epoch": 1.3259292837715322, + "grad_norm": 0.6121644218383729, + "learning_rate": 7.353268200459628e-05, + "loss": 0.8902, + "step": 2925 + }, + { + "epoch": 1.3263825929283772, + "grad_norm": 0.9171020581164868, + "learning_rate": 7.352577897378441e-05, + "loss": 0.8763, + "step": 2926 + }, + { + "epoch": 1.326835902085222, + "grad_norm": 1.0994945969434344, + "learning_rate": 7.351887258526725e-05, + "loss": 0.9279, + "step": 2927 + }, + { + "epoch": 1.3272892112420671, + "grad_norm": 1.0649405331579191, + "learning_rate": 7.351196283973648e-05, + "loss": 0.9058, + "step": 2928 + }, + { + "epoch": 1.3277425203989122, + "grad_norm": 1.3283440589525823, + "learning_rate": 7.350504973788416e-05, + "loss": 0.8955, + "step": 2929 + }, + { + "epoch": 1.328195829555757, + "grad_norm": 0.7402710184124817, + "learning_rate": 7.349813328040263e-05, + "loss": 0.9027, + "step": 2930 + }, + { + "epoch": 1.328649138712602, + "grad_norm": 1.1567175009256359, + "learning_rate": 7.349121346798461e-05, + "loss": 0.9392, + "step": 2931 + }, + { + "epoch": 1.329102447869447, + "grad_norm": 0.9860586132818991, + "learning_rate": 7.348429030132312e-05, + "loss": 0.899, + "step": 2932 + }, + { + "epoch": 1.329555757026292, + "grad_norm": 1.2578597978669448, + "learning_rate": 7.347736378111155e-05, + "loss": 0.8848, + "step": 2933 + }, + { + "epoch": 1.330009066183137, + "grad_norm": 0.7984297582789955, + "learning_rate": 7.347043390804361e-05, + "loss": 0.9268, + "step": 2934 + }, + { + "epoch": 1.3304623753399818, + "grad_norm": 0.9027271067210628, + "learning_rate": 7.346350068281335e-05, + "loss": 0.8966, + "step": 2935 + }, + { + "epoch": 1.3309156844968268, + "grad_norm": 0.839167514927094, + "learning_rate": 7.345656410611515e-05, + "loss": 0.9119, + "step": 2936 + }, + { + "epoch": 1.3313689936536717, + "grad_norm": 0.8865241614370813, + "learning_rate": 7.344962417864372e-05, + "loss": 0.9142, + "step": 2937 + }, + { + "epoch": 1.3318223028105167, + "grad_norm": 1.0955811283782844, + "learning_rate": 7.344268090109414e-05, + "loss": 0.9087, + "step": 2938 + }, + { + "epoch": 1.3322756119673618, + "grad_norm": 1.004664735230567, + "learning_rate": 7.343573427416175e-05, + "loss": 0.9139, + "step": 2939 + }, + { + "epoch": 1.3327289211242066, + "grad_norm": 1.153070526870687, + "learning_rate": 7.342878429854233e-05, + "loss": 0.8863, + "step": 2940 + }, + { + "epoch": 1.3331822302810517, + "grad_norm": 0.901121427913594, + "learning_rate": 7.342183097493192e-05, + "loss": 0.8898, + "step": 2941 + }, + { + "epoch": 1.3336355394378967, + "grad_norm": 0.9130704665676795, + "learning_rate": 7.34148743040269e-05, + "loss": 0.9146, + "step": 2942 + }, + { + "epoch": 1.3340888485947415, + "grad_norm": 0.7458073600295562, + "learning_rate": 7.340791428652401e-05, + "loss": 0.9141, + "step": 2943 + }, + { + "epoch": 1.3345421577515866, + "grad_norm": 0.798978924186664, + "learning_rate": 7.340095092312034e-05, + "loss": 0.9141, + "step": 2944 + }, + { + "epoch": 1.3349954669084316, + "grad_norm": 0.8397225009178588, + "learning_rate": 7.339398421451325e-05, + "loss": 0.9066, + "step": 2945 + }, + { + "epoch": 1.3354487760652765, + "grad_norm": 0.7399518594223755, + "learning_rate": 7.338701416140051e-05, + "loss": 0.9093, + "step": 2946 + }, + { + "epoch": 1.3359020852221215, + "grad_norm": 0.963820662402483, + "learning_rate": 7.338004076448018e-05, + "loss": 0.9105, + "step": 2947 + }, + { + "epoch": 1.3363553943789666, + "grad_norm": 1.2464560978595205, + "learning_rate": 7.337306402445066e-05, + "loss": 0.9172, + "step": 2948 + }, + { + "epoch": 1.3368087035358114, + "grad_norm": 0.8981420268898725, + "learning_rate": 7.33660839420107e-05, + "loss": 0.919, + "step": 2949 + }, + { + "epoch": 1.3372620126926564, + "grad_norm": 0.8485164708576425, + "learning_rate": 7.335910051785938e-05, + "loss": 0.8889, + "step": 2950 + }, + { + "epoch": 1.3377153218495015, + "grad_norm": 0.7660953604999216, + "learning_rate": 7.335211375269609e-05, + "loss": 0.9051, + "step": 2951 + }, + { + "epoch": 1.3381686310063463, + "grad_norm": 0.8093275121607648, + "learning_rate": 7.334512364722059e-05, + "loss": 0.9165, + "step": 2952 + }, + { + "epoch": 1.3386219401631914, + "grad_norm": 0.7854584158925971, + "learning_rate": 7.333813020213295e-05, + "loss": 0.8754, + "step": 2953 + }, + { + "epoch": 1.3390752493200362, + "grad_norm": 0.7794041815683792, + "learning_rate": 7.333113341813362e-05, + "loss": 0.8855, + "step": 2954 + }, + { + "epoch": 1.3395285584768812, + "grad_norm": 0.8892802675021511, + "learning_rate": 7.33241332959233e-05, + "loss": 0.907, + "step": 2955 + }, + { + "epoch": 1.339981867633726, + "grad_norm": 1.0864284786286493, + "learning_rate": 7.331712983620308e-05, + "loss": 0.9032, + "step": 2956 + }, + { + "epoch": 1.3404351767905711, + "grad_norm": 1.1793224887564477, + "learning_rate": 7.33101230396744e-05, + "loss": 0.902, + "step": 2957 + }, + { + "epoch": 1.3408884859474162, + "grad_norm": 0.8142491240260086, + "learning_rate": 7.330311290703901e-05, + "loss": 0.8855, + "step": 2958 + }, + { + "epoch": 1.341341795104261, + "grad_norm": 0.8129179619106234, + "learning_rate": 7.329609943899897e-05, + "loss": 0.8926, + "step": 2959 + }, + { + "epoch": 1.341795104261106, + "grad_norm": 0.848393386834815, + "learning_rate": 7.328908263625673e-05, + "loss": 0.8936, + "step": 2960 + }, + { + "epoch": 1.342248413417951, + "grad_norm": 1.0764301336449005, + "learning_rate": 7.328206249951502e-05, + "loss": 0.9089, + "step": 2961 + }, + { + "epoch": 1.342701722574796, + "grad_norm": 1.1766561040214507, + "learning_rate": 7.327503902947695e-05, + "loss": 0.8967, + "step": 2962 + }, + { + "epoch": 1.343155031731641, + "grad_norm": 0.7618794012748528, + "learning_rate": 7.326801222684591e-05, + "loss": 0.9031, + "step": 2963 + }, + { + "epoch": 1.343608340888486, + "grad_norm": 0.5932487548761755, + "learning_rate": 7.326098209232568e-05, + "loss": 0.8976, + "step": 2964 + }, + { + "epoch": 1.3440616500453308, + "grad_norm": 0.5494601822471082, + "learning_rate": 7.325394862662034e-05, + "loss": 0.8969, + "step": 2965 + }, + { + "epoch": 1.344514959202176, + "grad_norm": 0.536326972056143, + "learning_rate": 7.324691183043432e-05, + "loss": 0.8897, + "step": 2966 + }, + { + "epoch": 1.344968268359021, + "grad_norm": 0.5173814225468397, + "learning_rate": 7.323987170447237e-05, + "loss": 0.8844, + "step": 2967 + }, + { + "epoch": 1.3454215775158658, + "grad_norm": 0.6087731608555739, + "learning_rate": 7.323282824943957e-05, + "loss": 0.9057, + "step": 2968 + }, + { + "epoch": 1.3458748866727108, + "grad_norm": 0.4815626399967318, + "learning_rate": 7.322578146604135e-05, + "loss": 0.9042, + "step": 2969 + }, + { + "epoch": 1.3463281958295559, + "grad_norm": 0.6092501507444624, + "learning_rate": 7.321873135498348e-05, + "loss": 0.8892, + "step": 2970 + }, + { + "epoch": 1.3467815049864007, + "grad_norm": 0.8397458605222651, + "learning_rate": 7.321167791697203e-05, + "loss": 0.9106, + "step": 2971 + }, + { + "epoch": 1.3472348141432458, + "grad_norm": 1.185258525730492, + "learning_rate": 7.320462115271342e-05, + "loss": 0.9077, + "step": 2972 + }, + { + "epoch": 1.3476881233000906, + "grad_norm": 1.187085139929021, + "learning_rate": 7.319756106291443e-05, + "loss": 0.907, + "step": 2973 + }, + { + "epoch": 1.3481414324569356, + "grad_norm": 0.8844382191121565, + "learning_rate": 7.319049764828213e-05, + "loss": 0.9025, + "step": 2974 + }, + { + "epoch": 1.3485947416137807, + "grad_norm": 0.7542142510620641, + "learning_rate": 7.318343090952393e-05, + "loss": 0.9232, + "step": 2975 + }, + { + "epoch": 1.3490480507706255, + "grad_norm": 0.6821706312483432, + "learning_rate": 7.317636084734763e-05, + "loss": 0.9271, + "step": 2976 + }, + { + "epoch": 1.3495013599274706, + "grad_norm": 0.7835920755661802, + "learning_rate": 7.316928746246125e-05, + "loss": 0.895, + "step": 2977 + }, + { + "epoch": 1.3499546690843154, + "grad_norm": 0.8861972417474347, + "learning_rate": 7.316221075557327e-05, + "loss": 0.8902, + "step": 2978 + }, + { + "epoch": 1.3504079782411604, + "grad_norm": 0.8355793320663961, + "learning_rate": 7.315513072739243e-05, + "loss": 0.8933, + "step": 2979 + }, + { + "epoch": 1.3508612873980055, + "grad_norm": 0.7278685855416024, + "learning_rate": 7.31480473786278e-05, + "loss": 0.8946, + "step": 2980 + }, + { + "epoch": 1.3513145965548503, + "grad_norm": 0.9288308013285924, + "learning_rate": 7.31409607099888e-05, + "loss": 0.9114, + "step": 2981 + }, + { + "epoch": 1.3517679057116954, + "grad_norm": 1.065770517019232, + "learning_rate": 7.313387072218518e-05, + "loss": 0.9091, + "step": 2982 + }, + { + "epoch": 1.3522212148685404, + "grad_norm": 0.8387982656427078, + "learning_rate": 7.312677741592703e-05, + "loss": 0.8929, + "step": 2983 + }, + { + "epoch": 1.3526745240253852, + "grad_norm": 0.8368787365083383, + "learning_rate": 7.311968079192478e-05, + "loss": 0.9099, + "step": 2984 + }, + { + "epoch": 1.3531278331822303, + "grad_norm": 0.9964965018896457, + "learning_rate": 7.311258085088914e-05, + "loss": 0.9064, + "step": 2985 + }, + { + "epoch": 1.3535811423390753, + "grad_norm": 1.2223946955128178, + "learning_rate": 7.310547759353122e-05, + "loss": 0.9076, + "step": 2986 + }, + { + "epoch": 1.3540344514959202, + "grad_norm": 0.8855812923490253, + "learning_rate": 7.309837102056243e-05, + "loss": 0.9131, + "step": 2987 + }, + { + "epoch": 1.3544877606527652, + "grad_norm": 0.917047765129397, + "learning_rate": 7.309126113269451e-05, + "loss": 0.9149, + "step": 2988 + }, + { + "epoch": 1.3549410698096103, + "grad_norm": 2.255240478673682, + "learning_rate": 7.308414793063952e-05, + "loss": 0.9353, + "step": 2989 + }, + { + "epoch": 1.355394378966455, + "grad_norm": 1.0842003962060545, + "learning_rate": 7.307703141510989e-05, + "loss": 0.9013, + "step": 2990 + }, + { + "epoch": 1.3558476881233001, + "grad_norm": 2.6023703629260986, + "learning_rate": 7.306991158681835e-05, + "loss": 0.923, + "step": 2991 + }, + { + "epoch": 1.3563009972801452, + "grad_norm": 2.139635632024924, + "learning_rate": 7.306278844647797e-05, + "loss": 0.9273, + "step": 2992 + }, + { + "epoch": 1.35675430643699, + "grad_norm": 1.837525496490951, + "learning_rate": 7.305566199480217e-05, + "loss": 0.9002, + "step": 2993 + }, + { + "epoch": 1.357207615593835, + "grad_norm": 1.896821893120221, + "learning_rate": 7.304853223250467e-05, + "loss": 0.8979, + "step": 2994 + }, + { + "epoch": 1.35766092475068, + "grad_norm": 1.811058819131917, + "learning_rate": 7.304139916029953e-05, + "loss": 0.8987, + "step": 2995 + }, + { + "epoch": 1.358114233907525, + "grad_norm": 1.3692618250991744, + "learning_rate": 7.303426277890116e-05, + "loss": 0.8989, + "step": 2996 + }, + { + "epoch": 1.3585675430643698, + "grad_norm": 2.068713287103102, + "learning_rate": 7.302712308902429e-05, + "loss": 0.8993, + "step": 2997 + }, + { + "epoch": 1.3590208522212148, + "grad_norm": 1.7359472751304181, + "learning_rate": 7.301998009138397e-05, + "loss": 0.906, + "step": 2998 + }, + { + "epoch": 1.3594741613780599, + "grad_norm": 1.9592198148083495, + "learning_rate": 7.301283378669562e-05, + "loss": 0.9265, + "step": 2999 + }, + { + "epoch": 1.3599274705349047, + "grad_norm": 1.701003887798315, + "learning_rate": 7.300568417567492e-05, + "loss": 0.9113, + "step": 3000 + }, + { + "epoch": 1.3603807796917498, + "grad_norm": 1.800670562434586, + "learning_rate": 7.299853125903796e-05, + "loss": 0.8912, + "step": 3001 + }, + { + "epoch": 1.3608340888485948, + "grad_norm": 1.4854558266072382, + "learning_rate": 7.299137503750112e-05, + "loss": 0.9052, + "step": 3002 + }, + { + "epoch": 1.3612873980054396, + "grad_norm": 2.1192651319607165, + "learning_rate": 7.298421551178109e-05, + "loss": 0.9345, + "step": 3003 + }, + { + "epoch": 1.3617407071622847, + "grad_norm": 1.6734310262770855, + "learning_rate": 7.297705268259496e-05, + "loss": 0.8981, + "step": 3004 + }, + { + "epoch": 1.3621940163191297, + "grad_norm": 1.9809457987659935, + "learning_rate": 7.296988655066006e-05, + "loss": 0.9149, + "step": 3005 + }, + { + "epoch": 1.3626473254759746, + "grad_norm": 5.009701722759714, + "learning_rate": 7.296271711669415e-05, + "loss": 0.9164, + "step": 3006 + }, + { + "epoch": 1.3631006346328196, + "grad_norm": 0.9918078485361573, + "learning_rate": 7.295554438141523e-05, + "loss": 0.9126, + "step": 3007 + }, + { + "epoch": 1.3635539437896647, + "grad_norm": 2.1188861746390812, + "learning_rate": 7.29483683455417e-05, + "loss": 0.9252, + "step": 3008 + }, + { + "epoch": 1.3640072529465095, + "grad_norm": 7.429519310704249, + "learning_rate": 7.294118900979224e-05, + "loss": 0.9119, + "step": 3009 + }, + { + "epoch": 1.3644605621033545, + "grad_norm": 1.5880366888211854, + "learning_rate": 7.293400637488588e-05, + "loss": 0.9284, + "step": 3010 + }, + { + "epoch": 1.3649138712601996, + "grad_norm": 1.2605915092832833, + "learning_rate": 7.292682044154199e-05, + "loss": 0.9289, + "step": 3011 + }, + { + "epoch": 1.3653671804170444, + "grad_norm": 1.2080984076144865, + "learning_rate": 7.291963121048027e-05, + "loss": 0.9167, + "step": 3012 + }, + { + "epoch": 1.3658204895738895, + "grad_norm": 1.2252550985622144, + "learning_rate": 7.291243868242073e-05, + "loss": 0.914, + "step": 3013 + }, + { + "epoch": 1.3662737987307343, + "grad_norm": 1.1351607849720324, + "learning_rate": 7.290524285808375e-05, + "loss": 0.9068, + "step": 3014 + }, + { + "epoch": 1.3667271078875793, + "grad_norm": 1.0918357622737496, + "learning_rate": 7.289804373818996e-05, + "loss": 0.9169, + "step": 3015 + }, + { + "epoch": 1.3671804170444242, + "grad_norm": 1.1556314475864793, + "learning_rate": 7.289084132346042e-05, + "loss": 0.9109, + "step": 3016 + }, + { + "epoch": 1.3676337262012692, + "grad_norm": 1.0106086988278864, + "learning_rate": 7.288363561461645e-05, + "loss": 0.9139, + "step": 3017 + }, + { + "epoch": 1.3680870353581143, + "grad_norm": 1.4768104474611372, + "learning_rate": 7.287642661237974e-05, + "loss": 0.8834, + "step": 3018 + }, + { + "epoch": 1.368540344514959, + "grad_norm": 0.8897538142167286, + "learning_rate": 7.286921431747228e-05, + "loss": 0.9021, + "step": 3019 + }, + { + "epoch": 1.3689936536718041, + "grad_norm": 1.7082642205206588, + "learning_rate": 7.286199873061639e-05, + "loss": 0.8941, + "step": 3020 + }, + { + "epoch": 1.3694469628286492, + "grad_norm": 1.262854132244431, + "learning_rate": 7.285477985253477e-05, + "loss": 0.9188, + "step": 3021 + }, + { + "epoch": 1.369900271985494, + "grad_norm": 1.6166268863048596, + "learning_rate": 7.284755768395037e-05, + "loss": 0.9024, + "step": 3022 + }, + { + "epoch": 1.370353581142339, + "grad_norm": 1.3664269984022732, + "learning_rate": 7.284033222558656e-05, + "loss": 0.9153, + "step": 3023 + }, + { + "epoch": 1.3708068902991841, + "grad_norm": 1.4970548845893055, + "learning_rate": 7.283310347816694e-05, + "loss": 0.9045, + "step": 3024 + }, + { + "epoch": 1.371260199456029, + "grad_norm": 1.2347493884849379, + "learning_rate": 7.28258714424155e-05, + "loss": 0.906, + "step": 3025 + }, + { + "epoch": 1.371713508612874, + "grad_norm": 1.4258743208394589, + "learning_rate": 7.281863611905659e-05, + "loss": 0.9321, + "step": 3026 + }, + { + "epoch": 1.372166817769719, + "grad_norm": 1.066822734220586, + "learning_rate": 7.28113975088148e-05, + "loss": 0.9109, + "step": 3027 + }, + { + "epoch": 1.3726201269265639, + "grad_norm": 1.4779377802135993, + "learning_rate": 7.280415561241513e-05, + "loss": 0.908, + "step": 3028 + }, + { + "epoch": 1.373073436083409, + "grad_norm": 1.1846982231679368, + "learning_rate": 7.279691043058285e-05, + "loss": 0.9114, + "step": 3029 + }, + { + "epoch": 1.373526745240254, + "grad_norm": 1.419631479888708, + "learning_rate": 7.278966196404361e-05, + "loss": 0.9037, + "step": 3030 + }, + { + "epoch": 1.3739800543970988, + "grad_norm": 1.2187336108531548, + "learning_rate": 7.278241021352337e-05, + "loss": 0.9173, + "step": 3031 + }, + { + "epoch": 1.3744333635539439, + "grad_norm": 1.299540639552505, + "learning_rate": 7.277515517974839e-05, + "loss": 0.8787, + "step": 3032 + }, + { + "epoch": 1.3748866727107887, + "grad_norm": 1.121264866388586, + "learning_rate": 7.276789686344529e-05, + "loss": 0.9306, + "step": 3033 + }, + { + "epoch": 1.3753399818676337, + "grad_norm": 1.354883664535278, + "learning_rate": 7.276063526534102e-05, + "loss": 0.9106, + "step": 3034 + }, + { + "epoch": 1.3757932910244786, + "grad_norm": 1.046042925375835, + "learning_rate": 7.275337038616285e-05, + "loss": 0.9224, + "step": 3035 + }, + { + "epoch": 1.3762466001813236, + "grad_norm": 1.334429141434855, + "learning_rate": 7.274610222663838e-05, + "loss": 0.9073, + "step": 3036 + }, + { + "epoch": 1.3766999093381687, + "grad_norm": 1.0528046496888825, + "learning_rate": 7.273883078749551e-05, + "loss": 0.9061, + "step": 3037 + }, + { + "epoch": 1.3771532184950135, + "grad_norm": 1.2729576661418813, + "learning_rate": 7.273155606946255e-05, + "loss": 0.9031, + "step": 3038 + }, + { + "epoch": 1.3776065276518585, + "grad_norm": 0.9961293793758205, + "learning_rate": 7.272427807326803e-05, + "loss": 0.9119, + "step": 3039 + }, + { + "epoch": 1.3780598368087036, + "grad_norm": 1.3631861061418162, + "learning_rate": 7.271699679964089e-05, + "loss": 0.9295, + "step": 3040 + }, + { + "epoch": 1.3785131459655484, + "grad_norm": 1.1341420000256996, + "learning_rate": 7.270971224931038e-05, + "loss": 0.8984, + "step": 3041 + }, + { + "epoch": 1.3789664551223935, + "grad_norm": 1.2793303006413912, + "learning_rate": 7.270242442300604e-05, + "loss": 0.9097, + "step": 3042 + }, + { + "epoch": 1.3794197642792385, + "grad_norm": 1.4908571683886942, + "learning_rate": 7.26951333214578e-05, + "loss": 0.8988, + "step": 3043 + }, + { + "epoch": 1.3798730734360833, + "grad_norm": 0.8021902904038337, + "learning_rate": 7.268783894539585e-05, + "loss": 0.913, + "step": 3044 + }, + { + "epoch": 1.3803263825929284, + "grad_norm": 1.0067133964596853, + "learning_rate": 7.268054129555078e-05, + "loss": 0.8995, + "step": 3045 + }, + { + "epoch": 1.3807796917497734, + "grad_norm": 1.0130872285722223, + "learning_rate": 7.267324037265344e-05, + "loss": 0.926, + "step": 3046 + }, + { + "epoch": 1.3812330009066183, + "grad_norm": 0.9092400842347191, + "learning_rate": 7.266593617743505e-05, + "loss": 0.8961, + "step": 3047 + }, + { + "epoch": 1.3816863100634633, + "grad_norm": 0.8511664865933317, + "learning_rate": 7.265862871062716e-05, + "loss": 0.8954, + "step": 3048 + }, + { + "epoch": 1.3821396192203084, + "grad_norm": 1.0251253404998417, + "learning_rate": 7.265131797296162e-05, + "loss": 0.9224, + "step": 3049 + }, + { + "epoch": 1.3825929283771532, + "grad_norm": 0.8387771611555325, + "learning_rate": 7.264400396517062e-05, + "loss": 0.89, + "step": 3050 + }, + { + "epoch": 1.3830462375339982, + "grad_norm": 0.5994310370173597, + "learning_rate": 7.263668668798669e-05, + "loss": 0.9237, + "step": 3051 + }, + { + "epoch": 1.383499546690843, + "grad_norm": 0.7536322555118232, + "learning_rate": 7.262936614214266e-05, + "loss": 0.9034, + "step": 3052 + }, + { + "epoch": 1.3839528558476881, + "grad_norm": 0.8672364962983329, + "learning_rate": 7.262204232837173e-05, + "loss": 0.8999, + "step": 3053 + }, + { + "epoch": 1.3844061650045332, + "grad_norm": 0.7078909294873459, + "learning_rate": 7.261471524740737e-05, + "loss": 0.9009, + "step": 3054 + }, + { + "epoch": 1.384859474161378, + "grad_norm": 0.6624521564241039, + "learning_rate": 7.260738489998343e-05, + "loss": 0.9068, + "step": 3055 + }, + { + "epoch": 1.385312783318223, + "grad_norm": 0.6736424167268686, + "learning_rate": 7.260005128683408e-05, + "loss": 0.8997, + "step": 3056 + }, + { + "epoch": 1.3857660924750679, + "grad_norm": 0.6567537795734767, + "learning_rate": 7.259271440869377e-05, + "loss": 0.8763, + "step": 3057 + }, + { + "epoch": 1.386219401631913, + "grad_norm": 0.6200385071358802, + "learning_rate": 7.258537426629731e-05, + "loss": 0.933, + "step": 3058 + }, + { + "epoch": 1.386672710788758, + "grad_norm": 0.5923664102131942, + "learning_rate": 7.257803086037987e-05, + "loss": 0.8953, + "step": 3059 + }, + { + "epoch": 1.3871260199456028, + "grad_norm": 0.6081346986493755, + "learning_rate": 7.257068419167688e-05, + "loss": 0.8961, + "step": 3060 + }, + { + "epoch": 1.3875793291024479, + "grad_norm": 0.7177473237668153, + "learning_rate": 7.256333426092415e-05, + "loss": 0.9119, + "step": 3061 + }, + { + "epoch": 1.388032638259293, + "grad_norm": 0.7299519832244515, + "learning_rate": 7.25559810688578e-05, + "loss": 0.8844, + "step": 3062 + }, + { + "epoch": 1.3884859474161377, + "grad_norm": 0.7917084571358092, + "learning_rate": 7.254862461621426e-05, + "loss": 0.889, + "step": 3063 + }, + { + "epoch": 1.3889392565729828, + "grad_norm": 0.8998875446893657, + "learning_rate": 7.254126490373031e-05, + "loss": 0.8883, + "step": 3064 + }, + { + "epoch": 1.3893925657298278, + "grad_norm": 1.0313069440224822, + "learning_rate": 7.253390193214303e-05, + "loss": 0.8929, + "step": 3065 + }, + { + "epoch": 1.3898458748866727, + "grad_norm": 1.0859954300644439, + "learning_rate": 7.252653570218986e-05, + "loss": 0.8955, + "step": 3066 + }, + { + "epoch": 1.3902991840435177, + "grad_norm": 1.0342982364540796, + "learning_rate": 7.251916621460855e-05, + "loss": 0.9122, + "step": 3067 + }, + { + "epoch": 1.3907524932003628, + "grad_norm": 0.8757931360518354, + "learning_rate": 7.251179347013717e-05, + "loss": 0.8959, + "step": 3068 + }, + { + "epoch": 1.3912058023572076, + "grad_norm": 0.7133227107166102, + "learning_rate": 7.250441746951413e-05, + "loss": 0.9133, + "step": 3069 + }, + { + "epoch": 1.3916591115140526, + "grad_norm": 0.6248329769759586, + "learning_rate": 7.249703821347815e-05, + "loss": 0.8801, + "step": 3070 + }, + { + "epoch": 1.3921124206708977, + "grad_norm": 0.5536182371402131, + "learning_rate": 7.248965570276828e-05, + "loss": 0.9045, + "step": 3071 + }, + { + "epoch": 1.3925657298277425, + "grad_norm": 0.5692936482609597, + "learning_rate": 7.24822699381239e-05, + "loss": 0.9088, + "step": 3072 + }, + { + "epoch": 1.3930190389845876, + "grad_norm": 0.9167642309742859, + "learning_rate": 7.247488092028473e-05, + "loss": 0.9179, + "step": 3073 + }, + { + "epoch": 1.3934723481414324, + "grad_norm": 0.5224619633356817, + "learning_rate": 7.24674886499908e-05, + "loss": 0.8899, + "step": 3074 + }, + { + "epoch": 1.3939256572982774, + "grad_norm": 0.633510493815836, + "learning_rate": 7.246009312798245e-05, + "loss": 0.9197, + "step": 3075 + }, + { + "epoch": 1.3943789664551223, + "grad_norm": 0.8054704476527327, + "learning_rate": 7.24526943550004e-05, + "loss": 0.9035, + "step": 3076 + }, + { + "epoch": 1.3948322756119673, + "grad_norm": 1.0098177871806395, + "learning_rate": 7.24452923317856e-05, + "loss": 0.8709, + "step": 3077 + }, + { + "epoch": 1.3952855847688124, + "grad_norm": 1.2511633827478135, + "learning_rate": 7.243788705907943e-05, + "loss": 0.8926, + "step": 3078 + }, + { + "epoch": 1.3957388939256572, + "grad_norm": 0.7971036572323551, + "learning_rate": 7.243047853762355e-05, + "loss": 0.9211, + "step": 3079 + }, + { + "epoch": 1.3961922030825022, + "grad_norm": 0.5537317967039815, + "learning_rate": 7.242306676815991e-05, + "loss": 0.9102, + "step": 3080 + }, + { + "epoch": 1.3966455122393473, + "grad_norm": 0.5269724859747938, + "learning_rate": 7.241565175143086e-05, + "loss": 0.89, + "step": 3081 + }, + { + "epoch": 1.3970988213961921, + "grad_norm": 0.5996892176168334, + "learning_rate": 7.240823348817902e-05, + "loss": 0.9148, + "step": 3082 + }, + { + "epoch": 1.3975521305530372, + "grad_norm": 0.8733090915462135, + "learning_rate": 7.240081197914734e-05, + "loss": 0.9068, + "step": 3083 + }, + { + "epoch": 1.3980054397098822, + "grad_norm": 1.2711716456994768, + "learning_rate": 7.239338722507913e-05, + "loss": 0.9031, + "step": 3084 + }, + { + "epoch": 1.398458748866727, + "grad_norm": 0.8335461743170226, + "learning_rate": 7.238595922671797e-05, + "loss": 0.9027, + "step": 3085 + }, + { + "epoch": 1.398912058023572, + "grad_norm": 0.7675453544376697, + "learning_rate": 7.237852798480783e-05, + "loss": 0.8887, + "step": 3086 + }, + { + "epoch": 1.3993653671804172, + "grad_norm": 0.6585875448478352, + "learning_rate": 7.237109350009294e-05, + "loss": 0.9003, + "step": 3087 + }, + { + "epoch": 1.399818676337262, + "grad_norm": 0.4869195192126909, + "learning_rate": 7.236365577331792e-05, + "loss": 0.8909, + "step": 3088 + }, + { + "epoch": 1.400271985494107, + "grad_norm": 0.6857896700898414, + "learning_rate": 7.235621480522764e-05, + "loss": 0.9192, + "step": 3089 + }, + { + "epoch": 1.400725294650952, + "grad_norm": 0.6425198597134059, + "learning_rate": 7.234877059656736e-05, + "loss": 0.8732, + "step": 3090 + }, + { + "epoch": 1.401178603807797, + "grad_norm": 0.7328659367472543, + "learning_rate": 7.234132314808262e-05, + "loss": 0.9254, + "step": 3091 + }, + { + "epoch": 1.401631912964642, + "grad_norm": 1.0270530081731022, + "learning_rate": 7.233387246051933e-05, + "loss": 0.8932, + "step": 3092 + }, + { + "epoch": 1.4020852221214868, + "grad_norm": 1.0668215116337025, + "learning_rate": 7.232641853462369e-05, + "loss": 0.8771, + "step": 3093 + }, + { + "epoch": 1.4025385312783318, + "grad_norm": 1.0170912868070527, + "learning_rate": 7.231896137114222e-05, + "loss": 0.8894, + "step": 3094 + }, + { + "epoch": 1.4029918404351767, + "grad_norm": 1.1977937632125089, + "learning_rate": 7.231150097082178e-05, + "loss": 0.8919, + "step": 3095 + }, + { + "epoch": 1.4034451495920217, + "grad_norm": 0.8483567895237624, + "learning_rate": 7.230403733440958e-05, + "loss": 0.9268, + "step": 3096 + }, + { + "epoch": 1.4038984587488668, + "grad_norm": 0.7031943523012035, + "learning_rate": 7.229657046265308e-05, + "loss": 0.9045, + "step": 3097 + }, + { + "epoch": 1.4043517679057116, + "grad_norm": 0.6536657887081309, + "learning_rate": 7.228910035630013e-05, + "loss": 0.8866, + "step": 3098 + }, + { + "epoch": 1.4048050770625566, + "grad_norm": 0.5330166822446128, + "learning_rate": 7.22816270160989e-05, + "loss": 0.9069, + "step": 3099 + }, + { + "epoch": 1.4052583862194017, + "grad_norm": 0.4971050452777292, + "learning_rate": 7.227415044279783e-05, + "loss": 0.8982, + "step": 3100 + }, + { + "epoch": 1.4057116953762465, + "grad_norm": 0.5106615065069346, + "learning_rate": 7.226667063714574e-05, + "loss": 0.9105, + "step": 3101 + }, + { + "epoch": 1.4061650045330916, + "grad_norm": 0.47603904527679813, + "learning_rate": 7.225918759989178e-05, + "loss": 0.8948, + "step": 3102 + }, + { + "epoch": 1.4066183136899366, + "grad_norm": 0.5543118502077504, + "learning_rate": 7.225170133178535e-05, + "loss": 0.9104, + "step": 3103 + }, + { + "epoch": 1.4070716228467814, + "grad_norm": 0.5874613956526613, + "learning_rate": 7.224421183357626e-05, + "loss": 0.9136, + "step": 3104 + }, + { + "epoch": 1.4075249320036265, + "grad_norm": 0.5599790011592388, + "learning_rate": 7.22367191060146e-05, + "loss": 0.8745, + "step": 3105 + }, + { + "epoch": 1.4079782411604715, + "grad_norm": 0.5947127328039707, + "learning_rate": 7.222922314985076e-05, + "loss": 0.9092, + "step": 3106 + }, + { + "epoch": 1.4084315503173164, + "grad_norm": 0.7564252044594217, + "learning_rate": 7.222172396583551e-05, + "loss": 0.9003, + "step": 3107 + }, + { + "epoch": 1.4088848594741614, + "grad_norm": 1.0036329399774147, + "learning_rate": 7.22142215547199e-05, + "loss": 0.8833, + "step": 3108 + }, + { + "epoch": 1.4093381686310065, + "grad_norm": 1.4602130310624974, + "learning_rate": 7.220671591725532e-05, + "loss": 0.9, + "step": 3109 + }, + { + "epoch": 1.4097914777878513, + "grad_norm": 0.5295598037168637, + "learning_rate": 7.21992070541935e-05, + "loss": 0.9137, + "step": 3110 + }, + { + "epoch": 1.4102447869446963, + "grad_norm": 0.8704238297532954, + "learning_rate": 7.219169496628645e-05, + "loss": 0.8934, + "step": 3111 + }, + { + "epoch": 1.4106980961015412, + "grad_norm": 1.6719874280270053, + "learning_rate": 7.218417965428655e-05, + "loss": 0.9198, + "step": 3112 + }, + { + "epoch": 1.4111514052583862, + "grad_norm": 0.5680983947951569, + "learning_rate": 7.217666111894649e-05, + "loss": 0.8993, + "step": 3113 + }, + { + "epoch": 1.411604714415231, + "grad_norm": 1.6069087014303025, + "learning_rate": 7.216913936101922e-05, + "loss": 0.9354, + "step": 3114 + }, + { + "epoch": 1.412058023572076, + "grad_norm": 0.9298052138242832, + "learning_rate": 7.216161438125811e-05, + "loss": 0.8977, + "step": 3115 + }, + { + "epoch": 1.4125113327289212, + "grad_norm": 1.2467383760289377, + "learning_rate": 7.21540861804168e-05, + "loss": 0.8917, + "step": 3116 + }, + { + "epoch": 1.412964641885766, + "grad_norm": 0.8141786047503973, + "learning_rate": 7.214655475924926e-05, + "loss": 0.8894, + "step": 3117 + }, + { + "epoch": 1.413417951042611, + "grad_norm": 1.5091750770171173, + "learning_rate": 7.213902011850979e-05, + "loss": 0.9013, + "step": 3118 + }, + { + "epoch": 1.413871260199456, + "grad_norm": 0.6256539720341481, + "learning_rate": 7.213148225895298e-05, + "loss": 0.8975, + "step": 3119 + }, + { + "epoch": 1.414324569356301, + "grad_norm": 1.4809233233811945, + "learning_rate": 7.212394118133381e-05, + "loss": 0.9097, + "step": 3120 + }, + { + "epoch": 1.414777878513146, + "grad_norm": 0.840386050832741, + "learning_rate": 7.211639688640753e-05, + "loss": 0.9015, + "step": 3121 + }, + { + "epoch": 1.415231187669991, + "grad_norm": 1.2263624245932374, + "learning_rate": 7.210884937492968e-05, + "loss": 0.8951, + "step": 3122 + }, + { + "epoch": 1.4156844968268358, + "grad_norm": 1.0726380072006712, + "learning_rate": 7.210129864765621e-05, + "loss": 0.908, + "step": 3123 + }, + { + "epoch": 1.4161378059836809, + "grad_norm": 1.0639714835415737, + "learning_rate": 7.209374470534335e-05, + "loss": 0.8914, + "step": 3124 + }, + { + "epoch": 1.416591115140526, + "grad_norm": 0.9118421369173038, + "learning_rate": 7.208618754874762e-05, + "loss": 0.9198, + "step": 3125 + }, + { + "epoch": 1.4170444242973708, + "grad_norm": 1.0432771521492963, + "learning_rate": 7.207862717862591e-05, + "loss": 0.9144, + "step": 3126 + }, + { + "epoch": 1.4174977334542158, + "grad_norm": 0.8854055089944401, + "learning_rate": 7.207106359573543e-05, + "loss": 0.9022, + "step": 3127 + }, + { + "epoch": 1.4179510426110609, + "grad_norm": 1.129746629666448, + "learning_rate": 7.206349680083366e-05, + "loss": 0.9237, + "step": 3128 + }, + { + "epoch": 1.4184043517679057, + "grad_norm": 0.8110764572595213, + "learning_rate": 7.205592679467845e-05, + "loss": 0.9138, + "step": 3129 + }, + { + "epoch": 1.4188576609247507, + "grad_norm": 0.913628356727519, + "learning_rate": 7.204835357802797e-05, + "loss": 0.9049, + "step": 3130 + }, + { + "epoch": 1.4193109700815956, + "grad_norm": 0.8587956730111521, + "learning_rate": 7.204077715164068e-05, + "loss": 0.9059, + "step": 3131 + }, + { + "epoch": 1.4197642792384406, + "grad_norm": 0.8200727081282149, + "learning_rate": 7.20331975162754e-05, + "loss": 0.8994, + "step": 3132 + }, + { + "epoch": 1.4202175883952854, + "grad_norm": 0.710021777168251, + "learning_rate": 7.202561467269125e-05, + "loss": 0.9069, + "step": 3133 + }, + { + "epoch": 1.4206708975521305, + "grad_norm": 0.6298889417491629, + "learning_rate": 7.201802862164766e-05, + "loss": 0.8953, + "step": 3134 + }, + { + "epoch": 1.4211242067089755, + "grad_norm": 0.7196142419338882, + "learning_rate": 7.201043936390441e-05, + "loss": 0.9065, + "step": 3135 + }, + { + "epoch": 1.4215775158658204, + "grad_norm": 0.8268670517150054, + "learning_rate": 7.200284690022158e-05, + "loss": 0.8808, + "step": 3136 + }, + { + "epoch": 1.4220308250226654, + "grad_norm": 1.0723662199491526, + "learning_rate": 7.199525123135957e-05, + "loss": 0.9024, + "step": 3137 + }, + { + "epoch": 1.4224841341795105, + "grad_norm": 1.1168430977205495, + "learning_rate": 7.198765235807912e-05, + "loss": 0.8987, + "step": 3138 + }, + { + "epoch": 1.4229374433363553, + "grad_norm": 0.9192920649772975, + "learning_rate": 7.198005028114128e-05, + "loss": 0.9095, + "step": 3139 + }, + { + "epoch": 1.4233907524932004, + "grad_norm": 0.9078014576672288, + "learning_rate": 7.197244500130742e-05, + "loss": 0.9187, + "step": 3140 + }, + { + "epoch": 1.4238440616500454, + "grad_norm": 0.9370924759449211, + "learning_rate": 7.196483651933922e-05, + "loss": 0.902, + "step": 3141 + }, + { + "epoch": 1.4242973708068902, + "grad_norm": 1.015949333219389, + "learning_rate": 7.19572248359987e-05, + "loss": 0.8963, + "step": 3142 + }, + { + "epoch": 1.4247506799637353, + "grad_norm": 1.0499215638550403, + "learning_rate": 7.19496099520482e-05, + "loss": 0.8637, + "step": 3143 + }, + { + "epoch": 1.4252039891205803, + "grad_norm": 0.8935859382194378, + "learning_rate": 7.194199186825036e-05, + "loss": 0.9103, + "step": 3144 + }, + { + "epoch": 1.4256572982774252, + "grad_norm": 0.9759864419641953, + "learning_rate": 7.193437058536816e-05, + "loss": 0.9181, + "step": 3145 + }, + { + "epoch": 1.4261106074342702, + "grad_norm": 1.0642759563592108, + "learning_rate": 7.19267461041649e-05, + "loss": 0.8929, + "step": 3146 + }, + { + "epoch": 1.4265639165911153, + "grad_norm": 1.0527131217410866, + "learning_rate": 7.191911842540417e-05, + "loss": 0.8937, + "step": 3147 + }, + { + "epoch": 1.42701722574796, + "grad_norm": 0.8211732886183648, + "learning_rate": 7.191148754984992e-05, + "loss": 0.921, + "step": 3148 + }, + { + "epoch": 1.4274705349048051, + "grad_norm": 0.6110793127379316, + "learning_rate": 7.190385347826642e-05, + "loss": 0.9028, + "step": 3149 + }, + { + "epoch": 1.4279238440616502, + "grad_norm": 0.5701160373580392, + "learning_rate": 7.189621621141823e-05, + "loss": 0.8932, + "step": 3150 + }, + { + "epoch": 1.428377153218495, + "grad_norm": 0.6317923399503657, + "learning_rate": 7.188857575007025e-05, + "loss": 0.8928, + "step": 3151 + }, + { + "epoch": 1.42883046237534, + "grad_norm": 0.6769077689047815, + "learning_rate": 7.188093209498768e-05, + "loss": 0.8929, + "step": 3152 + }, + { + "epoch": 1.4292837715321849, + "grad_norm": 0.807139540451113, + "learning_rate": 7.187328524693605e-05, + "loss": 0.9256, + "step": 3153 + }, + { + "epoch": 1.42973708068903, + "grad_norm": 0.9261470371436863, + "learning_rate": 7.186563520668124e-05, + "loss": 0.8895, + "step": 3154 + }, + { + "epoch": 1.4301903898458748, + "grad_norm": 1.1475835298392403, + "learning_rate": 7.185798197498943e-05, + "loss": 0.911, + "step": 3155 + }, + { + "epoch": 1.4306436990027198, + "grad_norm": 0.9173025483190557, + "learning_rate": 7.185032555262708e-05, + "loss": 0.9103, + "step": 3156 + }, + { + "epoch": 1.4310970081595649, + "grad_norm": 0.8656993318038244, + "learning_rate": 7.184266594036102e-05, + "loss": 0.8976, + "step": 3157 + }, + { + "epoch": 1.4315503173164097, + "grad_norm": 0.9513623036828548, + "learning_rate": 7.183500313895838e-05, + "loss": 0.897, + "step": 3158 + }, + { + "epoch": 1.4320036264732547, + "grad_norm": 1.1639445581592898, + "learning_rate": 7.182733714918662e-05, + "loss": 0.8998, + "step": 3159 + }, + { + "epoch": 1.4324569356300998, + "grad_norm": 0.8347429527317303, + "learning_rate": 7.181966797181351e-05, + "loss": 0.9209, + "step": 3160 + }, + { + "epoch": 1.4329102447869446, + "grad_norm": 0.7343427552230148, + "learning_rate": 7.181199560760712e-05, + "loss": 0.8989, + "step": 3161 + }, + { + "epoch": 1.4333635539437897, + "grad_norm": 0.6176884725847238, + "learning_rate": 7.18043200573359e-05, + "loss": 0.8719, + "step": 3162 + }, + { + "epoch": 1.4338168631006347, + "grad_norm": 0.5016571200979645, + "learning_rate": 7.179664132176853e-05, + "loss": 0.8781, + "step": 3163 + }, + { + "epoch": 1.4342701722574795, + "grad_norm": 0.548159810124031, + "learning_rate": 7.17889594016741e-05, + "loss": 0.9093, + "step": 3164 + }, + { + "epoch": 1.4347234814143246, + "grad_norm": 0.5698110564381629, + "learning_rate": 7.178127429782196e-05, + "loss": 0.8944, + "step": 3165 + }, + { + "epoch": 1.4351767905711696, + "grad_norm": 0.5383089480260332, + "learning_rate": 7.177358601098178e-05, + "loss": 0.8723, + "step": 3166 + }, + { + "epoch": 1.4356300997280145, + "grad_norm": 0.6316127284285209, + "learning_rate": 7.17658945419236e-05, + "loss": 0.8864, + "step": 3167 + }, + { + "epoch": 1.4360834088848595, + "grad_norm": 0.7192284883510054, + "learning_rate": 7.175819989141771e-05, + "loss": 0.8863, + "step": 3168 + }, + { + "epoch": 1.4365367180417046, + "grad_norm": 0.8150562271053579, + "learning_rate": 7.175050206023477e-05, + "loss": 0.9243, + "step": 3169 + }, + { + "epoch": 1.4369900271985494, + "grad_norm": 1.0031799094125031, + "learning_rate": 7.174280104914574e-05, + "loss": 0.9082, + "step": 3170 + }, + { + "epoch": 1.4374433363553945, + "grad_norm": 1.326102589766879, + "learning_rate": 7.173509685892189e-05, + "loss": 0.8982, + "step": 3171 + }, + { + "epoch": 1.4378966455122393, + "grad_norm": 0.6055883263152967, + "learning_rate": 7.172738949033481e-05, + "loss": 0.9008, + "step": 3172 + }, + { + "epoch": 1.4383499546690843, + "grad_norm": 0.4496318131656158, + "learning_rate": 7.171967894415645e-05, + "loss": 0.8873, + "step": 3173 + }, + { + "epoch": 1.4388032638259292, + "grad_norm": 0.8310801017355538, + "learning_rate": 7.1711965221159e-05, + "loss": 0.9015, + "step": 3174 + }, + { + "epoch": 1.4392565729827742, + "grad_norm": 1.3963432452041427, + "learning_rate": 7.170424832211504e-05, + "loss": 0.9098, + "step": 3175 + }, + { + "epoch": 1.4397098821396193, + "grad_norm": 1.1955985845802235, + "learning_rate": 7.169652824779745e-05, + "loss": 0.8918, + "step": 3176 + }, + { + "epoch": 1.440163191296464, + "grad_norm": 0.7161282403571807, + "learning_rate": 7.168880499897939e-05, + "loss": 0.8872, + "step": 3177 + }, + { + "epoch": 1.4406165004533091, + "grad_norm": 1.7793592357349104, + "learning_rate": 7.168107857643437e-05, + "loss": 0.9027, + "step": 3178 + }, + { + "epoch": 1.4410698096101542, + "grad_norm": 0.4592110212062206, + "learning_rate": 7.167334898093623e-05, + "loss": 0.9054, + "step": 3179 + }, + { + "epoch": 1.441523118766999, + "grad_norm": 1.6785592141925805, + "learning_rate": 7.16656162132591e-05, + "loss": 0.906, + "step": 3180 + }, + { + "epoch": 1.441976427923844, + "grad_norm": 0.7854267588153165, + "learning_rate": 7.165788027417744e-05, + "loss": 0.887, + "step": 3181 + }, + { + "epoch": 1.4424297370806891, + "grad_norm": 1.2865546417913956, + "learning_rate": 7.165014116446603e-05, + "loss": 0.8972, + "step": 3182 + }, + { + "epoch": 1.442883046237534, + "grad_norm": 1.3372357114654336, + "learning_rate": 7.164239888489997e-05, + "loss": 0.9048, + "step": 3183 + }, + { + "epoch": 1.443336355394379, + "grad_norm": 1.0213287479972588, + "learning_rate": 7.163465343625468e-05, + "loss": 0.8951, + "step": 3184 + }, + { + "epoch": 1.443789664551224, + "grad_norm": 1.3073463128875316, + "learning_rate": 7.162690481930586e-05, + "loss": 0.9124, + "step": 3185 + }, + { + "epoch": 1.4442429737080689, + "grad_norm": 1.0900747240007347, + "learning_rate": 7.161915303482957e-05, + "loss": 0.9088, + "step": 3186 + }, + { + "epoch": 1.444696282864914, + "grad_norm": 1.475952908792445, + "learning_rate": 7.161139808360218e-05, + "loss": 0.9135, + "step": 3187 + }, + { + "epoch": 1.445149592021759, + "grad_norm": 0.6291658619897117, + "learning_rate": 7.160363996640035e-05, + "loss": 0.912, + "step": 3188 + }, + { + "epoch": 1.4456029011786038, + "grad_norm": 1.6834802229248673, + "learning_rate": 7.159587868400112e-05, + "loss": 0.8946, + "step": 3189 + }, + { + "epoch": 1.4460562103354488, + "grad_norm": 0.7017333978640705, + "learning_rate": 7.158811423718177e-05, + "loss": 0.8983, + "step": 3190 + }, + { + "epoch": 1.4465095194922937, + "grad_norm": 1.7767886576195875, + "learning_rate": 7.158034662671996e-05, + "loss": 0.9244, + "step": 3191 + }, + { + "epoch": 1.4469628286491387, + "grad_norm": 1.2211575260349345, + "learning_rate": 7.157257585339361e-05, + "loss": 0.8943, + "step": 3192 + }, + { + "epoch": 1.4474161378059835, + "grad_norm": 1.8579214654034095, + "learning_rate": 7.1564801917981e-05, + "loss": 0.8945, + "step": 3193 + }, + { + "epoch": 1.4478694469628286, + "grad_norm": 2.3661718606658324, + "learning_rate": 7.155702482126071e-05, + "loss": 0.9237, + "step": 3194 + }, + { + "epoch": 1.4483227561196736, + "grad_norm": 1.2392845572072022, + "learning_rate": 7.154924456401164e-05, + "loss": 0.9014, + "step": 3195 + }, + { + "epoch": 1.4487760652765185, + "grad_norm": 1.432629339909105, + "learning_rate": 7.1541461147013e-05, + "loss": 0.9143, + "step": 3196 + }, + { + "epoch": 1.4492293744333635, + "grad_norm": 1.0518103473821645, + "learning_rate": 7.153367457104432e-05, + "loss": 0.9019, + "step": 3197 + }, + { + "epoch": 1.4496826835902086, + "grad_norm": 1.5036933885353525, + "learning_rate": 7.152588483688547e-05, + "loss": 0.9116, + "step": 3198 + }, + { + "epoch": 1.4501359927470534, + "grad_norm": 1.2542363061599409, + "learning_rate": 7.151809194531661e-05, + "loss": 0.9115, + "step": 3199 + }, + { + "epoch": 1.4505893019038985, + "grad_norm": 1.3296676135485954, + "learning_rate": 7.15102958971182e-05, + "loss": 0.9011, + "step": 3200 + }, + { + "epoch": 1.4510426110607435, + "grad_norm": 1.1675941773727512, + "learning_rate": 7.150249669307104e-05, + "loss": 0.9049, + "step": 3201 + }, + { + "epoch": 1.4514959202175883, + "grad_norm": 0.9200422286976883, + "learning_rate": 7.149469433395626e-05, + "loss": 0.9076, + "step": 3202 + }, + { + "epoch": 1.4519492293744334, + "grad_norm": 0.9081903997886037, + "learning_rate": 7.148688882055528e-05, + "loss": 0.8947, + "step": 3203 + }, + { + "epoch": 1.4524025385312784, + "grad_norm": 0.7986595306132295, + "learning_rate": 7.147908015364986e-05, + "loss": 0.9023, + "step": 3204 + }, + { + "epoch": 1.4528558476881233, + "grad_norm": 0.8251394491236508, + "learning_rate": 7.147126833402203e-05, + "loss": 0.8927, + "step": 3205 + }, + { + "epoch": 1.4533091568449683, + "grad_norm": 0.7638759674476218, + "learning_rate": 7.146345336245419e-05, + "loss": 0.8928, + "step": 3206 + }, + { + "epoch": 1.4537624660018134, + "grad_norm": 0.6434140766237536, + "learning_rate": 7.145563523972902e-05, + "loss": 0.8767, + "step": 3207 + }, + { + "epoch": 1.4542157751586582, + "grad_norm": 0.6829122819946978, + "learning_rate": 7.144781396662955e-05, + "loss": 0.9007, + "step": 3208 + }, + { + "epoch": 1.4546690843155032, + "grad_norm": 0.7481273119110196, + "learning_rate": 7.143998954393908e-05, + "loss": 0.9101, + "step": 3209 + }, + { + "epoch": 1.455122393472348, + "grad_norm": 0.5772072279549394, + "learning_rate": 7.143216197244124e-05, + "loss": 0.8871, + "step": 3210 + }, + { + "epoch": 1.4555757026291931, + "grad_norm": 0.5543038036613563, + "learning_rate": 7.142433125292003e-05, + "loss": 0.9115, + "step": 3211 + }, + { + "epoch": 1.456029011786038, + "grad_norm": 0.6387594978308473, + "learning_rate": 7.141649738615968e-05, + "loss": 0.895, + "step": 3212 + }, + { + "epoch": 1.456482320942883, + "grad_norm": 0.6522456953706728, + "learning_rate": 7.140866037294478e-05, + "loss": 0.9038, + "step": 3213 + }, + { + "epoch": 1.456935630099728, + "grad_norm": 0.6380839782732546, + "learning_rate": 7.140082021406023e-05, + "loss": 0.91, + "step": 3214 + }, + { + "epoch": 1.4573889392565729, + "grad_norm": 0.7922824024306959, + "learning_rate": 7.139297691029127e-05, + "loss": 0.8907, + "step": 3215 + }, + { + "epoch": 1.457842248413418, + "grad_norm": 0.8003424186443753, + "learning_rate": 7.138513046242338e-05, + "loss": 0.9064, + "step": 3216 + }, + { + "epoch": 1.458295557570263, + "grad_norm": 0.8531337210677881, + "learning_rate": 7.137728087124246e-05, + "loss": 0.907, + "step": 3217 + }, + { + "epoch": 1.4587488667271078, + "grad_norm": 0.9720263921809812, + "learning_rate": 7.136942813753465e-05, + "loss": 0.8985, + "step": 3218 + }, + { + "epoch": 1.4592021758839528, + "grad_norm": 1.0145795529137709, + "learning_rate": 7.13615722620864e-05, + "loss": 0.8933, + "step": 3219 + }, + { + "epoch": 1.459655485040798, + "grad_norm": 1.1646601317062875, + "learning_rate": 7.135371324568453e-05, + "loss": 0.9072, + "step": 3220 + }, + { + "epoch": 1.4601087941976427, + "grad_norm": 0.8450495510048055, + "learning_rate": 7.134585108911612e-05, + "loss": 0.8988, + "step": 3221 + }, + { + "epoch": 1.4605621033544878, + "grad_norm": 0.7412539448104492, + "learning_rate": 7.133798579316861e-05, + "loss": 0.8734, + "step": 3222 + }, + { + "epoch": 1.4610154125113328, + "grad_norm": 0.6949117982014194, + "learning_rate": 7.133011735862971e-05, + "loss": 0.9112, + "step": 3223 + }, + { + "epoch": 1.4614687216681777, + "grad_norm": 0.6314769986260067, + "learning_rate": 7.132224578628749e-05, + "loss": 0.902, + "step": 3224 + }, + { + "epoch": 1.4619220308250227, + "grad_norm": 0.6147014174578541, + "learning_rate": 7.131437107693031e-05, + "loss": 0.9019, + "step": 3225 + }, + { + "epoch": 1.4623753399818678, + "grad_norm": 0.698490774256373, + "learning_rate": 7.130649323134681e-05, + "loss": 0.9046, + "step": 3226 + }, + { + "epoch": 1.4628286491387126, + "grad_norm": 0.7733269183514531, + "learning_rate": 7.1298612250326e-05, + "loss": 0.8955, + "step": 3227 + }, + { + "epoch": 1.4632819582955576, + "grad_norm": 0.8394683642537851, + "learning_rate": 7.129072813465723e-05, + "loss": 0.9097, + "step": 3228 + }, + { + "epoch": 1.4637352674524025, + "grad_norm": 1.0632585038930649, + "learning_rate": 7.128284088513004e-05, + "loss": 0.9105, + "step": 3229 + }, + { + "epoch": 1.4641885766092475, + "grad_norm": 1.1528534055317488, + "learning_rate": 7.127495050253441e-05, + "loss": 0.8946, + "step": 3230 + }, + { + "epoch": 1.4646418857660926, + "grad_norm": 0.8502316711575557, + "learning_rate": 7.126705698766057e-05, + "loss": 0.8919, + "step": 3231 + }, + { + "epoch": 1.4650951949229374, + "grad_norm": 0.7220283649943882, + "learning_rate": 7.125916034129908e-05, + "loss": 0.907, + "step": 3232 + }, + { + "epoch": 1.4655485040797824, + "grad_norm": 0.5939919084429469, + "learning_rate": 7.125126056424082e-05, + "loss": 0.8919, + "step": 3233 + }, + { + "epoch": 1.4660018132366273, + "grad_norm": 0.47246639184105194, + "learning_rate": 7.124335765727696e-05, + "loss": 0.9084, + "step": 3234 + }, + { + "epoch": 1.4664551223934723, + "grad_norm": 0.49061150705493495, + "learning_rate": 7.123545162119901e-05, + "loss": 0.9045, + "step": 3235 + }, + { + "epoch": 1.4669084315503174, + "grad_norm": 0.6783821355823716, + "learning_rate": 7.122754245679877e-05, + "loss": 0.9154, + "step": 3236 + }, + { + "epoch": 1.4673617407071622, + "grad_norm": 1.0047594420961272, + "learning_rate": 7.12196301648684e-05, + "loss": 0.9037, + "step": 3237 + }, + { + "epoch": 1.4678150498640072, + "grad_norm": 1.395469813359491, + "learning_rate": 7.12117147462003e-05, + "loss": 0.8924, + "step": 3238 + }, + { + "epoch": 1.4682683590208523, + "grad_norm": 1.5092807764264908, + "learning_rate": 7.120379620158723e-05, + "loss": 0.9194, + "step": 3239 + }, + { + "epoch": 1.4687216681776971, + "grad_norm": 1.1129504486297903, + "learning_rate": 7.11958745318223e-05, + "loss": 0.8888, + "step": 3240 + }, + { + "epoch": 1.4691749773345422, + "grad_norm": 1.8167220245309383, + "learning_rate": 7.118794973769881e-05, + "loss": 0.9162, + "step": 3241 + }, + { + "epoch": 1.4696282864913872, + "grad_norm": 0.7429717578141442, + "learning_rate": 7.118002182001051e-05, + "loss": 0.9172, + "step": 3242 + }, + { + "epoch": 1.470081595648232, + "grad_norm": 2.506255249944693, + "learning_rate": 7.11720907795514e-05, + "loss": 0.9047, + "step": 3243 + }, + { + "epoch": 1.470534904805077, + "grad_norm": 1.7170270088665711, + "learning_rate": 7.116415661711579e-05, + "loss": 0.8876, + "step": 3244 + }, + { + "epoch": 1.4709882139619221, + "grad_norm": 2.440143087435275, + "learning_rate": 7.115621933349828e-05, + "loss": 0.929, + "step": 3245 + }, + { + "epoch": 1.471441523118767, + "grad_norm": 2.3789029900377345, + "learning_rate": 7.114827892949385e-05, + "loss": 0.9061, + "step": 3246 + }, + { + "epoch": 1.471894832275612, + "grad_norm": 1.2676066597334976, + "learning_rate": 7.114033540589774e-05, + "loss": 0.8941, + "step": 3247 + }, + { + "epoch": 1.472348141432457, + "grad_norm": 1.013288972686442, + "learning_rate": 7.113238876350553e-05, + "loss": 0.8943, + "step": 3248 + }, + { + "epoch": 1.472801450589302, + "grad_norm": 1.9588695326335641, + "learning_rate": 7.112443900311308e-05, + "loss": 0.8988, + "step": 3249 + }, + { + "epoch": 1.473254759746147, + "grad_norm": 1.1508903214619797, + "learning_rate": 7.111648612551659e-05, + "loss": 0.8753, + "step": 3250 + }, + { + "epoch": 1.4737080689029918, + "grad_norm": 2.4509719459795978, + "learning_rate": 7.110853013151255e-05, + "loss": 0.8851, + "step": 3251 + }, + { + "epoch": 1.4741613780598368, + "grad_norm": 2.3735625474918116, + "learning_rate": 7.110057102189782e-05, + "loss": 0.9052, + "step": 3252 + }, + { + "epoch": 1.4746146872166817, + "grad_norm": 1.1612416655957616, + "learning_rate": 7.109260879746948e-05, + "loss": 0.915, + "step": 3253 + }, + { + "epoch": 1.4750679963735267, + "grad_norm": 1.4232112275275395, + "learning_rate": 7.1084643459025e-05, + "loss": 0.9134, + "step": 3254 + }, + { + "epoch": 1.4755213055303718, + "grad_norm": 0.9350642921992706, + "learning_rate": 7.107667500736212e-05, + "loss": 0.9232, + "step": 3255 + }, + { + "epoch": 1.4759746146872166, + "grad_norm": 1.6084797159781334, + "learning_rate": 7.10687034432789e-05, + "loss": 0.9296, + "step": 3256 + }, + { + "epoch": 1.4764279238440616, + "grad_norm": 0.8871146874826087, + "learning_rate": 7.106072876757373e-05, + "loss": 0.9069, + "step": 3257 + }, + { + "epoch": 1.4768812330009067, + "grad_norm": 1.6790697360389144, + "learning_rate": 7.105275098104526e-05, + "loss": 0.9148, + "step": 3258 + }, + { + "epoch": 1.4773345421577515, + "grad_norm": 1.447454186711884, + "learning_rate": 7.104477008449256e-05, + "loss": 0.9188, + "step": 3259 + }, + { + "epoch": 1.4777878513145966, + "grad_norm": 1.3919892557788291, + "learning_rate": 7.103678607871486e-05, + "loss": 0.8956, + "step": 3260 + }, + { + "epoch": 1.4782411604714416, + "grad_norm": 1.6562279076531456, + "learning_rate": 7.102879896451184e-05, + "loss": 0.8984, + "step": 3261 + }, + { + "epoch": 1.4786944696282864, + "grad_norm": 1.3354598441288585, + "learning_rate": 7.102080874268341e-05, + "loss": 0.9049, + "step": 3262 + }, + { + "epoch": 1.4791477787851315, + "grad_norm": 1.8275006566139365, + "learning_rate": 7.10128154140298e-05, + "loss": 0.9046, + "step": 3263 + }, + { + "epoch": 1.4796010879419765, + "grad_norm": 1.597143921726934, + "learning_rate": 7.10048189793516e-05, + "loss": 0.9324, + "step": 3264 + }, + { + "epoch": 1.4800543970988214, + "grad_norm": 1.562355115452566, + "learning_rate": 7.099681943944965e-05, + "loss": 0.9036, + "step": 3265 + }, + { + "epoch": 1.4805077062556664, + "grad_norm": 1.1563081048454444, + "learning_rate": 7.098881679512513e-05, + "loss": 0.9202, + "step": 3266 + }, + { + "epoch": 1.4809610154125115, + "grad_norm": 1.7611516827998022, + "learning_rate": 7.098081104717955e-05, + "loss": 0.9067, + "step": 3267 + }, + { + "epoch": 1.4814143245693563, + "grad_norm": 1.2023912024359766, + "learning_rate": 7.097280219641468e-05, + "loss": 0.9084, + "step": 3268 + }, + { + "epoch": 1.4818676337262013, + "grad_norm": 1.902415957093118, + "learning_rate": 7.096479024363265e-05, + "loss": 0.8984, + "step": 3269 + }, + { + "epoch": 1.4823209428830462, + "grad_norm": 1.443573019419363, + "learning_rate": 7.095677518963586e-05, + "loss": 0.9017, + "step": 3270 + }, + { + "epoch": 1.4827742520398912, + "grad_norm": 1.8032899572081773, + "learning_rate": 7.094875703522707e-05, + "loss": 0.9058, + "step": 3271 + }, + { + "epoch": 1.483227561196736, + "grad_norm": 1.3539517596332717, + "learning_rate": 7.09407357812093e-05, + "loss": 0.9125, + "step": 3272 + }, + { + "epoch": 1.483680870353581, + "grad_norm": 1.835188729033868, + "learning_rate": 7.093271142838589e-05, + "loss": 0.9107, + "step": 3273 + }, + { + "epoch": 1.4841341795104261, + "grad_norm": 1.3677815746687316, + "learning_rate": 7.092468397756054e-05, + "loss": 0.9017, + "step": 3274 + }, + { + "epoch": 1.484587488667271, + "grad_norm": 1.9611693498289229, + "learning_rate": 7.091665342953721e-05, + "loss": 0.8921, + "step": 3275 + }, + { + "epoch": 1.485040797824116, + "grad_norm": 1.5579918630728795, + "learning_rate": 7.090861978512018e-05, + "loss": 0.9229, + "step": 3276 + }, + { + "epoch": 1.485494106980961, + "grad_norm": 1.7941446669276915, + "learning_rate": 7.090058304511402e-05, + "loss": 0.9, + "step": 3277 + }, + { + "epoch": 1.485947416137806, + "grad_norm": 1.4703322264253023, + "learning_rate": 7.089254321032366e-05, + "loss": 0.8917, + "step": 3278 + }, + { + "epoch": 1.486400725294651, + "grad_norm": 1.8397786721283123, + "learning_rate": 7.088450028155431e-05, + "loss": 0.8937, + "step": 3279 + }, + { + "epoch": 1.486854034451496, + "grad_norm": 1.5294071751552087, + "learning_rate": 7.087645425961148e-05, + "loss": 0.9229, + "step": 3280 + }, + { + "epoch": 1.4873073436083408, + "grad_norm": 1.6780856929444898, + "learning_rate": 7.086840514530102e-05, + "loss": 0.9194, + "step": 3281 + }, + { + "epoch": 1.4877606527651859, + "grad_norm": 1.4225281062074104, + "learning_rate": 7.086035293942905e-05, + "loss": 0.9003, + "step": 3282 + }, + { + "epoch": 1.488213961922031, + "grad_norm": 1.6329784025062504, + "learning_rate": 7.085229764280205e-05, + "loss": 0.8972, + "step": 3283 + }, + { + "epoch": 1.4886672710788758, + "grad_norm": 1.3784895870955867, + "learning_rate": 7.084423925622677e-05, + "loss": 0.9046, + "step": 3284 + }, + { + "epoch": 1.4891205802357208, + "grad_norm": 1.5911963236706699, + "learning_rate": 7.083617778051027e-05, + "loss": 0.8913, + "step": 3285 + }, + { + "epoch": 1.4895738893925659, + "grad_norm": 1.3756590800513135, + "learning_rate": 7.082811321645995e-05, + "loss": 0.901, + "step": 3286 + }, + { + "epoch": 1.4900271985494107, + "grad_norm": 1.5117347204825644, + "learning_rate": 7.08200455648835e-05, + "loss": 0.8817, + "step": 3287 + }, + { + "epoch": 1.4904805077062557, + "grad_norm": 1.4644498534901156, + "learning_rate": 7.081197482658887e-05, + "loss": 0.8817, + "step": 3288 + }, + { + "epoch": 1.4909338168631006, + "grad_norm": 1.2506868489199763, + "learning_rate": 7.080390100238443e-05, + "loss": 0.9307, + "step": 3289 + }, + { + "epoch": 1.4913871260199456, + "grad_norm": 1.3043747747048569, + "learning_rate": 7.079582409307877e-05, + "loss": 0.8847, + "step": 3290 + }, + { + "epoch": 1.4918404351767904, + "grad_norm": 0.9792419969841942, + "learning_rate": 7.078774409948083e-05, + "loss": 0.9146, + "step": 3291 + }, + { + "epoch": 1.4922937443336355, + "grad_norm": 1.336746189677506, + "learning_rate": 7.077966102239983e-05, + "loss": 0.893, + "step": 3292 + }, + { + "epoch": 1.4927470534904805, + "grad_norm": 0.8777292877462655, + "learning_rate": 7.077157486264531e-05, + "loss": 0.8889, + "step": 3293 + }, + { + "epoch": 1.4932003626473254, + "grad_norm": 0.8990996081294912, + "learning_rate": 7.076348562102713e-05, + "loss": 0.8967, + "step": 3294 + }, + { + "epoch": 1.4936536718041704, + "grad_norm": 0.75040700245139, + "learning_rate": 7.075539329835547e-05, + "loss": 0.9003, + "step": 3295 + }, + { + "epoch": 1.4941069809610155, + "grad_norm": 0.9123555127305218, + "learning_rate": 7.074729789544077e-05, + "loss": 0.8929, + "step": 3296 + }, + { + "epoch": 1.4945602901178603, + "grad_norm": 0.9509062101615217, + "learning_rate": 7.073919941309383e-05, + "loss": 0.8916, + "step": 3297 + }, + { + "epoch": 1.4950135992747053, + "grad_norm": 0.7505501063977574, + "learning_rate": 7.073109785212572e-05, + "loss": 0.8849, + "step": 3298 + }, + { + "epoch": 1.4954669084315504, + "grad_norm": 0.5655848842881394, + "learning_rate": 7.072299321334783e-05, + "loss": 0.8935, + "step": 3299 + }, + { + "epoch": 1.4959202175883952, + "grad_norm": 0.6540167355922669, + "learning_rate": 7.07148854975719e-05, + "loss": 0.9066, + "step": 3300 + }, + { + "epoch": 1.4963735267452403, + "grad_norm": 0.6202326548469641, + "learning_rate": 7.070677470560991e-05, + "loss": 0.9003, + "step": 3301 + }, + { + "epoch": 1.4968268359020853, + "grad_norm": 0.6556701993820765, + "learning_rate": 7.069866083827418e-05, + "loss": 0.912, + "step": 3302 + }, + { + "epoch": 1.4972801450589301, + "grad_norm": 0.8197576644806988, + "learning_rate": 7.069054389637736e-05, + "loss": 0.9057, + "step": 3303 + }, + { + "epoch": 1.4977334542157752, + "grad_norm": 0.852560579976863, + "learning_rate": 7.068242388073237e-05, + "loss": 0.9042, + "step": 3304 + }, + { + "epoch": 1.4981867633726202, + "grad_norm": 0.6860141325879088, + "learning_rate": 7.067430079215244e-05, + "loss": 0.8942, + "step": 3305 + }, + { + "epoch": 1.498640072529465, + "grad_norm": 0.4526533742470864, + "learning_rate": 7.066617463145114e-05, + "loss": 0.918, + "step": 3306 + }, + { + "epoch": 1.4990933816863101, + "grad_norm": 0.7592486861395077, + "learning_rate": 7.065804539944232e-05, + "loss": 0.8881, + "step": 3307 + }, + { + "epoch": 1.499546690843155, + "grad_norm": 0.7365517170897338, + "learning_rate": 7.064991309694016e-05, + "loss": 0.9101, + "step": 3308 + }, + { + "epoch": 1.5, + "grad_norm": 0.5890823076918769, + "learning_rate": 7.064177772475912e-05, + "loss": 0.8926, + "step": 3309 + }, + { + "epoch": 1.5004533091568448, + "grad_norm": 0.5653683138639732, + "learning_rate": 7.063363928371399e-05, + "loss": 0.9015, + "step": 3310 + }, + { + "epoch": 1.50090661831369, + "grad_norm": 0.5273691032072996, + "learning_rate": 7.062549777461986e-05, + "loss": 0.8938, + "step": 3311 + }, + { + "epoch": 1.501359927470535, + "grad_norm": 0.46541007480786406, + "learning_rate": 7.061735319829211e-05, + "loss": 0.879, + "step": 3312 + }, + { + "epoch": 1.5018132366273798, + "grad_norm": 0.4713838799274754, + "learning_rate": 7.060920555554646e-05, + "loss": 0.8759, + "step": 3313 + }, + { + "epoch": 1.5022665457842248, + "grad_norm": 0.3955417921863602, + "learning_rate": 7.060105484719892e-05, + "loss": 0.8903, + "step": 3314 + }, + { + "epoch": 1.5027198549410699, + "grad_norm": 0.3816685269424007, + "learning_rate": 7.05929010740658e-05, + "loss": 0.8952, + "step": 3315 + }, + { + "epoch": 1.5031731640979147, + "grad_norm": 0.5190419671192998, + "learning_rate": 7.058474423696372e-05, + "loss": 0.9082, + "step": 3316 + }, + { + "epoch": 1.5036264732547597, + "grad_norm": 0.5454633440090965, + "learning_rate": 7.057658433670963e-05, + "loss": 0.9086, + "step": 3317 + }, + { + "epoch": 1.5040797824116048, + "grad_norm": 0.4021834349507755, + "learning_rate": 7.056842137412074e-05, + "loss": 0.899, + "step": 3318 + }, + { + "epoch": 1.5045330915684496, + "grad_norm": 0.42788270520017135, + "learning_rate": 7.056025535001463e-05, + "loss": 0.8997, + "step": 3319 + }, + { + "epoch": 1.5049864007252947, + "grad_norm": 0.39814113805394696, + "learning_rate": 7.055208626520911e-05, + "loss": 0.8956, + "step": 3320 + }, + { + "epoch": 1.5054397098821397, + "grad_norm": 0.4405870168866256, + "learning_rate": 7.054391412052237e-05, + "loss": 0.9059, + "step": 3321 + }, + { + "epoch": 1.5058930190389845, + "grad_norm": 0.5366976988878929, + "learning_rate": 7.053573891677286e-05, + "loss": 0.8991, + "step": 3322 + }, + { + "epoch": 1.5063463281958296, + "grad_norm": 0.5054686870723185, + "learning_rate": 7.052756065477935e-05, + "loss": 0.9139, + "step": 3323 + }, + { + "epoch": 1.5067996373526746, + "grad_norm": 0.43999582393264464, + "learning_rate": 7.051937933536094e-05, + "loss": 0.9084, + "step": 3324 + }, + { + "epoch": 1.5072529465095195, + "grad_norm": 0.4926951676183928, + "learning_rate": 7.051119495933698e-05, + "loss": 0.8976, + "step": 3325 + }, + { + "epoch": 1.5077062556663645, + "grad_norm": 0.6397742928154375, + "learning_rate": 7.050300752752718e-05, + "loss": 0.8915, + "step": 3326 + }, + { + "epoch": 1.5081595648232096, + "grad_norm": 0.8346278706649632, + "learning_rate": 7.049481704075152e-05, + "loss": 0.8957, + "step": 3327 + }, + { + "epoch": 1.5086128739800544, + "grad_norm": 0.9004008048402814, + "learning_rate": 7.048662349983034e-05, + "loss": 0.8894, + "step": 3328 + }, + { + "epoch": 1.5090661831368992, + "grad_norm": 0.9496221999211547, + "learning_rate": 7.047842690558419e-05, + "loss": 0.9081, + "step": 3329 + }, + { + "epoch": 1.5095194922937445, + "grad_norm": 0.9915316006264037, + "learning_rate": 7.0470227258834e-05, + "loss": 0.8813, + "step": 3330 + }, + { + "epoch": 1.5099728014505893, + "grad_norm": 0.9669135163970264, + "learning_rate": 7.046202456040103e-05, + "loss": 0.9077, + "step": 3331 + }, + { + "epoch": 1.5104261106074341, + "grad_norm": 0.8758135203084915, + "learning_rate": 7.045381881110677e-05, + "loss": 0.91, + "step": 3332 + }, + { + "epoch": 1.5108794197642792, + "grad_norm": 0.912517255565864, + "learning_rate": 7.044561001177304e-05, + "loss": 0.9021, + "step": 3333 + }, + { + "epoch": 1.5113327289211242, + "grad_norm": 0.7938574796467526, + "learning_rate": 7.043739816322199e-05, + "loss": 0.9004, + "step": 3334 + }, + { + "epoch": 1.511786038077969, + "grad_norm": 0.688782370051854, + "learning_rate": 7.042918326627607e-05, + "loss": 0.9141, + "step": 3335 + }, + { + "epoch": 1.5122393472348141, + "grad_norm": 0.5326383696137961, + "learning_rate": 7.042096532175801e-05, + "loss": 0.9114, + "step": 3336 + }, + { + "epoch": 1.5126926563916592, + "grad_norm": 0.5241141906092749, + "learning_rate": 7.041274433049087e-05, + "loss": 0.9116, + "step": 3337 + }, + { + "epoch": 1.513145965548504, + "grad_norm": 0.4547977338761953, + "learning_rate": 7.0404520293298e-05, + "loss": 0.8799, + "step": 3338 + }, + { + "epoch": 1.513599274705349, + "grad_norm": 0.3594040940113845, + "learning_rate": 7.039629321100307e-05, + "loss": 0.909, + "step": 3339 + }, + { + "epoch": 1.514052583862194, + "grad_norm": 0.3918782801833048, + "learning_rate": 7.038806308443003e-05, + "loss": 0.8778, + "step": 3340 + }, + { + "epoch": 1.514505893019039, + "grad_norm": 0.48994967263384365, + "learning_rate": 7.037982991440318e-05, + "loss": 0.8936, + "step": 3341 + }, + { + "epoch": 1.514959202175884, + "grad_norm": 0.5697444567085274, + "learning_rate": 7.037159370174706e-05, + "loss": 0.8858, + "step": 3342 + }, + { + "epoch": 1.515412511332729, + "grad_norm": 0.7242146399222581, + "learning_rate": 7.036335444728659e-05, + "loss": 0.9347, + "step": 3343 + }, + { + "epoch": 1.5158658204895739, + "grad_norm": 0.8943945679676468, + "learning_rate": 7.035511215184693e-05, + "loss": 0.8991, + "step": 3344 + }, + { + "epoch": 1.516319129646419, + "grad_norm": 0.9687249309522145, + "learning_rate": 7.034686681625356e-05, + "loss": 0.8921, + "step": 3345 + }, + { + "epoch": 1.516772438803264, + "grad_norm": 1.0235305929131058, + "learning_rate": 7.033861844133231e-05, + "loss": 0.8892, + "step": 3346 + }, + { + "epoch": 1.5172257479601088, + "grad_norm": 1.0673292046505296, + "learning_rate": 7.033036702790926e-05, + "loss": 0.8988, + "step": 3347 + }, + { + "epoch": 1.5176790571169536, + "grad_norm": 0.9591501078290576, + "learning_rate": 7.032211257681081e-05, + "loss": 0.8905, + "step": 3348 + }, + { + "epoch": 1.5181323662737989, + "grad_norm": 0.9874092311163275, + "learning_rate": 7.031385508886366e-05, + "loss": 0.9019, + "step": 3349 + }, + { + "epoch": 1.5185856754306437, + "grad_norm": 1.0717576107381335, + "learning_rate": 7.030559456489485e-05, + "loss": 0.9008, + "step": 3350 + }, + { + "epoch": 1.5190389845874885, + "grad_norm": 0.8679688490971372, + "learning_rate": 7.029733100573166e-05, + "loss": 0.9016, + "step": 3351 + }, + { + "epoch": 1.5194922937443336, + "grad_norm": 0.7520545748859458, + "learning_rate": 7.028906441220175e-05, + "loss": 0.8886, + "step": 3352 + }, + { + "epoch": 1.5199456029011786, + "grad_norm": 0.6107535407183008, + "learning_rate": 7.0280794785133e-05, + "loss": 0.8898, + "step": 3353 + }, + { + "epoch": 1.5203989120580235, + "grad_norm": 0.5336892549567048, + "learning_rate": 7.027252212535366e-05, + "loss": 0.9014, + "step": 3354 + }, + { + "epoch": 1.5208522212148685, + "grad_norm": 0.7317971363348532, + "learning_rate": 7.026424643369226e-05, + "loss": 0.8946, + "step": 3355 + }, + { + "epoch": 1.5213055303717136, + "grad_norm": 1.0016289026426834, + "learning_rate": 7.025596771097765e-05, + "loss": 0.8886, + "step": 3356 + }, + { + "epoch": 1.5217588395285584, + "grad_norm": 1.2878188708118157, + "learning_rate": 7.024768595803894e-05, + "loss": 0.9133, + "step": 3357 + }, + { + "epoch": 1.5222121486854034, + "grad_norm": 0.5252883848090227, + "learning_rate": 7.023940117570557e-05, + "loss": 0.8979, + "step": 3358 + }, + { + "epoch": 1.5226654578422485, + "grad_norm": 0.5276166515097288, + "learning_rate": 7.023111336480733e-05, + "loss": 0.8885, + "step": 3359 + }, + { + "epoch": 1.5231187669990933, + "grad_norm": 0.9789084059029532, + "learning_rate": 7.022282252617424e-05, + "loss": 0.9036, + "step": 3360 + }, + { + "epoch": 1.5235720761559384, + "grad_norm": 1.3849525168381924, + "learning_rate": 7.021452866063662e-05, + "loss": 0.8768, + "step": 3361 + }, + { + "epoch": 1.5240253853127834, + "grad_norm": 0.4741132653280992, + "learning_rate": 7.020623176902518e-05, + "loss": 0.9212, + "step": 3362 + }, + { + "epoch": 1.5244786944696282, + "grad_norm": 1.111462738455124, + "learning_rate": 7.019793185217084e-05, + "loss": 0.8827, + "step": 3363 + }, + { + "epoch": 1.5249320036264733, + "grad_norm": 1.497816980937771, + "learning_rate": 7.018962891090488e-05, + "loss": 0.8846, + "step": 3364 + }, + { + "epoch": 1.5253853127833183, + "grad_norm": 0.6108396594491565, + "learning_rate": 7.018132294605886e-05, + "loss": 0.8955, + "step": 3365 + }, + { + "epoch": 1.5258386219401632, + "grad_norm": 1.6223579011525113, + "learning_rate": 7.017301395846465e-05, + "loss": 0.9209, + "step": 3366 + }, + { + "epoch": 1.526291931097008, + "grad_norm": 0.6580085945863995, + "learning_rate": 7.016470194895441e-05, + "loss": 0.8887, + "step": 3367 + }, + { + "epoch": 1.5267452402538533, + "grad_norm": 1.50853196468573, + "learning_rate": 7.015638691836062e-05, + "loss": 0.9198, + "step": 3368 + }, + { + "epoch": 1.527198549410698, + "grad_norm": 0.7070377971919184, + "learning_rate": 7.014806886751607e-05, + "loss": 0.9115, + "step": 3369 + }, + { + "epoch": 1.527651858567543, + "grad_norm": 1.2103878732040383, + "learning_rate": 7.013974779725378e-05, + "loss": 0.9283, + "step": 3370 + }, + { + "epoch": 1.528105167724388, + "grad_norm": 0.9769799095285435, + "learning_rate": 7.013142370840718e-05, + "loss": 0.9198, + "step": 3371 + }, + { + "epoch": 1.528558476881233, + "grad_norm": 0.9101488604561656, + "learning_rate": 7.012309660180997e-05, + "loss": 0.8972, + "step": 3372 + }, + { + "epoch": 1.5290117860380779, + "grad_norm": 0.8561616380193452, + "learning_rate": 7.011476647829607e-05, + "loss": 0.8887, + "step": 3373 + }, + { + "epoch": 1.529465095194923, + "grad_norm": 0.7317976020934038, + "learning_rate": 7.010643333869983e-05, + "loss": 0.9168, + "step": 3374 + }, + { + "epoch": 1.529918404351768, + "grad_norm": 0.6685895829805628, + "learning_rate": 7.00980971838558e-05, + "loss": 0.9158, + "step": 3375 + }, + { + "epoch": 1.5303717135086128, + "grad_norm": 0.8167512804369763, + "learning_rate": 7.008975801459887e-05, + "loss": 0.9066, + "step": 3376 + }, + { + "epoch": 1.5308250226654578, + "grad_norm": 0.833754564549505, + "learning_rate": 7.008141583176425e-05, + "loss": 0.8832, + "step": 3377 + }, + { + "epoch": 1.5312783318223029, + "grad_norm": 0.8965520665338851, + "learning_rate": 7.007307063618744e-05, + "loss": 0.8929, + "step": 3378 + }, + { + "epoch": 1.5317316409791477, + "grad_norm": 0.886864193927665, + "learning_rate": 7.00647224287042e-05, + "loss": 0.9092, + "step": 3379 + }, + { + "epoch": 1.5321849501359928, + "grad_norm": 0.8699450942950521, + "learning_rate": 7.005637121015065e-05, + "loss": 0.8885, + "step": 3380 + }, + { + "epoch": 1.5326382592928378, + "grad_norm": 0.8137933455051792, + "learning_rate": 7.004801698136322e-05, + "loss": 0.9095, + "step": 3381 + }, + { + "epoch": 1.5330915684496826, + "grad_norm": 0.7285902798827357, + "learning_rate": 7.003965974317854e-05, + "loss": 0.8958, + "step": 3382 + }, + { + "epoch": 1.5335448776065277, + "grad_norm": 0.7090372173601494, + "learning_rate": 7.003129949643368e-05, + "loss": 0.8957, + "step": 3383 + }, + { + "epoch": 1.5339981867633727, + "grad_norm": 0.6531674523852011, + "learning_rate": 7.00229362419659e-05, + "loss": 0.8869, + "step": 3384 + }, + { + "epoch": 1.5344514959202176, + "grad_norm": 0.6003696444961539, + "learning_rate": 7.001456998061284e-05, + "loss": 0.9165, + "step": 3385 + }, + { + "epoch": 1.5349048050770624, + "grad_norm": 0.47392509676360667, + "learning_rate": 7.000620071321236e-05, + "loss": 0.9007, + "step": 3386 + }, + { + "epoch": 1.5353581142339077, + "grad_norm": 0.39918819588074517, + "learning_rate": 6.999782844060271e-05, + "loss": 0.8948, + "step": 3387 + }, + { + "epoch": 1.5358114233907525, + "grad_norm": 0.5221063524958204, + "learning_rate": 6.998945316362237e-05, + "loss": 0.9057, + "step": 3388 + }, + { + "epoch": 1.5362647325475973, + "grad_norm": 0.9228123442906967, + "learning_rate": 6.998107488311016e-05, + "loss": 0.8937, + "step": 3389 + }, + { + "epoch": 1.5367180417044426, + "grad_norm": 0.5343555116261027, + "learning_rate": 6.997269359990519e-05, + "loss": 0.9028, + "step": 3390 + }, + { + "epoch": 1.5371713508612874, + "grad_norm": 0.5828914734242364, + "learning_rate": 6.996430931484687e-05, + "loss": 0.9044, + "step": 3391 + }, + { + "epoch": 1.5376246600181322, + "grad_norm": 0.5322432898200647, + "learning_rate": 6.99559220287749e-05, + "loss": 0.9034, + "step": 3392 + }, + { + "epoch": 1.5380779691749773, + "grad_norm": 0.5744264197458183, + "learning_rate": 6.994753174252931e-05, + "loss": 0.9047, + "step": 3393 + }, + { + "epoch": 1.5385312783318223, + "grad_norm": 0.6694680227336541, + "learning_rate": 6.99391384569504e-05, + "loss": 0.9202, + "step": 3394 + }, + { + "epoch": 1.5389845874886672, + "grad_norm": 0.8034527954030958, + "learning_rate": 6.993074217287877e-05, + "loss": 0.8843, + "step": 3395 + }, + { + "epoch": 1.5394378966455122, + "grad_norm": 1.0222444658171286, + "learning_rate": 6.992234289115536e-05, + "loss": 0.9249, + "step": 3396 + }, + { + "epoch": 1.5398912058023573, + "grad_norm": 1.180064655935955, + "learning_rate": 6.991394061262137e-05, + "loss": 0.9214, + "step": 3397 + }, + { + "epoch": 1.540344514959202, + "grad_norm": 0.7935822458250976, + "learning_rate": 6.99055353381183e-05, + "loss": 0.8932, + "step": 3398 + }, + { + "epoch": 1.5407978241160472, + "grad_norm": 0.6971538712908592, + "learning_rate": 6.989712706848799e-05, + "loss": 0.8871, + "step": 3399 + }, + { + "epoch": 1.5412511332728922, + "grad_norm": 0.6533145569028125, + "learning_rate": 6.988871580457254e-05, + "loss": 0.927, + "step": 3400 + }, + { + "epoch": 1.541704442429737, + "grad_norm": 0.6150714008753683, + "learning_rate": 6.988030154721435e-05, + "loss": 0.8704, + "step": 3401 + }, + { + "epoch": 1.542157751586582, + "grad_norm": 0.4865609556401304, + "learning_rate": 6.987188429725613e-05, + "loss": 0.9193, + "step": 3402 + }, + { + "epoch": 1.5426110607434271, + "grad_norm": 0.6035085221770325, + "learning_rate": 6.986346405554093e-05, + "loss": 0.9232, + "step": 3403 + }, + { + "epoch": 1.543064369900272, + "grad_norm": 0.7311506136717091, + "learning_rate": 6.985504082291203e-05, + "loss": 0.906, + "step": 3404 + }, + { + "epoch": 1.543517679057117, + "grad_norm": 0.8357909530675701, + "learning_rate": 6.984661460021306e-05, + "loss": 0.896, + "step": 3405 + }, + { + "epoch": 1.543970988213962, + "grad_norm": 1.026077470920874, + "learning_rate": 6.98381853882879e-05, + "loss": 0.8744, + "step": 3406 + }, + { + "epoch": 1.5444242973708069, + "grad_norm": 1.150779634252831, + "learning_rate": 6.982975318798079e-05, + "loss": 0.8974, + "step": 3407 + }, + { + "epoch": 1.5448776065276517, + "grad_norm": 0.8562640108628576, + "learning_rate": 6.982131800013623e-05, + "loss": 0.8882, + "step": 3408 + }, + { + "epoch": 1.545330915684497, + "grad_norm": 0.6419332139074357, + "learning_rate": 6.981287982559903e-05, + "loss": 0.9032, + "step": 3409 + }, + { + "epoch": 1.5457842248413418, + "grad_norm": 0.572468817413538, + "learning_rate": 6.98044386652143e-05, + "loss": 0.9135, + "step": 3410 + }, + { + "epoch": 1.5462375339981866, + "grad_norm": 0.5962601192884351, + "learning_rate": 6.979599451982743e-05, + "loss": 0.914, + "step": 3411 + }, + { + "epoch": 1.5466908431550317, + "grad_norm": 0.7821330035186534, + "learning_rate": 6.978754739028416e-05, + "loss": 0.9059, + "step": 3412 + }, + { + "epoch": 1.5471441523118767, + "grad_norm": 0.791601466619343, + "learning_rate": 6.977909727743045e-05, + "loss": 0.9048, + "step": 3413 + }, + { + "epoch": 1.5475974614687216, + "grad_norm": 0.6415978982589958, + "learning_rate": 6.977064418211266e-05, + "loss": 0.9096, + "step": 3414 + }, + { + "epoch": 1.5480507706255666, + "grad_norm": 0.6143459349325524, + "learning_rate": 6.976218810517734e-05, + "loss": 0.8819, + "step": 3415 + }, + { + "epoch": 1.5485040797824117, + "grad_norm": 0.711029069031847, + "learning_rate": 6.975372904747142e-05, + "loss": 0.8853, + "step": 3416 + }, + { + "epoch": 1.5489573889392565, + "grad_norm": 0.8341209328231186, + "learning_rate": 6.97452670098421e-05, + "loss": 0.883, + "step": 3417 + }, + { + "epoch": 1.5494106980961015, + "grad_norm": 0.9248672492845741, + "learning_rate": 6.973680199313684e-05, + "loss": 0.8854, + "step": 3418 + }, + { + "epoch": 1.5498640072529466, + "grad_norm": 1.1583456357142456, + "learning_rate": 6.972833399820348e-05, + "loss": 0.9021, + "step": 3419 + }, + { + "epoch": 1.5503173164097914, + "grad_norm": 0.9140559549931877, + "learning_rate": 6.97198630258901e-05, + "loss": 0.8932, + "step": 3420 + }, + { + "epoch": 1.5507706255666365, + "grad_norm": 0.8411550587773674, + "learning_rate": 6.97113890770451e-05, + "loss": 0.905, + "step": 3421 + }, + { + "epoch": 1.5512239347234815, + "grad_norm": 0.7360667762364022, + "learning_rate": 6.970291215251715e-05, + "loss": 0.9285, + "step": 3422 + }, + { + "epoch": 1.5516772438803264, + "grad_norm": 0.5859223611011717, + "learning_rate": 6.969443225315527e-05, + "loss": 0.8919, + "step": 3423 + }, + { + "epoch": 1.5521305530371714, + "grad_norm": 0.5065398620356175, + "learning_rate": 6.968594937980873e-05, + "loss": 0.9055, + "step": 3424 + }, + { + "epoch": 1.5525838621940165, + "grad_norm": 0.5969844181116204, + "learning_rate": 6.96774635333271e-05, + "loss": 0.8979, + "step": 3425 + }, + { + "epoch": 1.5530371713508613, + "grad_norm": 0.7884158953415088, + "learning_rate": 6.966897471456028e-05, + "loss": 0.9079, + "step": 3426 + }, + { + "epoch": 1.553490480507706, + "grad_norm": 0.9884854114470172, + "learning_rate": 6.966048292435846e-05, + "loss": 0.883, + "step": 3427 + }, + { + "epoch": 1.5539437896645514, + "grad_norm": 1.0706041689487145, + "learning_rate": 6.965198816357209e-05, + "loss": 0.8781, + "step": 3428 + }, + { + "epoch": 1.5543970988213962, + "grad_norm": 0.8713524558609721, + "learning_rate": 6.964349043305195e-05, + "loss": 0.8833, + "step": 3429 + }, + { + "epoch": 1.554850407978241, + "grad_norm": 0.8643355559831445, + "learning_rate": 6.963498973364914e-05, + "loss": 0.8809, + "step": 3430 + }, + { + "epoch": 1.555303717135086, + "grad_norm": 0.9873693246429236, + "learning_rate": 6.962648606621502e-05, + "loss": 0.8994, + "step": 3431 + }, + { + "epoch": 1.5557570262919311, + "grad_norm": 1.0125334462256448, + "learning_rate": 6.961797943160123e-05, + "loss": 0.8991, + "step": 3432 + }, + { + "epoch": 1.556210335448776, + "grad_norm": 0.7979227834705195, + "learning_rate": 6.960946983065976e-05, + "loss": 0.9225, + "step": 3433 + }, + { + "epoch": 1.556663644605621, + "grad_norm": 0.7605212939947471, + "learning_rate": 6.960095726424287e-05, + "loss": 0.8914, + "step": 3434 + }, + { + "epoch": 1.557116953762466, + "grad_norm": 0.982467368105426, + "learning_rate": 6.959244173320311e-05, + "loss": 0.8929, + "step": 3435 + }, + { + "epoch": 1.5575702629193109, + "grad_norm": 0.9126229432614992, + "learning_rate": 6.958392323839334e-05, + "loss": 0.9008, + "step": 3436 + }, + { + "epoch": 1.558023572076156, + "grad_norm": 0.9147660778390037, + "learning_rate": 6.957540178066673e-05, + "loss": 0.9056, + "step": 3437 + }, + { + "epoch": 1.558476881233001, + "grad_norm": 1.040939884411926, + "learning_rate": 6.956687736087669e-05, + "loss": 0.8953, + "step": 3438 + }, + { + "epoch": 1.5589301903898458, + "grad_norm": 1.1737851018063692, + "learning_rate": 6.9558349979877e-05, + "loss": 0.8972, + "step": 3439 + }, + { + "epoch": 1.5593834995466909, + "grad_norm": 0.742000890846898, + "learning_rate": 6.954981963852168e-05, + "loss": 0.8995, + "step": 3440 + }, + { + "epoch": 1.559836808703536, + "grad_norm": 0.42546161088504914, + "learning_rate": 6.954128633766508e-05, + "loss": 0.9062, + "step": 3441 + }, + { + "epoch": 1.5602901178603807, + "grad_norm": 0.4421467208248111, + "learning_rate": 6.953275007816182e-05, + "loss": 0.8928, + "step": 3442 + }, + { + "epoch": 1.5607434270172258, + "grad_norm": 0.7852179747152528, + "learning_rate": 6.952421086086686e-05, + "loss": 0.8996, + "step": 3443 + }, + { + "epoch": 1.5611967361740708, + "grad_norm": 1.194824530971985, + "learning_rate": 6.951566868663542e-05, + "loss": 0.9029, + "step": 3444 + }, + { + "epoch": 1.5616500453309157, + "grad_norm": 0.8820077893227413, + "learning_rate": 6.950712355632301e-05, + "loss": 0.9036, + "step": 3445 + }, + { + "epoch": 1.5621033544877605, + "grad_norm": 0.6196913640747889, + "learning_rate": 6.949857547078545e-05, + "loss": 0.9043, + "step": 3446 + }, + { + "epoch": 1.5625566636446058, + "grad_norm": 0.5204754674157664, + "learning_rate": 6.949002443087886e-05, + "loss": 0.8825, + "step": 3447 + }, + { + "epoch": 1.5630099728014506, + "grad_norm": 0.4574161385072954, + "learning_rate": 6.948147043745967e-05, + "loss": 0.8806, + "step": 3448 + }, + { + "epoch": 1.5634632819582954, + "grad_norm": 0.43641054433260623, + "learning_rate": 6.947291349138455e-05, + "loss": 0.8804, + "step": 3449 + }, + { + "epoch": 1.5639165911151405, + "grad_norm": 0.5079248254867851, + "learning_rate": 6.946435359351052e-05, + "loss": 0.8948, + "step": 3450 + }, + { + "epoch": 1.5643699002719855, + "grad_norm": 0.6111032963942783, + "learning_rate": 6.945579074469491e-05, + "loss": 0.8922, + "step": 3451 + }, + { + "epoch": 1.5648232094288304, + "grad_norm": 1.5685819015900078, + "learning_rate": 6.944722494579527e-05, + "loss": 0.9088, + "step": 3452 + }, + { + "epoch": 1.5652765185856754, + "grad_norm": 0.40374814395006287, + "learning_rate": 6.943865619766952e-05, + "loss": 0.9255, + "step": 3453 + }, + { + "epoch": 1.5657298277425205, + "grad_norm": 0.9966534537545266, + "learning_rate": 6.943008450117582e-05, + "loss": 0.9143, + "step": 3454 + }, + { + "epoch": 1.5661831368993653, + "grad_norm": 1.6065238905285104, + "learning_rate": 6.942150985717266e-05, + "loss": 0.9359, + "step": 3455 + }, + { + "epoch": 1.5666364460562103, + "grad_norm": 0.6146411423743895, + "learning_rate": 6.941293226651883e-05, + "loss": 0.9035, + "step": 3456 + }, + { + "epoch": 1.5670897552130554, + "grad_norm": 1.5187342055275161, + "learning_rate": 6.94043517300734e-05, + "loss": 0.9167, + "step": 3457 + }, + { + "epoch": 1.5675430643699002, + "grad_norm": 0.9029724386166295, + "learning_rate": 6.939576824869571e-05, + "loss": 0.9214, + "step": 3458 + }, + { + "epoch": 1.5679963735267453, + "grad_norm": 1.0426373544466878, + "learning_rate": 6.938718182324546e-05, + "loss": 0.9077, + "step": 3459 + }, + { + "epoch": 1.5684496826835903, + "grad_norm": 1.0283102807760638, + "learning_rate": 6.937859245458254e-05, + "loss": 0.9221, + "step": 3460 + }, + { + "epoch": 1.5689029918404351, + "grad_norm": 1.0007145305639593, + "learning_rate": 6.937000014356728e-05, + "loss": 0.9215, + "step": 3461 + }, + { + "epoch": 1.5693563009972802, + "grad_norm": 1.095064127871526, + "learning_rate": 6.936140489106019e-05, + "loss": 0.9089, + "step": 3462 + }, + { + "epoch": 1.5698096101541252, + "grad_norm": 1.0872916499437553, + "learning_rate": 6.935280669792208e-05, + "loss": 0.899, + "step": 3463 + }, + { + "epoch": 1.57026291931097, + "grad_norm": 0.7978618995313694, + "learning_rate": 6.934420556501414e-05, + "loss": 0.9037, + "step": 3464 + }, + { + "epoch": 1.570716228467815, + "grad_norm": 0.6509039044126267, + "learning_rate": 6.933560149319776e-05, + "loss": 0.9139, + "step": 3465 + }, + { + "epoch": 1.5711695376246602, + "grad_norm": 0.6767452105865385, + "learning_rate": 6.932699448333468e-05, + "loss": 0.8923, + "step": 3466 + }, + { + "epoch": 1.571622846781505, + "grad_norm": 0.607939486034077, + "learning_rate": 6.93183845362869e-05, + "loss": 0.9109, + "step": 3467 + }, + { + "epoch": 1.5720761559383498, + "grad_norm": 0.7156869200550134, + "learning_rate": 6.930977165291676e-05, + "loss": 0.9057, + "step": 3468 + }, + { + "epoch": 1.572529465095195, + "grad_norm": 0.7057062992843104, + "learning_rate": 6.930115583408684e-05, + "loss": 0.893, + "step": 3469 + }, + { + "epoch": 1.57298277425204, + "grad_norm": 0.7127999203650709, + "learning_rate": 6.929253708066004e-05, + "loss": 0.8912, + "step": 3470 + }, + { + "epoch": 1.5734360834088847, + "grad_norm": 0.8789778040263142, + "learning_rate": 6.928391539349959e-05, + "loss": 0.9152, + "step": 3471 + }, + { + "epoch": 1.5738893925657298, + "grad_norm": 0.9973433637040727, + "learning_rate": 6.927529077346892e-05, + "loss": 0.9199, + "step": 3472 + }, + { + "epoch": 1.5743427017225748, + "grad_norm": 1.069991067544346, + "learning_rate": 6.926666322143186e-05, + "loss": 0.8899, + "step": 3473 + }, + { + "epoch": 1.5747960108794197, + "grad_norm": 0.8931699689212121, + "learning_rate": 6.925803273825246e-05, + "loss": 0.8987, + "step": 3474 + }, + { + "epoch": 1.5752493200362647, + "grad_norm": 0.7495576945484714, + "learning_rate": 6.924939932479509e-05, + "loss": 0.8895, + "step": 3475 + }, + { + "epoch": 1.5757026291931098, + "grad_norm": 0.627885771388301, + "learning_rate": 6.924076298192442e-05, + "loss": 0.9083, + "step": 3476 + }, + { + "epoch": 1.5761559383499546, + "grad_norm": 0.5414245600153471, + "learning_rate": 6.923212371050539e-05, + "loss": 0.9007, + "step": 3477 + }, + { + "epoch": 1.5766092475067996, + "grad_norm": 0.42650316011778633, + "learning_rate": 6.922348151140327e-05, + "loss": 0.8728, + "step": 3478 + }, + { + "epoch": 1.5770625566636447, + "grad_norm": 0.48922025603627517, + "learning_rate": 6.921483638548358e-05, + "loss": 0.8788, + "step": 3479 + }, + { + "epoch": 1.5775158658204895, + "grad_norm": 0.6618573444939764, + "learning_rate": 6.920618833361218e-05, + "loss": 0.8982, + "step": 3480 + }, + { + "epoch": 1.5779691749773346, + "grad_norm": 0.7672499395943403, + "learning_rate": 6.919753735665517e-05, + "loss": 0.8826, + "step": 3481 + }, + { + "epoch": 1.5784224841341796, + "grad_norm": 0.8680653944125539, + "learning_rate": 6.918888345547898e-05, + "loss": 0.9068, + "step": 3482 + }, + { + "epoch": 1.5788757932910245, + "grad_norm": 0.9483423307200312, + "learning_rate": 6.918022663095035e-05, + "loss": 0.9154, + "step": 3483 + }, + { + "epoch": 1.5793291024478693, + "grad_norm": 1.156731256192679, + "learning_rate": 6.917156688393624e-05, + "loss": 0.8998, + "step": 3484 + }, + { + "epoch": 1.5797824116047146, + "grad_norm": 0.7952260685534042, + "learning_rate": 6.916290421530398e-05, + "loss": 0.8843, + "step": 3485 + }, + { + "epoch": 1.5802357207615594, + "grad_norm": 0.6311029567435766, + "learning_rate": 6.915423862592116e-05, + "loss": 0.8964, + "step": 3486 + }, + { + "epoch": 1.5806890299184042, + "grad_norm": 0.7196815630305526, + "learning_rate": 6.914557011665566e-05, + "loss": 0.8888, + "step": 3487 + }, + { + "epoch": 1.5811423390752495, + "grad_norm": 0.7653997489374949, + "learning_rate": 6.913689868837564e-05, + "loss": 0.9083, + "step": 3488 + }, + { + "epoch": 1.5815956482320943, + "grad_norm": 0.7094430965687532, + "learning_rate": 6.91282243419496e-05, + "loss": 0.9025, + "step": 3489 + }, + { + "epoch": 1.5820489573889391, + "grad_norm": 0.5746846798808669, + "learning_rate": 6.911954707824627e-05, + "loss": 0.8915, + "step": 3490 + }, + { + "epoch": 1.5825022665457842, + "grad_norm": 0.48119994967181157, + "learning_rate": 6.911086689813473e-05, + "loss": 0.8794, + "step": 3491 + }, + { + "epoch": 1.5829555757026292, + "grad_norm": 0.5783284728165339, + "learning_rate": 6.910218380248433e-05, + "loss": 0.9377, + "step": 3492 + }, + { + "epoch": 1.583408884859474, + "grad_norm": 0.7731426429567084, + "learning_rate": 6.909349779216466e-05, + "loss": 0.8959, + "step": 3493 + }, + { + "epoch": 1.5838621940163191, + "grad_norm": 1.0121495279820456, + "learning_rate": 6.908480886804572e-05, + "loss": 0.892, + "step": 3494 + }, + { + "epoch": 1.5843155031731642, + "grad_norm": 1.1830689501194072, + "learning_rate": 6.907611703099767e-05, + "loss": 0.9122, + "step": 3495 + }, + { + "epoch": 1.584768812330009, + "grad_norm": 0.4864961538207769, + "learning_rate": 6.906742228189105e-05, + "loss": 0.8983, + "step": 3496 + }, + { + "epoch": 1.585222121486854, + "grad_norm": 0.3634061184925307, + "learning_rate": 6.905872462159667e-05, + "loss": 0.9106, + "step": 3497 + }, + { + "epoch": 1.585675430643699, + "grad_norm": 0.6408574796435869, + "learning_rate": 6.90500240509856e-05, + "loss": 0.9033, + "step": 3498 + }, + { + "epoch": 1.586128739800544, + "grad_norm": 1.0534346334817264, + "learning_rate": 6.904132057092925e-05, + "loss": 0.8979, + "step": 3499 + }, + { + "epoch": 1.586582048957389, + "grad_norm": 1.2632171406502637, + "learning_rate": 6.90326141822993e-05, + "loss": 0.8998, + "step": 3500 + }, + { + "epoch": 1.587035358114234, + "grad_norm": 0.5941946024455969, + "learning_rate": 6.902390488596772e-05, + "loss": 0.9142, + "step": 3501 + }, + { + "epoch": 1.5874886672710788, + "grad_norm": 0.5120139785163965, + "learning_rate": 6.901519268280674e-05, + "loss": 0.9169, + "step": 3502 + }, + { + "epoch": 1.587941976427924, + "grad_norm": 0.9547296666082348, + "learning_rate": 6.900647757368897e-05, + "loss": 0.8875, + "step": 3503 + }, + { + "epoch": 1.588395285584769, + "grad_norm": 1.4399489599626973, + "learning_rate": 6.89977595594872e-05, + "loss": 0.9045, + "step": 3504 + }, + { + "epoch": 1.5888485947416138, + "grad_norm": 0.4418612797402434, + "learning_rate": 6.898903864107459e-05, + "loss": 0.91, + "step": 3505 + }, + { + "epoch": 1.5893019038984586, + "grad_norm": 1.1302565903994612, + "learning_rate": 6.898031481932457e-05, + "loss": 0.8933, + "step": 3506 + }, + { + "epoch": 1.5897552130553039, + "grad_norm": 1.314445550346696, + "learning_rate": 6.897158809511085e-05, + "loss": 0.9006, + "step": 3507 + }, + { + "epoch": 1.5902085222121487, + "grad_norm": 0.6528430974958581, + "learning_rate": 6.896285846930744e-05, + "loss": 0.904, + "step": 3508 + }, + { + "epoch": 1.5906618313689935, + "grad_norm": 1.0282358447529585, + "learning_rate": 6.895412594278862e-05, + "loss": 0.913, + "step": 3509 + }, + { + "epoch": 1.5911151405258386, + "grad_norm": 1.3116471961910516, + "learning_rate": 6.8945390516429e-05, + "loss": 0.9021, + "step": 3510 + }, + { + "epoch": 1.5915684496826836, + "grad_norm": 0.6634565174511762, + "learning_rate": 6.893665219110346e-05, + "loss": 0.8976, + "step": 3511 + }, + { + "epoch": 1.5920217588395285, + "grad_norm": 1.1665099730274597, + "learning_rate": 6.892791096768713e-05, + "loss": 0.9071, + "step": 3512 + }, + { + "epoch": 1.5924750679963735, + "grad_norm": 0.955441087396011, + "learning_rate": 6.891916684705554e-05, + "loss": 0.8959, + "step": 3513 + }, + { + "epoch": 1.5929283771532186, + "grad_norm": 0.8699826377961576, + "learning_rate": 6.891041983008437e-05, + "loss": 0.9118, + "step": 3514 + }, + { + "epoch": 1.5933816863100634, + "grad_norm": 1.0023910274672492, + "learning_rate": 6.89016699176497e-05, + "loss": 0.9154, + "step": 3515 + }, + { + "epoch": 1.5938349954669084, + "grad_norm": 0.8842493081990018, + "learning_rate": 6.889291711062784e-05, + "loss": 0.8958, + "step": 3516 + }, + { + "epoch": 1.5942883046237535, + "grad_norm": 0.9121814086242158, + "learning_rate": 6.888416140989542e-05, + "loss": 0.89, + "step": 3517 + }, + { + "epoch": 1.5947416137805983, + "grad_norm": 1.0061300605239711, + "learning_rate": 6.887540281632934e-05, + "loss": 0.8772, + "step": 3518 + }, + { + "epoch": 1.5951949229374434, + "grad_norm": 0.8829363064192381, + "learning_rate": 6.886664133080681e-05, + "loss": 0.8934, + "step": 3519 + }, + { + "epoch": 1.5956482320942884, + "grad_norm": 0.7740001831175977, + "learning_rate": 6.88578769542053e-05, + "loss": 0.9136, + "step": 3520 + }, + { + "epoch": 1.5961015412511332, + "grad_norm": 0.6317155146785054, + "learning_rate": 6.884910968740264e-05, + "loss": 0.8864, + "step": 3521 + }, + { + "epoch": 1.5965548504079783, + "grad_norm": 0.5532794390073704, + "learning_rate": 6.884033953127683e-05, + "loss": 0.9064, + "step": 3522 + }, + { + "epoch": 1.5970081595648233, + "grad_norm": 0.5372686427221548, + "learning_rate": 6.883156648670626e-05, + "loss": 0.8926, + "step": 3523 + }, + { + "epoch": 1.5974614687216682, + "grad_norm": 0.5870798032007819, + "learning_rate": 6.882279055456956e-05, + "loss": 0.8883, + "step": 3524 + }, + { + "epoch": 1.597914777878513, + "grad_norm": 0.5981439537058033, + "learning_rate": 6.88140117357457e-05, + "loss": 0.897, + "step": 3525 + }, + { + "epoch": 1.5983680870353583, + "grad_norm": 0.6892704528813236, + "learning_rate": 6.880523003111387e-05, + "loss": 0.9017, + "step": 3526 + }, + { + "epoch": 1.598821396192203, + "grad_norm": 0.7381602467190929, + "learning_rate": 6.87964454415536e-05, + "loss": 0.9088, + "step": 3527 + }, + { + "epoch": 1.599274705349048, + "grad_norm": 0.7306021159213048, + "learning_rate": 6.878765796794467e-05, + "loss": 0.8925, + "step": 3528 + }, + { + "epoch": 1.599728014505893, + "grad_norm": 0.7751719087247492, + "learning_rate": 6.877886761116721e-05, + "loss": 0.8836, + "step": 3529 + }, + { + "epoch": 1.600181323662738, + "grad_norm": 0.8028622201532575, + "learning_rate": 6.877007437210157e-05, + "loss": 0.9233, + "step": 3530 + }, + { + "epoch": 1.6006346328195828, + "grad_norm": 0.8287567926352678, + "learning_rate": 6.876127825162843e-05, + "loss": 0.8952, + "step": 3531 + }, + { + "epoch": 1.601087941976428, + "grad_norm": 0.8324450514479146, + "learning_rate": 6.875247925062875e-05, + "loss": 0.9065, + "step": 3532 + }, + { + "epoch": 1.601541251133273, + "grad_norm": 0.8212973184583815, + "learning_rate": 6.874367736998377e-05, + "loss": 0.9022, + "step": 3533 + }, + { + "epoch": 1.6019945602901178, + "grad_norm": 0.9071916685634169, + "learning_rate": 6.873487261057501e-05, + "loss": 0.9018, + "step": 3534 + }, + { + "epoch": 1.6024478694469628, + "grad_norm": 1.023372185902829, + "learning_rate": 6.872606497328433e-05, + "loss": 0.8669, + "step": 3535 + }, + { + "epoch": 1.6029011786038079, + "grad_norm": 0.8402389101488351, + "learning_rate": 6.87172544589938e-05, + "loss": 0.8916, + "step": 3536 + }, + { + "epoch": 1.6033544877606527, + "grad_norm": 0.6694585152906725, + "learning_rate": 6.870844106858585e-05, + "loss": 0.8977, + "step": 3537 + }, + { + "epoch": 1.6038077969174978, + "grad_norm": 0.47013145062162326, + "learning_rate": 6.869962480294315e-05, + "loss": 0.8916, + "step": 3538 + }, + { + "epoch": 1.6042611060743428, + "grad_norm": 0.4905640745963116, + "learning_rate": 6.869080566294868e-05, + "loss": 0.8982, + "step": 3539 + }, + { + "epoch": 1.6047144152311876, + "grad_norm": 0.6950153420816058, + "learning_rate": 6.86819836494857e-05, + "loss": 0.9318, + "step": 3540 + }, + { + "epoch": 1.6051677243880327, + "grad_norm": 0.38223484725197376, + "learning_rate": 6.867315876343776e-05, + "loss": 0.8975, + "step": 3541 + }, + { + "epoch": 1.6056210335448777, + "grad_norm": 0.46016199562336324, + "learning_rate": 6.866433100568871e-05, + "loss": 0.8929, + "step": 3542 + }, + { + "epoch": 1.6060743427017226, + "grad_norm": 0.48079690486640153, + "learning_rate": 6.865550037712267e-05, + "loss": 0.9183, + "step": 3543 + }, + { + "epoch": 1.6065276518585674, + "grad_norm": 0.4671410304068627, + "learning_rate": 6.864666687862404e-05, + "loss": 0.911, + "step": 3544 + }, + { + "epoch": 1.6069809610154127, + "grad_norm": 0.5243218314357839, + "learning_rate": 6.863783051107753e-05, + "loss": 0.896, + "step": 3545 + }, + { + "epoch": 1.6074342701722575, + "grad_norm": 0.6184754206189658, + "learning_rate": 6.862899127536814e-05, + "loss": 0.91, + "step": 3546 + }, + { + "epoch": 1.6078875793291023, + "grad_norm": 0.7839382450978375, + "learning_rate": 6.862014917238112e-05, + "loss": 0.9126, + "step": 3547 + }, + { + "epoch": 1.6083408884859474, + "grad_norm": 0.7956009851337998, + "learning_rate": 6.861130420300205e-05, + "loss": 0.8831, + "step": 3548 + }, + { + "epoch": 1.6087941976427924, + "grad_norm": 0.8970589514474093, + "learning_rate": 6.860245636811679e-05, + "loss": 0.9037, + "step": 3549 + }, + { + "epoch": 1.6092475067996372, + "grad_norm": 1.0722938229124273, + "learning_rate": 6.859360566861145e-05, + "loss": 0.9074, + "step": 3550 + }, + { + "epoch": 1.6097008159564823, + "grad_norm": 0.9411551064417908, + "learning_rate": 6.858475210537248e-05, + "loss": 0.8965, + "step": 3551 + }, + { + "epoch": 1.6101541251133273, + "grad_norm": 0.786067315598286, + "learning_rate": 6.857589567928657e-05, + "loss": 0.8784, + "step": 3552 + }, + { + "epoch": 1.6106074342701722, + "grad_norm": 0.6288063798432751, + "learning_rate": 6.856703639124072e-05, + "loss": 0.8685, + "step": 3553 + }, + { + "epoch": 1.6110607434270172, + "grad_norm": 0.4789087145681329, + "learning_rate": 6.855817424212224e-05, + "loss": 0.8954, + "step": 3554 + }, + { + "epoch": 1.6115140525838623, + "grad_norm": 0.5064985476311177, + "learning_rate": 6.854930923281866e-05, + "loss": 0.9092, + "step": 3555 + }, + { + "epoch": 1.611967361740707, + "grad_norm": 0.5195751664602414, + "learning_rate": 6.854044136421784e-05, + "loss": 0.9087, + "step": 3556 + }, + { + "epoch": 1.6124206708975521, + "grad_norm": 0.571452101387126, + "learning_rate": 6.853157063720796e-05, + "loss": 0.8919, + "step": 3557 + }, + { + "epoch": 1.6128739800543972, + "grad_norm": 0.5507847608018447, + "learning_rate": 6.852269705267743e-05, + "loss": 0.9199, + "step": 3558 + }, + { + "epoch": 1.613327289211242, + "grad_norm": 0.6065794885807064, + "learning_rate": 6.851382061151496e-05, + "loss": 0.91, + "step": 3559 + }, + { + "epoch": 1.613780598368087, + "grad_norm": 0.7928312160475531, + "learning_rate": 6.850494131460955e-05, + "loss": 0.8954, + "step": 3560 + }, + { + "epoch": 1.6142339075249321, + "grad_norm": 0.9674501781219166, + "learning_rate": 6.84960591628505e-05, + "loss": 0.9014, + "step": 3561 + }, + { + "epoch": 1.614687216681777, + "grad_norm": 1.1199436164614807, + "learning_rate": 6.848717415712737e-05, + "loss": 0.8925, + "step": 3562 + }, + { + "epoch": 1.6151405258386218, + "grad_norm": 0.8156634694063205, + "learning_rate": 6.847828629833002e-05, + "loss": 0.8845, + "step": 3563 + }, + { + "epoch": 1.615593834995467, + "grad_norm": 0.6375156251789444, + "learning_rate": 6.846939558734862e-05, + "loss": 0.9097, + "step": 3564 + }, + { + "epoch": 1.6160471441523119, + "grad_norm": 0.5887163158184114, + "learning_rate": 6.846050202507358e-05, + "loss": 0.9087, + "step": 3565 + }, + { + "epoch": 1.6165004533091567, + "grad_norm": 0.6279702972559602, + "learning_rate": 6.84516056123956e-05, + "loss": 0.9022, + "step": 3566 + }, + { + "epoch": 1.616953762466002, + "grad_norm": 0.6816655770533142, + "learning_rate": 6.844270635020571e-05, + "loss": 0.8851, + "step": 3567 + }, + { + "epoch": 1.6174070716228468, + "grad_norm": 0.6703459016882779, + "learning_rate": 6.843380423939519e-05, + "loss": 0.9001, + "step": 3568 + }, + { + "epoch": 1.6178603807796916, + "grad_norm": 0.7050315358211701, + "learning_rate": 6.84248992808556e-05, + "loss": 0.8983, + "step": 3569 + }, + { + "epoch": 1.6183136899365367, + "grad_norm": 0.7206324005292364, + "learning_rate": 6.841599147547881e-05, + "loss": 0.9067, + "step": 3570 + }, + { + "epoch": 1.6187669990933817, + "grad_norm": 0.7907283833518338, + "learning_rate": 6.840708082415698e-05, + "loss": 0.8964, + "step": 3571 + }, + { + "epoch": 1.6192203082502266, + "grad_norm": 0.8946711602852425, + "learning_rate": 6.839816732778251e-05, + "loss": 0.9122, + "step": 3572 + }, + { + "epoch": 1.6196736174070716, + "grad_norm": 1.026824527683341, + "learning_rate": 6.83892509872481e-05, + "loss": 0.9053, + "step": 3573 + }, + { + "epoch": 1.6201269265639167, + "grad_norm": 1.087309805929588, + "learning_rate": 6.83803318034468e-05, + "loss": 0.8969, + "step": 3574 + }, + { + "epoch": 1.6205802357207615, + "grad_norm": 0.8134494673736399, + "learning_rate": 6.837140977727183e-05, + "loss": 0.8907, + "step": 3575 + }, + { + "epoch": 1.6210335448776065, + "grad_norm": 0.6260176329622036, + "learning_rate": 6.836248490961681e-05, + "loss": 0.89, + "step": 3576 + }, + { + "epoch": 1.6214868540344516, + "grad_norm": 0.5996344648119839, + "learning_rate": 6.835355720137556e-05, + "loss": 0.9074, + "step": 3577 + }, + { + "epoch": 1.6219401631912964, + "grad_norm": 0.7918760041083078, + "learning_rate": 6.834462665344224e-05, + "loss": 0.9141, + "step": 3578 + }, + { + "epoch": 1.6223934723481415, + "grad_norm": 0.8645609126420297, + "learning_rate": 6.833569326671125e-05, + "loss": 0.9037, + "step": 3579 + }, + { + "epoch": 1.6228467815049865, + "grad_norm": 0.772092561099653, + "learning_rate": 6.83267570420773e-05, + "loss": 0.9234, + "step": 3580 + }, + { + "epoch": 1.6233000906618313, + "grad_norm": 0.7439394560451666, + "learning_rate": 6.831781798043538e-05, + "loss": 0.8805, + "step": 3581 + }, + { + "epoch": 1.6237533998186764, + "grad_norm": 0.872882747883814, + "learning_rate": 6.830887608268078e-05, + "loss": 0.9161, + "step": 3582 + }, + { + "epoch": 1.6242067089755214, + "grad_norm": 1.103219780112794, + "learning_rate": 6.829993134970903e-05, + "loss": 0.901, + "step": 3583 + }, + { + "epoch": 1.6246600181323663, + "grad_norm": 0.9519555964713374, + "learning_rate": 6.829098378241598e-05, + "loss": 0.902, + "step": 3584 + }, + { + "epoch": 1.625113327289211, + "grad_norm": 0.8057963417710626, + "learning_rate": 6.828203338169775e-05, + "loss": 0.8987, + "step": 3585 + }, + { + "epoch": 1.6255666364460564, + "grad_norm": 0.6976793922840698, + "learning_rate": 6.827308014845078e-05, + "loss": 0.9068, + "step": 3586 + }, + { + "epoch": 1.6260199456029012, + "grad_norm": 0.6189335870160746, + "learning_rate": 6.826412408357174e-05, + "loss": 0.8892, + "step": 3587 + }, + { + "epoch": 1.626473254759746, + "grad_norm": 0.57753747397956, + "learning_rate": 6.82551651879576e-05, + "loss": 0.8804, + "step": 3588 + }, + { + "epoch": 1.626926563916591, + "grad_norm": 0.4977009010773931, + "learning_rate": 6.82462034625056e-05, + "loss": 0.8915, + "step": 3589 + }, + { + "epoch": 1.6273798730734361, + "grad_norm": 0.5367653360098366, + "learning_rate": 6.823723890811334e-05, + "loss": 0.9062, + "step": 3590 + }, + { + "epoch": 1.627833182230281, + "grad_norm": 0.6208195288993608, + "learning_rate": 6.822827152567861e-05, + "loss": 0.9077, + "step": 3591 + }, + { + "epoch": 1.628286491387126, + "grad_norm": 0.637917027011857, + "learning_rate": 6.821930131609952e-05, + "loss": 0.8942, + "step": 3592 + }, + { + "epoch": 1.628739800543971, + "grad_norm": 0.6835031078574401, + "learning_rate": 6.821032828027446e-05, + "loss": 0.9054, + "step": 3593 + }, + { + "epoch": 1.6291931097008159, + "grad_norm": 0.8221001229037813, + "learning_rate": 6.820135241910214e-05, + "loss": 0.8876, + "step": 3594 + }, + { + "epoch": 1.629646418857661, + "grad_norm": 1.0058274100037405, + "learning_rate": 6.819237373348147e-05, + "loss": 0.9005, + "step": 3595 + }, + { + "epoch": 1.630099728014506, + "grad_norm": 1.1072104219587726, + "learning_rate": 6.818339222431173e-05, + "loss": 0.922, + "step": 3596 + }, + { + "epoch": 1.6305530371713508, + "grad_norm": 0.8095851153057454, + "learning_rate": 6.817440789249242e-05, + "loss": 0.8844, + "step": 3597 + }, + { + "epoch": 1.6310063463281959, + "grad_norm": 0.5972652184055249, + "learning_rate": 6.816542073892335e-05, + "loss": 0.9032, + "step": 3598 + }, + { + "epoch": 1.631459655485041, + "grad_norm": 0.4597924359755618, + "learning_rate": 6.815643076450464e-05, + "loss": 0.9068, + "step": 3599 + }, + { + "epoch": 1.6319129646418857, + "grad_norm": 0.354782797471971, + "learning_rate": 6.814743797013661e-05, + "loss": 0.8955, + "step": 3600 + }, + { + "epoch": 1.6323662737987308, + "grad_norm": 0.4419524471799645, + "learning_rate": 6.813844235671996e-05, + "loss": 0.8958, + "step": 3601 + }, + { + "epoch": 1.6328195829555758, + "grad_norm": 0.6275067149799717, + "learning_rate": 6.81294439251556e-05, + "loss": 0.8846, + "step": 3602 + }, + { + "epoch": 1.6332728921124207, + "grad_norm": 0.7460816002068694, + "learning_rate": 6.812044267634478e-05, + "loss": 0.8925, + "step": 3603 + }, + { + "epoch": 1.6337262012692655, + "grad_norm": 0.8585609070663845, + "learning_rate": 6.811143861118897e-05, + "loss": 0.911, + "step": 3604 + }, + { + "epoch": 1.6341795104261108, + "grad_norm": 0.968042733196462, + "learning_rate": 6.810243173058996e-05, + "loss": 0.9064, + "step": 3605 + }, + { + "epoch": 1.6346328195829556, + "grad_norm": 1.1540774982159099, + "learning_rate": 6.809342203544983e-05, + "loss": 0.9083, + "step": 3606 + }, + { + "epoch": 1.6350861287398004, + "grad_norm": 0.7536147595911316, + "learning_rate": 6.808440952667091e-05, + "loss": 0.8901, + "step": 3607 + }, + { + "epoch": 1.6355394378966455, + "grad_norm": 0.5006995481147188, + "learning_rate": 6.807539420515584e-05, + "loss": 0.9093, + "step": 3608 + }, + { + "epoch": 1.6359927470534905, + "grad_norm": 0.3862990054187506, + "learning_rate": 6.806637607180753e-05, + "loss": 0.8893, + "step": 3609 + }, + { + "epoch": 1.6364460562103353, + "grad_norm": 0.4610610609597157, + "learning_rate": 6.805735512752917e-05, + "loss": 0.896, + "step": 3610 + }, + { + "epoch": 1.6368993653671804, + "grad_norm": 0.5494738055477415, + "learning_rate": 6.804833137322423e-05, + "loss": 0.9134, + "step": 3611 + }, + { + "epoch": 1.6373526745240254, + "grad_norm": 0.6603965489130472, + "learning_rate": 6.803930480979647e-05, + "loss": 0.8774, + "step": 3612 + }, + { + "epoch": 1.6378059836808703, + "grad_norm": 0.8174145941668239, + "learning_rate": 6.803027543814993e-05, + "loss": 0.9178, + "step": 3613 + }, + { + "epoch": 1.6382592928377153, + "grad_norm": 0.9111377701935647, + "learning_rate": 6.802124325918893e-05, + "loss": 0.8862, + "step": 3614 + }, + { + "epoch": 1.6387126019945604, + "grad_norm": 1.0307734903419772, + "learning_rate": 6.801220827381807e-05, + "loss": 0.9096, + "step": 3615 + }, + { + "epoch": 1.6391659111514052, + "grad_norm": 1.0152000356689281, + "learning_rate": 6.800317048294221e-05, + "loss": 0.9017, + "step": 3616 + }, + { + "epoch": 1.6396192203082502, + "grad_norm": 0.9345178904602077, + "learning_rate": 6.799412988746653e-05, + "loss": 0.892, + "step": 3617 + }, + { + "epoch": 1.6400725294650953, + "grad_norm": 0.8503740206474986, + "learning_rate": 6.798508648829649e-05, + "loss": 0.9054, + "step": 3618 + }, + { + "epoch": 1.6405258386219401, + "grad_norm": 0.8173227326952438, + "learning_rate": 6.797604028633777e-05, + "loss": 0.8846, + "step": 3619 + }, + { + "epoch": 1.6409791477787852, + "grad_norm": 0.78506646673894, + "learning_rate": 6.796699128249639e-05, + "loss": 0.8843, + "step": 3620 + }, + { + "epoch": 1.6414324569356302, + "grad_norm": 0.7206491050075412, + "learning_rate": 6.795793947767865e-05, + "loss": 0.8951, + "step": 3621 + }, + { + "epoch": 1.641885766092475, + "grad_norm": 0.6466286842039385, + "learning_rate": 6.794888487279111e-05, + "loss": 0.8966, + "step": 3622 + }, + { + "epoch": 1.6423390752493199, + "grad_norm": 0.5722235062625487, + "learning_rate": 6.79398274687406e-05, + "loss": 0.894, + "step": 3623 + }, + { + "epoch": 1.6427923844061652, + "grad_norm": 0.48202875307759907, + "learning_rate": 6.793076726643426e-05, + "loss": 0.8736, + "step": 3624 + }, + { + "epoch": 1.64324569356301, + "grad_norm": 0.45770681124832197, + "learning_rate": 6.792170426677949e-05, + "loss": 0.8841, + "step": 3625 + }, + { + "epoch": 1.6436990027198548, + "grad_norm": 0.45061221530577905, + "learning_rate": 6.791263847068397e-05, + "loss": 0.8914, + "step": 3626 + }, + { + "epoch": 1.6441523118766999, + "grad_norm": 0.40975052532294676, + "learning_rate": 6.790356987905568e-05, + "loss": 0.8951, + "step": 3627 + }, + { + "epoch": 1.644605621033545, + "grad_norm": 0.40729797859661043, + "learning_rate": 6.789449849280286e-05, + "loss": 0.8854, + "step": 3628 + }, + { + "epoch": 1.6450589301903897, + "grad_norm": 0.4760270211102829, + "learning_rate": 6.788542431283403e-05, + "loss": 0.9038, + "step": 3629 + }, + { + "epoch": 1.6455122393472348, + "grad_norm": 0.5525679856005661, + "learning_rate": 6.787634734005801e-05, + "loss": 0.9095, + "step": 3630 + }, + { + "epoch": 1.6459655485040798, + "grad_norm": 0.6458016338095786, + "learning_rate": 6.786726757538387e-05, + "loss": 0.8895, + "step": 3631 + }, + { + "epoch": 1.6464188576609247, + "grad_norm": 0.7300418732779198, + "learning_rate": 6.785818501972099e-05, + "loss": 0.9084, + "step": 3632 + }, + { + "epoch": 1.6468721668177697, + "grad_norm": 0.7897662751923281, + "learning_rate": 6.7849099673979e-05, + "loss": 0.9093, + "step": 3633 + }, + { + "epoch": 1.6473254759746148, + "grad_norm": 0.9235213687287538, + "learning_rate": 6.784001153906783e-05, + "loss": 0.8728, + "step": 3634 + }, + { + "epoch": 1.6477787851314596, + "grad_norm": 1.040689805636278, + "learning_rate": 6.783092061589769e-05, + "loss": 0.9007, + "step": 3635 + }, + { + "epoch": 1.6482320942883046, + "grad_norm": 1.0169434785416702, + "learning_rate": 6.782182690537905e-05, + "loss": 0.8947, + "step": 3636 + }, + { + "epoch": 1.6486854034451497, + "grad_norm": 0.8831521880702005, + "learning_rate": 6.781273040842269e-05, + "loss": 0.8976, + "step": 3637 + }, + { + "epoch": 1.6491387126019945, + "grad_norm": 0.7643503472295496, + "learning_rate": 6.780363112593962e-05, + "loss": 0.9015, + "step": 3638 + }, + { + "epoch": 1.6495920217588396, + "grad_norm": 0.6081668797309726, + "learning_rate": 6.779452905884119e-05, + "loss": 0.8964, + "step": 3639 + }, + { + "epoch": 1.6500453309156846, + "grad_norm": 0.4635981485410173, + "learning_rate": 6.778542420803899e-05, + "loss": 0.8932, + "step": 3640 + }, + { + "epoch": 1.6504986400725294, + "grad_norm": 0.43823324239202754, + "learning_rate": 6.77763165744449e-05, + "loss": 0.9052, + "step": 3641 + }, + { + "epoch": 1.6509519492293743, + "grad_norm": 0.5583505297955579, + "learning_rate": 6.776720615897107e-05, + "loss": 0.8981, + "step": 3642 + }, + { + "epoch": 1.6514052583862195, + "grad_norm": 0.7704029272789742, + "learning_rate": 6.775809296252994e-05, + "loss": 0.8918, + "step": 3643 + }, + { + "epoch": 1.6518585675430644, + "grad_norm": 0.996572135511601, + "learning_rate": 6.77489769860342e-05, + "loss": 0.9142, + "step": 3644 + }, + { + "epoch": 1.6523118766999092, + "grad_norm": 1.1759882977162128, + "learning_rate": 6.773985823039689e-05, + "loss": 0.8821, + "step": 3645 + }, + { + "epoch": 1.6527651858567545, + "grad_norm": 0.7111418591841604, + "learning_rate": 6.773073669653123e-05, + "loss": 0.8918, + "step": 3646 + }, + { + "epoch": 1.6532184950135993, + "grad_norm": 0.5069146978260031, + "learning_rate": 6.772161238535079e-05, + "loss": 0.9174, + "step": 3647 + }, + { + "epoch": 1.6536718041704441, + "grad_norm": 0.5277908597859619, + "learning_rate": 6.771248529776941e-05, + "loss": 0.897, + "step": 3648 + }, + { + "epoch": 1.6541251133272892, + "grad_norm": 0.5348059146603726, + "learning_rate": 6.770335543470119e-05, + "loss": 0.8985, + "step": 3649 + }, + { + "epoch": 1.6545784224841342, + "grad_norm": 0.5616280633571261, + "learning_rate": 6.769422279706048e-05, + "loss": 0.9051, + "step": 3650 + }, + { + "epoch": 1.655031731640979, + "grad_norm": 0.639606168592396, + "learning_rate": 6.768508738576198e-05, + "loss": 0.8811, + "step": 3651 + }, + { + "epoch": 1.655485040797824, + "grad_norm": 0.8508344692642427, + "learning_rate": 6.76759492017206e-05, + "loss": 0.8886, + "step": 3652 + }, + { + "epoch": 1.6559383499546692, + "grad_norm": 1.109872429133024, + "learning_rate": 6.76668082458516e-05, + "loss": 0.9105, + "step": 3653 + }, + { + "epoch": 1.656391659111514, + "grad_norm": 0.9279523516546986, + "learning_rate": 6.765766451907042e-05, + "loss": 0.8766, + "step": 3654 + }, + { + "epoch": 1.656844968268359, + "grad_norm": 0.716245588675174, + "learning_rate": 6.764851802229284e-05, + "loss": 0.8944, + "step": 3655 + }, + { + "epoch": 1.657298277425204, + "grad_norm": 0.5979038407975756, + "learning_rate": 6.763936875643495e-05, + "loss": 0.9021, + "step": 3656 + }, + { + "epoch": 1.657751586582049, + "grad_norm": 0.5126493550522623, + "learning_rate": 6.763021672241305e-05, + "loss": 0.9124, + "step": 3657 + }, + { + "epoch": 1.658204895738894, + "grad_norm": 0.4817223539229461, + "learning_rate": 6.762106192114372e-05, + "loss": 0.8955, + "step": 3658 + }, + { + "epoch": 1.658658204895739, + "grad_norm": 0.553614947200673, + "learning_rate": 6.761190435354387e-05, + "loss": 0.8946, + "step": 3659 + }, + { + "epoch": 1.6591115140525838, + "grad_norm": 0.6142691680743441, + "learning_rate": 6.760274402053064e-05, + "loss": 0.9153, + "step": 3660 + }, + { + "epoch": 1.6595648232094289, + "grad_norm": 0.7417867760255848, + "learning_rate": 6.75935809230215e-05, + "loss": 0.9012, + "step": 3661 + }, + { + "epoch": 1.660018132366274, + "grad_norm": 0.9284362010428335, + "learning_rate": 6.758441506193412e-05, + "loss": 0.8652, + "step": 3662 + }, + { + "epoch": 1.6604714415231188, + "grad_norm": 1.1877643344920266, + "learning_rate": 6.75752464381865e-05, + "loss": 0.9111, + "step": 3663 + }, + { + "epoch": 1.6609247506799636, + "grad_norm": 0.7336975485877402, + "learning_rate": 6.756607505269691e-05, + "loss": 0.89, + "step": 3664 + }, + { + "epoch": 1.6613780598368089, + "grad_norm": 0.49857394156775864, + "learning_rate": 6.755690090638388e-05, + "loss": 0.9013, + "step": 3665 + }, + { + "epoch": 1.6618313689936537, + "grad_norm": 0.6023559825660622, + "learning_rate": 6.754772400016624e-05, + "loss": 0.8933, + "step": 3666 + }, + { + "epoch": 1.6622846781504985, + "grad_norm": 0.8634041229904621, + "learning_rate": 6.753854433496308e-05, + "loss": 0.8761, + "step": 3667 + }, + { + "epoch": 1.6627379873073436, + "grad_norm": 1.0162664068129867, + "learning_rate": 6.752936191169378e-05, + "loss": 0.8923, + "step": 3668 + }, + { + "epoch": 1.6631912964641886, + "grad_norm": 0.9127584389342842, + "learning_rate": 6.752017673127797e-05, + "loss": 0.9115, + "step": 3669 + }, + { + "epoch": 1.6636446056210334, + "grad_norm": 0.7303160201974402, + "learning_rate": 6.751098879463558e-05, + "loss": 0.8909, + "step": 3670 + }, + { + "epoch": 1.6640979147778785, + "grad_norm": 0.7249444205919668, + "learning_rate": 6.75017981026868e-05, + "loss": 0.9181, + "step": 3671 + }, + { + "epoch": 1.6645512239347235, + "grad_norm": 0.8057239256199427, + "learning_rate": 6.749260465635214e-05, + "loss": 0.8982, + "step": 3672 + }, + { + "epoch": 1.6650045330915684, + "grad_norm": 0.8449071467964603, + "learning_rate": 6.74834084565523e-05, + "loss": 0.8981, + "step": 3673 + }, + { + "epoch": 1.6654578422484134, + "grad_norm": 0.9051618252765106, + "learning_rate": 6.747420950420835e-05, + "loss": 0.8975, + "step": 3674 + }, + { + "epoch": 1.6659111514052585, + "grad_norm": 0.9487371173374164, + "learning_rate": 6.746500780024155e-05, + "loss": 0.8858, + "step": 3675 + }, + { + "epoch": 1.6663644605621033, + "grad_norm": 0.96498300909604, + "learning_rate": 6.745580334557352e-05, + "loss": 0.8719, + "step": 3676 + }, + { + "epoch": 1.6668177697189483, + "grad_norm": 0.8586068245256748, + "learning_rate": 6.744659614112608e-05, + "loss": 0.8952, + "step": 3677 + }, + { + "epoch": 1.6672710788757934, + "grad_norm": 0.8288546713736509, + "learning_rate": 6.743738618782137e-05, + "loss": 0.8907, + "step": 3678 + }, + { + "epoch": 1.6677243880326382, + "grad_norm": 0.7681528985546247, + "learning_rate": 6.742817348658181e-05, + "loss": 0.8754, + "step": 3679 + }, + { + "epoch": 1.6681776971894833, + "grad_norm": 0.85301562971831, + "learning_rate": 6.741895803833006e-05, + "loss": 0.8929, + "step": 3680 + }, + { + "epoch": 1.6686310063463283, + "grad_norm": 0.9428558596324974, + "learning_rate": 6.740973984398908e-05, + "loss": 0.8972, + "step": 3681 + }, + { + "epoch": 1.6690843155031732, + "grad_norm": 0.8594843763822574, + "learning_rate": 6.74005189044821e-05, + "loss": 0.9071, + "step": 3682 + }, + { + "epoch": 1.669537624660018, + "grad_norm": 0.6767798687296883, + "learning_rate": 6.739129522073262e-05, + "loss": 0.8949, + "step": 3683 + }, + { + "epoch": 1.6699909338168633, + "grad_norm": 0.5425994445064896, + "learning_rate": 6.738206879366442e-05, + "loss": 0.889, + "step": 3684 + }, + { + "epoch": 1.670444242973708, + "grad_norm": 0.5217898762513699, + "learning_rate": 6.737283962420156e-05, + "loss": 0.8958, + "step": 3685 + }, + { + "epoch": 1.670897552130553, + "grad_norm": 0.5351402152482628, + "learning_rate": 6.736360771326836e-05, + "loss": 0.8767, + "step": 3686 + }, + { + "epoch": 1.671350861287398, + "grad_norm": 0.4864511611165778, + "learning_rate": 6.735437306178943e-05, + "loss": 0.8989, + "step": 3687 + }, + { + "epoch": 1.671804170444243, + "grad_norm": 0.49233165087484243, + "learning_rate": 6.734513567068965e-05, + "loss": 0.8995, + "step": 3688 + }, + { + "epoch": 1.6722574796010878, + "grad_norm": 0.49099332595607836, + "learning_rate": 6.733589554089416e-05, + "loss": 0.9169, + "step": 3689 + }, + { + "epoch": 1.6727107887579329, + "grad_norm": 0.4334157734250715, + "learning_rate": 6.73266526733284e-05, + "loss": 0.9024, + "step": 3690 + }, + { + "epoch": 1.673164097914778, + "grad_norm": 0.5020563598375011, + "learning_rate": 6.731740706891806e-05, + "loss": 0.9058, + "step": 3691 + }, + { + "epoch": 1.6736174070716228, + "grad_norm": 0.6099600697250663, + "learning_rate": 6.730815872858912e-05, + "loss": 0.9061, + "step": 3692 + }, + { + "epoch": 1.6740707162284678, + "grad_norm": 0.7551759479782721, + "learning_rate": 6.729890765326782e-05, + "loss": 0.893, + "step": 3693 + }, + { + "epoch": 1.6745240253853129, + "grad_norm": 0.9432921798131219, + "learning_rate": 6.728965384388069e-05, + "loss": 0.8982, + "step": 3694 + }, + { + "epoch": 1.6749773345421577, + "grad_norm": 1.0980751787921423, + "learning_rate": 6.728039730135451e-05, + "loss": 0.8882, + "step": 3695 + }, + { + "epoch": 1.6754306436990027, + "grad_norm": 0.657972417673878, + "learning_rate": 6.72711380266164e-05, + "loss": 0.9084, + "step": 3696 + }, + { + "epoch": 1.6758839528558478, + "grad_norm": 0.4973651218380909, + "learning_rate": 6.726187602059364e-05, + "loss": 0.9147, + "step": 3697 + }, + { + "epoch": 1.6763372620126926, + "grad_norm": 1.4996818121112656, + "learning_rate": 6.725261128421389e-05, + "loss": 0.9363, + "step": 3698 + }, + { + "epoch": 1.6767905711695377, + "grad_norm": 0.4195629783117409, + "learning_rate": 6.724334381840501e-05, + "loss": 0.9018, + "step": 3699 + }, + { + "epoch": 1.6772438803263827, + "grad_norm": 0.7428413330413355, + "learning_rate": 6.72340736240952e-05, + "loss": 0.9029, + "step": 3700 + }, + { + "epoch": 1.6776971894832275, + "grad_norm": 1.0121418624966192, + "learning_rate": 6.722480070221287e-05, + "loss": 0.9312, + "step": 3701 + }, + { + "epoch": 1.6781504986400724, + "grad_norm": 1.0436729931387536, + "learning_rate": 6.721552505368673e-05, + "loss": 0.9252, + "step": 3702 + }, + { + "epoch": 1.6786038077969176, + "grad_norm": 1.0192355050113002, + "learning_rate": 6.720624667944577e-05, + "loss": 0.898, + "step": 3703 + }, + { + "epoch": 1.6790571169537625, + "grad_norm": 0.8969341625452598, + "learning_rate": 6.719696558041926e-05, + "loss": 0.9009, + "step": 3704 + }, + { + "epoch": 1.6795104261106073, + "grad_norm": 0.9116676032642704, + "learning_rate": 6.71876817575367e-05, + "loss": 0.9215, + "step": 3705 + }, + { + "epoch": 1.6799637352674524, + "grad_norm": 0.9597856994463121, + "learning_rate": 6.717839521172793e-05, + "loss": 0.895, + "step": 3706 + }, + { + "epoch": 1.6804170444242974, + "grad_norm": 1.0134529810888506, + "learning_rate": 6.7169105943923e-05, + "loss": 0.8933, + "step": 3707 + }, + { + "epoch": 1.6808703535811422, + "grad_norm": 0.8643560408003673, + "learning_rate": 6.715981395505224e-05, + "loss": 0.912, + "step": 3708 + }, + { + "epoch": 1.6813236627379873, + "grad_norm": 0.6983014904557351, + "learning_rate": 6.715051924604631e-05, + "loss": 0.9015, + "step": 3709 + }, + { + "epoch": 1.6817769718948323, + "grad_norm": 0.5949719716661933, + "learning_rate": 6.714122181783609e-05, + "loss": 0.8969, + "step": 3710 + }, + { + "epoch": 1.6822302810516772, + "grad_norm": 0.550660007948101, + "learning_rate": 6.713192167135271e-05, + "loss": 0.8784, + "step": 3711 + }, + { + "epoch": 1.6826835902085222, + "grad_norm": 0.5705232666567623, + "learning_rate": 6.712261880752765e-05, + "loss": 0.885, + "step": 3712 + }, + { + "epoch": 1.6831368993653673, + "grad_norm": 0.4739114508269938, + "learning_rate": 6.71133132272926e-05, + "loss": 0.8986, + "step": 3713 + }, + { + "epoch": 1.683590208522212, + "grad_norm": 0.33738919297694825, + "learning_rate": 6.710400493157956e-05, + "loss": 0.9058, + "step": 3714 + }, + { + "epoch": 1.6840435176790571, + "grad_norm": 0.37003132509339104, + "learning_rate": 6.709469392132076e-05, + "loss": 0.9218, + "step": 3715 + }, + { + "epoch": 1.6844968268359022, + "grad_norm": 0.43639486197780836, + "learning_rate": 6.708538019744873e-05, + "loss": 0.8943, + "step": 3716 + }, + { + "epoch": 1.684950135992747, + "grad_norm": 0.49065373955235875, + "learning_rate": 6.707606376089628e-05, + "loss": 0.8904, + "step": 3717 + }, + { + "epoch": 1.685403445149592, + "grad_norm": 0.48639268746078657, + "learning_rate": 6.706674461259647e-05, + "loss": 0.8884, + "step": 3718 + }, + { + "epoch": 1.685856754306437, + "grad_norm": 0.4487821005140561, + "learning_rate": 6.705742275348263e-05, + "loss": 0.8974, + "step": 3719 + }, + { + "epoch": 1.686310063463282, + "grad_norm": 0.42988843077441463, + "learning_rate": 6.704809818448838e-05, + "loss": 0.917, + "step": 3720 + }, + { + "epoch": 1.6867633726201268, + "grad_norm": 0.4445674090699786, + "learning_rate": 6.703877090654761e-05, + "loss": 0.9045, + "step": 3721 + }, + { + "epoch": 1.687216681776972, + "grad_norm": 0.48005739279514725, + "learning_rate": 6.702944092059447e-05, + "loss": 0.9056, + "step": 3722 + }, + { + "epoch": 1.6876699909338169, + "grad_norm": 0.5472104317864968, + "learning_rate": 6.702010822756339e-05, + "loss": 0.8906, + "step": 3723 + }, + { + "epoch": 1.6881233000906617, + "grad_norm": 0.6867702321684315, + "learning_rate": 6.701077282838905e-05, + "loss": 0.9036, + "step": 3724 + }, + { + "epoch": 1.688576609247507, + "grad_norm": 0.7597059566191885, + "learning_rate": 6.700143472400643e-05, + "loss": 0.9029, + "step": 3725 + }, + { + "epoch": 1.6890299184043518, + "grad_norm": 0.8504040607145836, + "learning_rate": 6.699209391535077e-05, + "loss": 0.9032, + "step": 3726 + }, + { + "epoch": 1.6894832275611966, + "grad_norm": 1.0485991739704106, + "learning_rate": 6.698275040335757e-05, + "loss": 0.8892, + "step": 3727 + }, + { + "epoch": 1.6899365367180417, + "grad_norm": 1.0861026237837617, + "learning_rate": 6.697340418896261e-05, + "loss": 0.8833, + "step": 3728 + }, + { + "epoch": 1.6903898458748867, + "grad_norm": 0.7860561153205056, + "learning_rate": 6.696405527310196e-05, + "loss": 0.9055, + "step": 3729 + }, + { + "epoch": 1.6908431550317315, + "grad_norm": 0.6616057298828738, + "learning_rate": 6.695470365671193e-05, + "loss": 0.8915, + "step": 3730 + }, + { + "epoch": 1.6912964641885766, + "grad_norm": 0.5803272888759361, + "learning_rate": 6.69453493407291e-05, + "loss": 0.9164, + "step": 3731 + }, + { + "epoch": 1.6917497733454216, + "grad_norm": 0.6309582024625459, + "learning_rate": 6.693599232609035e-05, + "loss": 0.9117, + "step": 3732 + }, + { + "epoch": 1.6922030825022665, + "grad_norm": 0.7504843073826812, + "learning_rate": 6.69266326137328e-05, + "loss": 0.8957, + "step": 3733 + }, + { + "epoch": 1.6926563916591115, + "grad_norm": 0.9046869842622001, + "learning_rate": 6.691727020459386e-05, + "loss": 0.879, + "step": 3734 + }, + { + "epoch": 1.6931097008159566, + "grad_norm": 1.0636558867607164, + "learning_rate": 6.69079050996112e-05, + "loss": 0.8748, + "step": 3735 + }, + { + "epoch": 1.6935630099728014, + "grad_norm": 0.8830307993916582, + "learning_rate": 6.689853729972276e-05, + "loss": 0.8873, + "step": 3736 + }, + { + "epoch": 1.6940163191296465, + "grad_norm": 0.6500230431042283, + "learning_rate": 6.688916680586676e-05, + "loss": 0.8913, + "step": 3737 + }, + { + "epoch": 1.6944696282864915, + "grad_norm": 0.4234680479567685, + "learning_rate": 6.687979361898167e-05, + "loss": 0.9043, + "step": 3738 + }, + { + "epoch": 1.6949229374433363, + "grad_norm": 0.5272422074278011, + "learning_rate": 6.687041774000627e-05, + "loss": 0.8944, + "step": 3739 + }, + { + "epoch": 1.6953762466001812, + "grad_norm": 0.6133772892180377, + "learning_rate": 6.686103916987956e-05, + "loss": 0.8907, + "step": 3740 + }, + { + "epoch": 1.6958295557570264, + "grad_norm": 0.6595386491798364, + "learning_rate": 6.685165790954083e-05, + "loss": 0.8936, + "step": 3741 + }, + { + "epoch": 1.6962828649138713, + "grad_norm": 0.5898973452993214, + "learning_rate": 6.684227395992963e-05, + "loss": 0.877, + "step": 3742 + }, + { + "epoch": 1.696736174070716, + "grad_norm": 0.4686395112769075, + "learning_rate": 6.683288732198583e-05, + "loss": 0.9048, + "step": 3743 + }, + { + "epoch": 1.6971894832275614, + "grad_norm": 0.4454289661900001, + "learning_rate": 6.682349799664947e-05, + "loss": 0.904, + "step": 3744 + }, + { + "epoch": 1.6976427923844062, + "grad_norm": 0.48150948527084936, + "learning_rate": 6.681410598486098e-05, + "loss": 0.8859, + "step": 3745 + }, + { + "epoch": 1.698096101541251, + "grad_norm": 0.5427687802380835, + "learning_rate": 6.680471128756097e-05, + "loss": 0.8955, + "step": 3746 + }, + { + "epoch": 1.698549410698096, + "grad_norm": 0.5763443598996874, + "learning_rate": 6.679531390569034e-05, + "loss": 0.9225, + "step": 3747 + }, + { + "epoch": 1.6990027198549411, + "grad_norm": 0.5182690295368858, + "learning_rate": 6.678591384019027e-05, + "loss": 0.9086, + "step": 3748 + }, + { + "epoch": 1.699456029011786, + "grad_norm": 0.5274359967336919, + "learning_rate": 6.677651109200222e-05, + "loss": 0.898, + "step": 3749 + }, + { + "epoch": 1.699909338168631, + "grad_norm": 0.694981777944451, + "learning_rate": 6.676710566206787e-05, + "loss": 0.8956, + "step": 3750 + }, + { + "epoch": 1.700362647325476, + "grad_norm": 0.8942531546597045, + "learning_rate": 6.675769755132922e-05, + "loss": 0.9109, + "step": 3751 + }, + { + "epoch": 1.7008159564823209, + "grad_norm": 0.9329848271114726, + "learning_rate": 6.674828676072853e-05, + "loss": 0.8916, + "step": 3752 + }, + { + "epoch": 1.701269265639166, + "grad_norm": 0.9854121780559459, + "learning_rate": 6.67388732912083e-05, + "loss": 0.9163, + "step": 3753 + }, + { + "epoch": 1.701722574796011, + "grad_norm": 1.0324827382169761, + "learning_rate": 6.672945714371136e-05, + "loss": 0.9254, + "step": 3754 + }, + { + "epoch": 1.7021758839528558, + "grad_norm": 0.914469077992546, + "learning_rate": 6.67200383191807e-05, + "loss": 0.8759, + "step": 3755 + }, + { + "epoch": 1.7026291931097008, + "grad_norm": 0.8039497076549215, + "learning_rate": 6.671061681855968e-05, + "loss": 0.886, + "step": 3756 + }, + { + "epoch": 1.703082502266546, + "grad_norm": 0.7949780318988283, + "learning_rate": 6.670119264279188e-05, + "loss": 0.9023, + "step": 3757 + }, + { + "epoch": 1.7035358114233907, + "grad_norm": 0.7945586081577648, + "learning_rate": 6.669176579282117e-05, + "loss": 0.896, + "step": 3758 + }, + { + "epoch": 1.7039891205802358, + "grad_norm": 0.7968649866852935, + "learning_rate": 6.668233626959166e-05, + "loss": 0.8751, + "step": 3759 + }, + { + "epoch": 1.7044424297370808, + "grad_norm": 0.737507347367117, + "learning_rate": 6.667290407404776e-05, + "loss": 0.8882, + "step": 3760 + }, + { + "epoch": 1.7048957388939256, + "grad_norm": 0.7591445747235319, + "learning_rate": 6.666346920713415e-05, + "loss": 0.8925, + "step": 3761 + }, + { + "epoch": 1.7053490480507705, + "grad_norm": 0.7263594783407955, + "learning_rate": 6.665403166979571e-05, + "loss": 0.8925, + "step": 3762 + }, + { + "epoch": 1.7058023572076157, + "grad_norm": 0.6835212290518361, + "learning_rate": 6.664459146297767e-05, + "loss": 0.8753, + "step": 3763 + }, + { + "epoch": 1.7062556663644606, + "grad_norm": 0.6243077507902149, + "learning_rate": 6.66351485876255e-05, + "loss": 0.9164, + "step": 3764 + }, + { + "epoch": 1.7067089755213054, + "grad_norm": 0.5839735247493731, + "learning_rate": 6.66257030446849e-05, + "loss": 0.9065, + "step": 3765 + }, + { + "epoch": 1.7071622846781505, + "grad_norm": 0.5240044092036779, + "learning_rate": 6.661625483510187e-05, + "loss": 0.9023, + "step": 3766 + }, + { + "epoch": 1.7076155938349955, + "grad_norm": 0.5047450028644846, + "learning_rate": 6.660680395982274e-05, + "loss": 0.8781, + "step": 3767 + }, + { + "epoch": 1.7080689029918403, + "grad_norm": 0.5207106640565379, + "learning_rate": 6.659735041979398e-05, + "loss": 0.8971, + "step": 3768 + }, + { + "epoch": 1.7085222121486854, + "grad_norm": 0.5467005168012489, + "learning_rate": 6.65878942159624e-05, + "loss": 0.895, + "step": 3769 + }, + { + "epoch": 1.7089755213055304, + "grad_norm": 0.46690669970698107, + "learning_rate": 6.657843534927507e-05, + "loss": 0.8672, + "step": 3770 + }, + { + "epoch": 1.7094288304623753, + "grad_norm": 0.5643991051559255, + "learning_rate": 6.656897382067935e-05, + "loss": 0.8775, + "step": 3771 + }, + { + "epoch": 1.7098821396192203, + "grad_norm": 0.6817922498998549, + "learning_rate": 6.65595096311228e-05, + "loss": 0.8938, + "step": 3772 + }, + { + "epoch": 1.7103354487760654, + "grad_norm": 0.8069649454047518, + "learning_rate": 6.65500427815533e-05, + "loss": 0.8992, + "step": 3773 + }, + { + "epoch": 1.7107887579329102, + "grad_norm": 0.878908968239035, + "learning_rate": 6.654057327291899e-05, + "loss": 0.9174, + "step": 3774 + }, + { + "epoch": 1.7112420670897552, + "grad_norm": 0.8866058537228136, + "learning_rate": 6.653110110616827e-05, + "loss": 0.8898, + "step": 3775 + }, + { + "epoch": 1.7116953762466003, + "grad_norm": 0.8110320637282362, + "learning_rate": 6.652162628224981e-05, + "loss": 0.8899, + "step": 3776 + }, + { + "epoch": 1.7121486854034451, + "grad_norm": 0.7982094768398954, + "learning_rate": 6.651214880211252e-05, + "loss": 0.9009, + "step": 3777 + }, + { + "epoch": 1.7126019945602902, + "grad_norm": 0.7736819572755326, + "learning_rate": 6.65026686667056e-05, + "loss": 0.9144, + "step": 3778 + }, + { + "epoch": 1.7130553037171352, + "grad_norm": 0.8239181825551296, + "learning_rate": 6.649318587697855e-05, + "loss": 0.9232, + "step": 3779 + }, + { + "epoch": 1.71350861287398, + "grad_norm": 0.8426099808775985, + "learning_rate": 6.648370043388104e-05, + "loss": 0.894, + "step": 3780 + }, + { + "epoch": 1.7139619220308249, + "grad_norm": 0.8289127373309025, + "learning_rate": 6.647421233836312e-05, + "loss": 0.8939, + "step": 3781 + }, + { + "epoch": 1.7144152311876701, + "grad_norm": 0.715196489769326, + "learning_rate": 6.646472159137502e-05, + "loss": 0.9109, + "step": 3782 + }, + { + "epoch": 1.714868540344515, + "grad_norm": 0.5926274015402634, + "learning_rate": 6.645522819386727e-05, + "loss": 0.8922, + "step": 3783 + }, + { + "epoch": 1.7153218495013598, + "grad_norm": 0.5016107833317854, + "learning_rate": 6.644573214679067e-05, + "loss": 0.9128, + "step": 3784 + }, + { + "epoch": 1.7157751586582048, + "grad_norm": 0.5577947121481835, + "learning_rate": 6.643623345109629e-05, + "loss": 0.8887, + "step": 3785 + }, + { + "epoch": 1.71622846781505, + "grad_norm": 0.6575413248058302, + "learning_rate": 6.642673210773541e-05, + "loss": 0.9023, + "step": 3786 + }, + { + "epoch": 1.7166817769718947, + "grad_norm": 0.6454858363201682, + "learning_rate": 6.641722811765966e-05, + "loss": 0.8794, + "step": 3787 + }, + { + "epoch": 1.7171350861287398, + "grad_norm": 0.5553553320804184, + "learning_rate": 6.640772148182086e-05, + "loss": 0.889, + "step": 3788 + }, + { + "epoch": 1.7175883952855848, + "grad_norm": 0.5743987620364641, + "learning_rate": 6.639821220117116e-05, + "loss": 0.8907, + "step": 3789 + }, + { + "epoch": 1.7180417044424297, + "grad_norm": 0.6109795366310983, + "learning_rate": 6.638870027666291e-05, + "loss": 0.8933, + "step": 3790 + }, + { + "epoch": 1.7184950135992747, + "grad_norm": 0.7174648515952795, + "learning_rate": 6.637918570924878e-05, + "loss": 0.8954, + "step": 3791 + }, + { + "epoch": 1.7189483227561198, + "grad_norm": 0.9948530487399134, + "learning_rate": 6.636966849988167e-05, + "loss": 0.9126, + "step": 3792 + }, + { + "epoch": 1.7194016319129646, + "grad_norm": 0.8561429046844133, + "learning_rate": 6.636014864951477e-05, + "loss": 0.8893, + "step": 3793 + }, + { + "epoch": 1.7198549410698096, + "grad_norm": 0.982655180864914, + "learning_rate": 6.63506261591015e-05, + "loss": 0.9013, + "step": 3794 + }, + { + "epoch": 1.7203082502266547, + "grad_norm": 0.8598433158700673, + "learning_rate": 6.634110102959559e-05, + "loss": 0.8881, + "step": 3795 + }, + { + "epoch": 1.7207615593834995, + "grad_norm": 0.7975662035490311, + "learning_rate": 6.633157326195098e-05, + "loss": 0.9031, + "step": 3796 + }, + { + "epoch": 1.7212148685403446, + "grad_norm": 0.8213767149444924, + "learning_rate": 6.632204285712194e-05, + "loss": 0.9025, + "step": 3797 + }, + { + "epoch": 1.7216681776971896, + "grad_norm": 0.8390603542069657, + "learning_rate": 6.631250981606294e-05, + "loss": 0.8822, + "step": 3798 + }, + { + "epoch": 1.7221214868540344, + "grad_norm": 0.8867047442405118, + "learning_rate": 6.630297413972875e-05, + "loss": 0.8941, + "step": 3799 + }, + { + "epoch": 1.7225747960108793, + "grad_norm": 0.8883591245994537, + "learning_rate": 6.629343582907439e-05, + "loss": 0.8666, + "step": 3800 + }, + { + "epoch": 1.7230281051677245, + "grad_norm": 0.9130053101224949, + "learning_rate": 6.628389488505517e-05, + "loss": 0.9061, + "step": 3801 + }, + { + "epoch": 1.7234814143245694, + "grad_norm": 0.9315465064681377, + "learning_rate": 6.627435130862661e-05, + "loss": 0.896, + "step": 3802 + }, + { + "epoch": 1.7239347234814142, + "grad_norm": 0.8855730886374048, + "learning_rate": 6.626480510074456e-05, + "loss": 0.885, + "step": 3803 + }, + { + "epoch": 1.7243880326382592, + "grad_norm": 0.7582865654331835, + "learning_rate": 6.62552562623651e-05, + "loss": 0.8988, + "step": 3804 + }, + { + "epoch": 1.7248413417951043, + "grad_norm": 0.6938300738007893, + "learning_rate": 6.624570479444455e-05, + "loss": 0.8962, + "step": 3805 + }, + { + "epoch": 1.7252946509519491, + "grad_norm": 0.6715901337908422, + "learning_rate": 6.623615069793954e-05, + "loss": 0.9029, + "step": 3806 + }, + { + "epoch": 1.7257479601087942, + "grad_norm": 0.6301748227819444, + "learning_rate": 6.62265939738069e-05, + "loss": 0.8972, + "step": 3807 + }, + { + "epoch": 1.7262012692656392, + "grad_norm": 0.5106145159005249, + "learning_rate": 6.621703462300382e-05, + "loss": 0.8871, + "step": 3808 + }, + { + "epoch": 1.726654578422484, + "grad_norm": 0.48617032380662056, + "learning_rate": 6.620747264648767e-05, + "loss": 0.8888, + "step": 3809 + }, + { + "epoch": 1.727107887579329, + "grad_norm": 0.4926378118149106, + "learning_rate": 6.619790804521612e-05, + "loss": 0.8999, + "step": 3810 + }, + { + "epoch": 1.7275611967361741, + "grad_norm": 0.5054492657733882, + "learning_rate": 6.618834082014709e-05, + "loss": 0.8948, + "step": 3811 + }, + { + "epoch": 1.728014505893019, + "grad_norm": 0.5104879890242653, + "learning_rate": 6.617877097223874e-05, + "loss": 0.915, + "step": 3812 + }, + { + "epoch": 1.728467815049864, + "grad_norm": 0.5808415523522088, + "learning_rate": 6.616919850244954e-05, + "loss": 0.8807, + "step": 3813 + }, + { + "epoch": 1.728921124206709, + "grad_norm": 0.5875159213497355, + "learning_rate": 6.61596234117382e-05, + "loss": 0.8986, + "step": 3814 + }, + { + "epoch": 1.729374433363554, + "grad_norm": 0.6019963313532509, + "learning_rate": 6.615004570106371e-05, + "loss": 0.9126, + "step": 3815 + }, + { + "epoch": 1.729827742520399, + "grad_norm": 0.5432024377365419, + "learning_rate": 6.614046537138528e-05, + "loss": 0.8859, + "step": 3816 + }, + { + "epoch": 1.730281051677244, + "grad_norm": 0.5211007307797942, + "learning_rate": 6.613088242366241e-05, + "loss": 0.8942, + "step": 3817 + }, + { + "epoch": 1.7307343608340888, + "grad_norm": 0.42882286005817083, + "learning_rate": 6.61212968588549e-05, + "loss": 0.8894, + "step": 3818 + }, + { + "epoch": 1.7311876699909337, + "grad_norm": 0.3705001444863107, + "learning_rate": 6.61117086779227e-05, + "loss": 0.8866, + "step": 3819 + }, + { + "epoch": 1.731640979147779, + "grad_norm": 0.3923278457054998, + "learning_rate": 6.610211788182614e-05, + "loss": 0.9002, + "step": 3820 + }, + { + "epoch": 1.7320942883046238, + "grad_norm": 0.4029522158664437, + "learning_rate": 6.609252447152578e-05, + "loss": 0.9009, + "step": 3821 + }, + { + "epoch": 1.7325475974614686, + "grad_norm": 0.42717431241190784, + "learning_rate": 6.60829284479824e-05, + "loss": 0.8927, + "step": 3822 + }, + { + "epoch": 1.7330009066183139, + "grad_norm": 0.45317075369825865, + "learning_rate": 6.607332981215708e-05, + "loss": 0.8899, + "step": 3823 + }, + { + "epoch": 1.7334542157751587, + "grad_norm": 0.4951660274807649, + "learning_rate": 6.606372856501116e-05, + "loss": 0.8998, + "step": 3824 + }, + { + "epoch": 1.7339075249320035, + "grad_norm": 0.5735722117630017, + "learning_rate": 6.605412470750622e-05, + "loss": 0.9015, + "step": 3825 + }, + { + "epoch": 1.7343608340888486, + "grad_norm": 0.7366905694397976, + "learning_rate": 6.604451824060411e-05, + "loss": 0.9062, + "step": 3826 + }, + { + "epoch": 1.7348141432456936, + "grad_norm": 0.8758209207466193, + "learning_rate": 6.603490916526697e-05, + "loss": 0.8928, + "step": 3827 + }, + { + "epoch": 1.7352674524025384, + "grad_norm": 0.9454182959991223, + "learning_rate": 6.602529748245716e-05, + "loss": 0.9013, + "step": 3828 + }, + { + "epoch": 1.7357207615593835, + "grad_norm": 1.113585441455221, + "learning_rate": 6.60156831931373e-05, + "loss": 0.8901, + "step": 3829 + }, + { + "epoch": 1.7361740707162285, + "grad_norm": 0.9720323064604871, + "learning_rate": 6.600606629827033e-05, + "loss": 0.8992, + "step": 3830 + }, + { + "epoch": 1.7366273798730734, + "grad_norm": 0.9101580292136993, + "learning_rate": 6.599644679881937e-05, + "loss": 0.8825, + "step": 3831 + }, + { + "epoch": 1.7370806890299184, + "grad_norm": 0.7768883089119636, + "learning_rate": 6.598682469574789e-05, + "loss": 0.9004, + "step": 3832 + }, + { + "epoch": 1.7375339981867635, + "grad_norm": 0.5823122702737005, + "learning_rate": 6.597719999001952e-05, + "loss": 0.8997, + "step": 3833 + }, + { + "epoch": 1.7379873073436083, + "grad_norm": 0.4080454070966192, + "learning_rate": 6.596757268259822e-05, + "loss": 0.8953, + "step": 3834 + }, + { + "epoch": 1.7384406165004533, + "grad_norm": 0.39501043267737174, + "learning_rate": 6.595794277444822e-05, + "loss": 0.8894, + "step": 3835 + }, + { + "epoch": 1.7388939256572984, + "grad_norm": 0.4899837841423306, + "learning_rate": 6.594831026653395e-05, + "loss": 0.8785, + "step": 3836 + }, + { + "epoch": 1.7393472348141432, + "grad_norm": 0.504535228683588, + "learning_rate": 6.593867515982016e-05, + "loss": 0.886, + "step": 3837 + }, + { + "epoch": 1.7398005439709883, + "grad_norm": 0.5359420183332634, + "learning_rate": 6.592903745527179e-05, + "loss": 0.9128, + "step": 3838 + }, + { + "epoch": 1.7402538531278333, + "grad_norm": 0.6097276592893291, + "learning_rate": 6.591939715385415e-05, + "loss": 0.883, + "step": 3839 + }, + { + "epoch": 1.7407071622846781, + "grad_norm": 0.6982149695653571, + "learning_rate": 6.590975425653269e-05, + "loss": 0.8857, + "step": 3840 + }, + { + "epoch": 1.741160471441523, + "grad_norm": 0.681480051396439, + "learning_rate": 6.59001087642732e-05, + "loss": 0.8971, + "step": 3841 + }, + { + "epoch": 1.7416137805983682, + "grad_norm": 0.6579463397659712, + "learning_rate": 6.58904606780417e-05, + "loss": 0.906, + "step": 3842 + }, + { + "epoch": 1.742067089755213, + "grad_norm": 0.6447873866444466, + "learning_rate": 6.588080999880445e-05, + "loss": 0.8931, + "step": 3843 + }, + { + "epoch": 1.742520398912058, + "grad_norm": 0.6658858645375951, + "learning_rate": 6.587115672752804e-05, + "loss": 0.9008, + "step": 3844 + }, + { + "epoch": 1.742973708068903, + "grad_norm": 0.727296068020592, + "learning_rate": 6.586150086517925e-05, + "loss": 0.8986, + "step": 3845 + }, + { + "epoch": 1.743427017225748, + "grad_norm": 0.7730567339317145, + "learning_rate": 6.585184241272514e-05, + "loss": 0.8883, + "step": 3846 + }, + { + "epoch": 1.7438803263825928, + "grad_norm": 0.7104772922343539, + "learning_rate": 6.584218137113302e-05, + "loss": 0.9167, + "step": 3847 + }, + { + "epoch": 1.7443336355394379, + "grad_norm": 0.636488543636627, + "learning_rate": 6.58325177413705e-05, + "loss": 0.8848, + "step": 3848 + }, + { + "epoch": 1.744786944696283, + "grad_norm": 0.5848480830171661, + "learning_rate": 6.582285152440541e-05, + "loss": 0.8964, + "step": 3849 + }, + { + "epoch": 1.7452402538531278, + "grad_norm": 0.5642859466717631, + "learning_rate": 6.581318272120584e-05, + "loss": 0.8862, + "step": 3850 + }, + { + "epoch": 1.7456935630099728, + "grad_norm": 0.605978126597611, + "learning_rate": 6.580351133274017e-05, + "loss": 0.9118, + "step": 3851 + }, + { + "epoch": 1.7461468721668179, + "grad_norm": 0.7174998870431452, + "learning_rate": 6.5793837359977e-05, + "loss": 0.8881, + "step": 3852 + }, + { + "epoch": 1.7466001813236627, + "grad_norm": 0.7507816945574268, + "learning_rate": 6.57841608038852e-05, + "loss": 0.9014, + "step": 3853 + }, + { + "epoch": 1.7470534904805077, + "grad_norm": 0.7381466820842393, + "learning_rate": 6.577448166543394e-05, + "loss": 0.8962, + "step": 3854 + }, + { + "epoch": 1.7475067996373528, + "grad_norm": 0.7061033703928796, + "learning_rate": 6.576479994559257e-05, + "loss": 0.9021, + "step": 3855 + }, + { + "epoch": 1.7479601087941976, + "grad_norm": 0.8017412590436906, + "learning_rate": 6.575511564533078e-05, + "loss": 0.9102, + "step": 3856 + }, + { + "epoch": 1.7484134179510427, + "grad_norm": 0.871050179415977, + "learning_rate": 6.574542876561844e-05, + "loss": 0.9043, + "step": 3857 + }, + { + "epoch": 1.7488667271078877, + "grad_norm": 0.9207077031891274, + "learning_rate": 6.573573930742576e-05, + "loss": 0.8842, + "step": 3858 + }, + { + "epoch": 1.7493200362647325, + "grad_norm": 0.9421116313845039, + "learning_rate": 6.572604727172316e-05, + "loss": 0.8725, + "step": 3859 + }, + { + "epoch": 1.7497733454215774, + "grad_norm": 0.8930242320511158, + "learning_rate": 6.571635265948131e-05, + "loss": 0.89, + "step": 3860 + }, + { + "epoch": 1.7502266545784226, + "grad_norm": 0.7314212836744549, + "learning_rate": 6.570665547167116e-05, + "loss": 0.8989, + "step": 3861 + }, + { + "epoch": 1.7506799637352675, + "grad_norm": 0.6052533026437444, + "learning_rate": 6.569695570926393e-05, + "loss": 0.8848, + "step": 3862 + }, + { + "epoch": 1.7511332728921123, + "grad_norm": 0.5954474443419195, + "learning_rate": 6.568725337323104e-05, + "loss": 0.9079, + "step": 3863 + }, + { + "epoch": 1.7515865820489573, + "grad_norm": 0.5933937915081496, + "learning_rate": 6.567754846454424e-05, + "loss": 0.8961, + "step": 3864 + }, + { + "epoch": 1.7520398912058024, + "grad_norm": 0.5033010811582114, + "learning_rate": 6.566784098417551e-05, + "loss": 0.9, + "step": 3865 + }, + { + "epoch": 1.7524932003626472, + "grad_norm": 0.6745480915928966, + "learning_rate": 6.565813093309706e-05, + "loss": 0.9094, + "step": 3866 + }, + { + "epoch": 1.7529465095194923, + "grad_norm": 0.5303658066288975, + "learning_rate": 6.564841831228139e-05, + "loss": 0.8985, + "step": 3867 + }, + { + "epoch": 1.7533998186763373, + "grad_norm": 0.4424898138340191, + "learning_rate": 6.563870312270124e-05, + "loss": 0.8916, + "step": 3868 + }, + { + "epoch": 1.7538531278331821, + "grad_norm": 0.4582157865908418, + "learning_rate": 6.562898536532962e-05, + "loss": 0.893, + "step": 3869 + }, + { + "epoch": 1.7543064369900272, + "grad_norm": 0.5163874887752307, + "learning_rate": 6.561926504113979e-05, + "loss": 0.9119, + "step": 3870 + }, + { + "epoch": 1.7547597461468722, + "grad_norm": 0.4967944932886949, + "learning_rate": 6.560954215110529e-05, + "loss": 0.8943, + "step": 3871 + }, + { + "epoch": 1.755213055303717, + "grad_norm": 0.46005451411944337, + "learning_rate": 6.559981669619988e-05, + "loss": 0.9008, + "step": 3872 + }, + { + "epoch": 1.7556663644605621, + "grad_norm": 0.4119585395967857, + "learning_rate": 6.559008867739758e-05, + "loss": 0.8984, + "step": 3873 + }, + { + "epoch": 1.7561196736174072, + "grad_norm": 0.4700367954175148, + "learning_rate": 6.55803580956727e-05, + "loss": 0.8788, + "step": 3874 + }, + { + "epoch": 1.756572982774252, + "grad_norm": 0.5362866873805032, + "learning_rate": 6.557062495199976e-05, + "loss": 0.9041, + "step": 3875 + }, + { + "epoch": 1.757026291931097, + "grad_norm": 0.6239473696858702, + "learning_rate": 6.556088924735358e-05, + "loss": 0.9041, + "step": 3876 + }, + { + "epoch": 1.757479601087942, + "grad_norm": 0.7469976206294955, + "learning_rate": 6.555115098270923e-05, + "loss": 0.8944, + "step": 3877 + }, + { + "epoch": 1.757932910244787, + "grad_norm": 0.8198536451310724, + "learning_rate": 6.554141015904203e-05, + "loss": 0.8999, + "step": 3878 + }, + { + "epoch": 1.7583862194016318, + "grad_norm": 0.8876635899111999, + "learning_rate": 6.553166677732752e-05, + "loss": 0.8948, + "step": 3879 + }, + { + "epoch": 1.758839528558477, + "grad_norm": 0.9551682706388037, + "learning_rate": 6.552192083854154e-05, + "loss": 0.8924, + "step": 3880 + }, + { + "epoch": 1.7592928377153219, + "grad_norm": 0.9824658207551072, + "learning_rate": 6.55121723436602e-05, + "loss": 0.8874, + "step": 3881 + }, + { + "epoch": 1.7597461468721667, + "grad_norm": 0.9320226247370559, + "learning_rate": 6.55024212936598e-05, + "loss": 0.8991, + "step": 3882 + }, + { + "epoch": 1.7601994560290117, + "grad_norm": 0.9726266399594202, + "learning_rate": 6.549266768951697e-05, + "loss": 0.8984, + "step": 3883 + }, + { + "epoch": 1.7606527651858568, + "grad_norm": 1.0195407268979537, + "learning_rate": 6.548291153220855e-05, + "loss": 0.8808, + "step": 3884 + }, + { + "epoch": 1.7611060743427016, + "grad_norm": 0.9587036970884857, + "learning_rate": 6.547315282271164e-05, + "loss": 0.9101, + "step": 3885 + }, + { + "epoch": 1.7615593834995467, + "grad_norm": 0.9304123116760276, + "learning_rate": 6.546339156200363e-05, + "loss": 0.9117, + "step": 3886 + }, + { + "epoch": 1.7620126926563917, + "grad_norm": 0.943472789322733, + "learning_rate": 6.54536277510621e-05, + "loss": 0.9086, + "step": 3887 + }, + { + "epoch": 1.7624660018132365, + "grad_norm": 0.9298708448652561, + "learning_rate": 6.544386139086494e-05, + "loss": 0.9079, + "step": 3888 + }, + { + "epoch": 1.7629193109700816, + "grad_norm": 0.9460490501966345, + "learning_rate": 6.54340924823903e-05, + "loss": 0.8933, + "step": 3889 + }, + { + "epoch": 1.7633726201269266, + "grad_norm": 0.8320295002592298, + "learning_rate": 6.542432102661653e-05, + "loss": 0.8906, + "step": 3890 + }, + { + "epoch": 1.7638259292837715, + "grad_norm": 0.7190855287729453, + "learning_rate": 6.54145470245223e-05, + "loss": 0.8845, + "step": 3891 + }, + { + "epoch": 1.7642792384406165, + "grad_norm": 0.6403115398602236, + "learning_rate": 6.540477047708651e-05, + "loss": 0.8954, + "step": 3892 + }, + { + "epoch": 1.7647325475974616, + "grad_norm": 0.6375405901569852, + "learning_rate": 6.539499138528828e-05, + "loss": 0.9114, + "step": 3893 + }, + { + "epoch": 1.7651858567543064, + "grad_norm": 0.872188920017808, + "learning_rate": 6.538520975010701e-05, + "loss": 0.9121, + "step": 3894 + }, + { + "epoch": 1.7656391659111514, + "grad_norm": 0.469801325133085, + "learning_rate": 6.53754255725224e-05, + "loss": 0.9084, + "step": 3895 + }, + { + "epoch": 1.7660924750679965, + "grad_norm": 0.42107942784942615, + "learning_rate": 6.536563885351433e-05, + "loss": 0.9142, + "step": 3896 + }, + { + "epoch": 1.7665457842248413, + "grad_norm": 0.47233665695579885, + "learning_rate": 6.535584959406299e-05, + "loss": 0.876, + "step": 3897 + }, + { + "epoch": 1.7669990933816861, + "grad_norm": 0.6114582228394982, + "learning_rate": 6.534605779514877e-05, + "loss": 0.8789, + "step": 3898 + }, + { + "epoch": 1.7674524025385314, + "grad_norm": 0.718382956217762, + "learning_rate": 6.53362634577524e-05, + "loss": 0.925, + "step": 3899 + }, + { + "epoch": 1.7679057116953762, + "grad_norm": 0.8638036880241637, + "learning_rate": 6.532646658285477e-05, + "loss": 0.8991, + "step": 3900 + }, + { + "epoch": 1.768359020852221, + "grad_norm": 0.9486379602176281, + "learning_rate": 6.531666717143708e-05, + "loss": 0.8928, + "step": 3901 + }, + { + "epoch": 1.7688123300090663, + "grad_norm": 0.9298539623374948, + "learning_rate": 6.530686522448078e-05, + "loss": 0.8997, + "step": 3902 + }, + { + "epoch": 1.7692656391659112, + "grad_norm": 0.9042247778597562, + "learning_rate": 6.529706074296754e-05, + "loss": 0.8943, + "step": 3903 + }, + { + "epoch": 1.769718948322756, + "grad_norm": 0.9174738351964395, + "learning_rate": 6.528725372787932e-05, + "loss": 0.9147, + "step": 3904 + }, + { + "epoch": 1.770172257479601, + "grad_norm": 0.9926497498819999, + "learning_rate": 6.527744418019832e-05, + "loss": 0.8756, + "step": 3905 + }, + { + "epoch": 1.770625566636446, + "grad_norm": 1.094004762405957, + "learning_rate": 6.5267632100907e-05, + "loss": 0.9217, + "step": 3906 + }, + { + "epoch": 1.771078875793291, + "grad_norm": 0.7251593759124922, + "learning_rate": 6.525781749098807e-05, + "loss": 0.9136, + "step": 3907 + }, + { + "epoch": 1.771532184950136, + "grad_norm": 0.5793960650985687, + "learning_rate": 6.524800035142448e-05, + "loss": 0.8949, + "step": 3908 + }, + { + "epoch": 1.771985494106981, + "grad_norm": 0.5178683987637865, + "learning_rate": 6.523818068319946e-05, + "loss": 0.8939, + "step": 3909 + }, + { + "epoch": 1.7724388032638259, + "grad_norm": 0.6214311945597412, + "learning_rate": 6.522835848729645e-05, + "loss": 0.9043, + "step": 3910 + }, + { + "epoch": 1.772892112420671, + "grad_norm": 0.7057102241480024, + "learning_rate": 6.521853376469921e-05, + "loss": 0.8951, + "step": 3911 + }, + { + "epoch": 1.773345421577516, + "grad_norm": 0.8509152630877603, + "learning_rate": 6.52087065163917e-05, + "loss": 0.9099, + "step": 3912 + }, + { + "epoch": 1.7737987307343608, + "grad_norm": 0.9916859420947791, + "learning_rate": 6.519887674335814e-05, + "loss": 0.9001, + "step": 3913 + }, + { + "epoch": 1.7742520398912058, + "grad_norm": 1.0505585402080069, + "learning_rate": 6.518904444658301e-05, + "loss": 0.9106, + "step": 3914 + }, + { + "epoch": 1.7747053490480509, + "grad_norm": 0.9483139640928896, + "learning_rate": 6.517920962705106e-05, + "loss": 0.8982, + "step": 3915 + }, + { + "epoch": 1.7751586582048957, + "grad_norm": 0.7123784610485726, + "learning_rate": 6.516937228574726e-05, + "loss": 0.899, + "step": 3916 + }, + { + "epoch": 1.7756119673617408, + "grad_norm": 0.47977992333010305, + "learning_rate": 6.515953242365686e-05, + "loss": 0.9056, + "step": 3917 + }, + { + "epoch": 1.7760652765185858, + "grad_norm": 0.3608516068576516, + "learning_rate": 6.514969004176534e-05, + "loss": 0.9068, + "step": 3918 + }, + { + "epoch": 1.7765185856754306, + "grad_norm": 0.5844397037459649, + "learning_rate": 6.513984514105844e-05, + "loss": 0.8917, + "step": 3919 + }, + { + "epoch": 1.7769718948322755, + "grad_norm": 0.7842930246762153, + "learning_rate": 6.512999772252217e-05, + "loss": 0.8913, + "step": 3920 + }, + { + "epoch": 1.7774252039891207, + "grad_norm": 0.8174609849640009, + "learning_rate": 6.512014778714278e-05, + "loss": 0.8963, + "step": 3921 + }, + { + "epoch": 1.7778785131459656, + "grad_norm": 0.7959454418458121, + "learning_rate": 6.511029533590676e-05, + "loss": 0.8905, + "step": 3922 + }, + { + "epoch": 1.7783318223028104, + "grad_norm": 0.8002540276501783, + "learning_rate": 6.510044036980087e-05, + "loss": 0.8982, + "step": 3923 + }, + { + "epoch": 1.7787851314596554, + "grad_norm": 0.7432932155279288, + "learning_rate": 6.50905828898121e-05, + "loss": 0.8828, + "step": 3924 + }, + { + "epoch": 1.7792384406165005, + "grad_norm": 0.5500938863394628, + "learning_rate": 6.508072289692772e-05, + "loss": 0.9032, + "step": 3925 + }, + { + "epoch": 1.7796917497733453, + "grad_norm": 0.40129343625680264, + "learning_rate": 6.507086039213522e-05, + "loss": 0.8936, + "step": 3926 + }, + { + "epoch": 1.7801450589301904, + "grad_norm": 0.38872327560774683, + "learning_rate": 6.506099537642238e-05, + "loss": 0.8961, + "step": 3927 + }, + { + "epoch": 1.7805983680870354, + "grad_norm": 0.46029416263473816, + "learning_rate": 6.505112785077719e-05, + "loss": 0.8912, + "step": 3928 + }, + { + "epoch": 1.7810516772438802, + "grad_norm": 0.5684589562134437, + "learning_rate": 6.504125781618793e-05, + "loss": 0.8945, + "step": 3929 + }, + { + "epoch": 1.7815049864007253, + "grad_norm": 0.6770585370003914, + "learning_rate": 6.50313852736431e-05, + "loss": 0.8916, + "step": 3930 + }, + { + "epoch": 1.7819582955575703, + "grad_norm": 0.685290463089577, + "learning_rate": 6.502151022413147e-05, + "loss": 0.9042, + "step": 3931 + }, + { + "epoch": 1.7824116047144152, + "grad_norm": 0.5879735787329992, + "learning_rate": 6.501163266864206e-05, + "loss": 0.8893, + "step": 3932 + }, + { + "epoch": 1.7828649138712602, + "grad_norm": 0.554686072251621, + "learning_rate": 6.500175260816413e-05, + "loss": 0.893, + "step": 3933 + }, + { + "epoch": 1.7833182230281053, + "grad_norm": 0.5696899695728963, + "learning_rate": 6.499187004368719e-05, + "loss": 0.9023, + "step": 3934 + }, + { + "epoch": 1.78377153218495, + "grad_norm": 0.5822993517865817, + "learning_rate": 6.498198497620102e-05, + "loss": 0.8874, + "step": 3935 + }, + { + "epoch": 1.7842248413417952, + "grad_norm": 0.5519207203245556, + "learning_rate": 6.497209740669563e-05, + "loss": 0.8727, + "step": 3936 + }, + { + "epoch": 1.7846781504986402, + "grad_norm": 0.6120127481447808, + "learning_rate": 6.49622073361613e-05, + "loss": 0.9013, + "step": 3937 + }, + { + "epoch": 1.785131459655485, + "grad_norm": 0.42395159286415424, + "learning_rate": 6.495231476558854e-05, + "loss": 0.9111, + "step": 3938 + }, + { + "epoch": 1.7855847688123299, + "grad_norm": 0.44431108615488907, + "learning_rate": 6.494241969596811e-05, + "loss": 0.8967, + "step": 3939 + }, + { + "epoch": 1.7860380779691751, + "grad_norm": 0.5220773862685488, + "learning_rate": 6.493252212829106e-05, + "loss": 0.9114, + "step": 3940 + }, + { + "epoch": 1.78649138712602, + "grad_norm": 0.6232735636145881, + "learning_rate": 6.492262206354863e-05, + "loss": 0.9126, + "step": 3941 + }, + { + "epoch": 1.7869446962828648, + "grad_norm": 0.7884000692950898, + "learning_rate": 6.491271950273237e-05, + "loss": 0.9024, + "step": 3942 + }, + { + "epoch": 1.7873980054397098, + "grad_norm": 0.7241839414131812, + "learning_rate": 6.490281444683403e-05, + "loss": 0.8994, + "step": 3943 + }, + { + "epoch": 1.7878513145965549, + "grad_norm": 0.7385736722287218, + "learning_rate": 6.489290689684563e-05, + "loss": 0.9019, + "step": 3944 + }, + { + "epoch": 1.7883046237533997, + "grad_norm": 0.7963783685677822, + "learning_rate": 6.488299685375944e-05, + "loss": 0.8968, + "step": 3945 + }, + { + "epoch": 1.7887579329102448, + "grad_norm": 0.808186177449346, + "learning_rate": 6.4873084318568e-05, + "loss": 0.8844, + "step": 3946 + }, + { + "epoch": 1.7892112420670898, + "grad_norm": 0.8346238493126195, + "learning_rate": 6.486316929226405e-05, + "loss": 0.8909, + "step": 3947 + }, + { + "epoch": 1.7896645512239346, + "grad_norm": 0.7681008277354979, + "learning_rate": 6.485325177584062e-05, + "loss": 0.8845, + "step": 3948 + }, + { + "epoch": 1.7901178603807797, + "grad_norm": 0.6874884834768893, + "learning_rate": 6.484333177029101e-05, + "loss": 0.9051, + "step": 3949 + }, + { + "epoch": 1.7905711695376247, + "grad_norm": 0.6403520358270441, + "learning_rate": 6.483340927660869e-05, + "loss": 0.9011, + "step": 3950 + }, + { + "epoch": 1.7910244786944696, + "grad_norm": 0.6528229207749708, + "learning_rate": 6.482348429578745e-05, + "loss": 0.9098, + "step": 3951 + }, + { + "epoch": 1.7914777878513146, + "grad_norm": 0.6405615949128707, + "learning_rate": 6.481355682882131e-05, + "loss": 0.874, + "step": 3952 + }, + { + "epoch": 1.7919310970081597, + "grad_norm": 0.6054370256202548, + "learning_rate": 6.480362687670453e-05, + "loss": 0.8984, + "step": 3953 + }, + { + "epoch": 1.7923844061650045, + "grad_norm": 0.6340399127883456, + "learning_rate": 6.47936944404316e-05, + "loss": 0.9066, + "step": 3954 + }, + { + "epoch": 1.7928377153218495, + "grad_norm": 0.7007778635819046, + "learning_rate": 6.47837595209973e-05, + "loss": 0.9177, + "step": 3955 + }, + { + "epoch": 1.7932910244786946, + "grad_norm": 0.5535012976324646, + "learning_rate": 6.477382211939667e-05, + "loss": 0.8881, + "step": 3956 + }, + { + "epoch": 1.7937443336355394, + "grad_norm": 0.5506734007753518, + "learning_rate": 6.476388223662494e-05, + "loss": 0.9148, + "step": 3957 + }, + { + "epoch": 1.7941976427923843, + "grad_norm": 0.6117856128478971, + "learning_rate": 6.475393987367763e-05, + "loss": 0.8924, + "step": 3958 + }, + { + "epoch": 1.7946509519492295, + "grad_norm": 0.6620927491664353, + "learning_rate": 6.47439950315505e-05, + "loss": 0.8998, + "step": 3959 + }, + { + "epoch": 1.7951042611060744, + "grad_norm": 0.6546913914300613, + "learning_rate": 6.473404771123951e-05, + "loss": 0.8878, + "step": 3960 + }, + { + "epoch": 1.7955575702629192, + "grad_norm": 0.567335505486501, + "learning_rate": 6.472409791374098e-05, + "loss": 0.8877, + "step": 3961 + }, + { + "epoch": 1.7960108794197642, + "grad_norm": 0.5326868279471841, + "learning_rate": 6.471414564005137e-05, + "loss": 0.8979, + "step": 3962 + }, + { + "epoch": 1.7964641885766093, + "grad_norm": 0.5443551461847739, + "learning_rate": 6.470419089116744e-05, + "loss": 0.8994, + "step": 3963 + }, + { + "epoch": 1.796917497733454, + "grad_norm": 0.5970877612755152, + "learning_rate": 6.469423366808618e-05, + "loss": 0.9036, + "step": 3964 + }, + { + "epoch": 1.7973708068902992, + "grad_norm": 0.7627803521191419, + "learning_rate": 6.468427397180487e-05, + "loss": 0.9115, + "step": 3965 + }, + { + "epoch": 1.7978241160471442, + "grad_norm": 0.7512052342770181, + "learning_rate": 6.467431180332095e-05, + "loss": 0.9003, + "step": 3966 + }, + { + "epoch": 1.798277425203989, + "grad_norm": 0.8586754655519705, + "learning_rate": 6.46643471636322e-05, + "loss": 0.8864, + "step": 3967 + }, + { + "epoch": 1.798730734360834, + "grad_norm": 1.0471944996308065, + "learning_rate": 6.465438005373658e-05, + "loss": 0.9052, + "step": 3968 + }, + { + "epoch": 1.7991840435176791, + "grad_norm": 0.9078548498210196, + "learning_rate": 6.464441047463237e-05, + "loss": 0.9086, + "step": 3969 + }, + { + "epoch": 1.799637352674524, + "grad_norm": 0.7710068412867059, + "learning_rate": 6.463443842731799e-05, + "loss": 0.8977, + "step": 3970 + }, + { + "epoch": 1.800090661831369, + "grad_norm": 0.812642241967833, + "learning_rate": 6.462446391279222e-05, + "loss": 0.8824, + "step": 3971 + }, + { + "epoch": 1.800543970988214, + "grad_norm": 1.017979015772262, + "learning_rate": 6.4614486932054e-05, + "loss": 0.8994, + "step": 3972 + }, + { + "epoch": 1.8009972801450589, + "grad_norm": 1.1314340541546892, + "learning_rate": 6.46045074861026e-05, + "loss": 0.8841, + "step": 3973 + }, + { + "epoch": 1.801450589301904, + "grad_norm": 0.7836557092500817, + "learning_rate": 6.459452557593745e-05, + "loss": 0.8612, + "step": 3974 + }, + { + "epoch": 1.801903898458749, + "grad_norm": 0.6113651801339595, + "learning_rate": 6.458454120255829e-05, + "loss": 0.888, + "step": 3975 + }, + { + "epoch": 1.8023572076155938, + "grad_norm": 0.6499903704622901, + "learning_rate": 6.457455436696506e-05, + "loss": 0.8939, + "step": 3976 + }, + { + "epoch": 1.8028105167724386, + "grad_norm": 0.7125541875823916, + "learning_rate": 6.456456507015798e-05, + "loss": 0.9063, + "step": 3977 + }, + { + "epoch": 1.803263825929284, + "grad_norm": 0.7521559991669844, + "learning_rate": 6.455457331313754e-05, + "loss": 0.8903, + "step": 3978 + }, + { + "epoch": 1.8037171350861287, + "grad_norm": 0.8517386614385944, + "learning_rate": 6.454457909690442e-05, + "loss": 0.8986, + "step": 3979 + }, + { + "epoch": 1.8041704442429736, + "grad_norm": 1.0139787251665733, + "learning_rate": 6.453458242245955e-05, + "loss": 0.8948, + "step": 3980 + }, + { + "epoch": 1.8046237533998188, + "grad_norm": 0.9946361537199938, + "learning_rate": 6.452458329080415e-05, + "loss": 0.9061, + "step": 3981 + }, + { + "epoch": 1.8050770625566637, + "grad_norm": 0.8545022008250847, + "learning_rate": 6.451458170293965e-05, + "loss": 0.899, + "step": 3982 + }, + { + "epoch": 1.8055303717135085, + "grad_norm": 0.7098983463628615, + "learning_rate": 6.450457765986777e-05, + "loss": 0.8808, + "step": 3983 + }, + { + "epoch": 1.8059836808703535, + "grad_norm": 0.5523863095907762, + "learning_rate": 6.449457116259039e-05, + "loss": 0.9038, + "step": 3984 + }, + { + "epoch": 1.8064369900271986, + "grad_norm": 0.6280048853213087, + "learning_rate": 6.448456221210974e-05, + "loss": 0.8873, + "step": 3985 + }, + { + "epoch": 1.8068902991840434, + "grad_norm": 0.6931585997955584, + "learning_rate": 6.447455080942821e-05, + "loss": 0.9055, + "step": 3986 + }, + { + "epoch": 1.8073436083408885, + "grad_norm": 0.7211032130473216, + "learning_rate": 6.44645369555485e-05, + "loss": 0.9071, + "step": 3987 + }, + { + "epoch": 1.8077969174977335, + "grad_norm": 0.7729065475579528, + "learning_rate": 6.44545206514735e-05, + "loss": 0.9081, + "step": 3988 + }, + { + "epoch": 1.8082502266545784, + "grad_norm": 0.8040985748073064, + "learning_rate": 6.444450189820641e-05, + "loss": 0.9149, + "step": 3989 + }, + { + "epoch": 1.8087035358114234, + "grad_norm": 0.8333131507364324, + "learning_rate": 6.443448069675059e-05, + "loss": 0.8932, + "step": 3990 + }, + { + "epoch": 1.8091568449682685, + "grad_norm": 0.7535030664754387, + "learning_rate": 6.442445704810973e-05, + "loss": 0.9171, + "step": 3991 + }, + { + "epoch": 1.8096101541251133, + "grad_norm": 0.602605520682078, + "learning_rate": 6.441443095328771e-05, + "loss": 0.912, + "step": 3992 + }, + { + "epoch": 1.8100634632819583, + "grad_norm": 0.46861442963762073, + "learning_rate": 6.440440241328867e-05, + "loss": 0.8876, + "step": 3993 + }, + { + "epoch": 1.8105167724388034, + "grad_norm": 0.44218978155660876, + "learning_rate": 6.439437142911703e-05, + "loss": 0.8717, + "step": 3994 + }, + { + "epoch": 1.8109700815956482, + "grad_norm": 0.5739836978064221, + "learning_rate": 6.438433800177736e-05, + "loss": 0.8949, + "step": 3995 + }, + { + "epoch": 1.811423390752493, + "grad_norm": 0.6553232868696338, + "learning_rate": 6.43743021322746e-05, + "loss": 0.8923, + "step": 3996 + }, + { + "epoch": 1.8118766999093383, + "grad_norm": 0.6555363807910665, + "learning_rate": 6.436426382161384e-05, + "loss": 0.8924, + "step": 3997 + }, + { + "epoch": 1.8123300090661831, + "grad_norm": 0.6092816084886911, + "learning_rate": 6.435422307080047e-05, + "loss": 0.9014, + "step": 3998 + }, + { + "epoch": 1.812783318223028, + "grad_norm": 0.5967538354458156, + "learning_rate": 6.434417988084006e-05, + "loss": 0.8921, + "step": 3999 + }, + { + "epoch": 1.8132366273798732, + "grad_norm": 0.6252993734760232, + "learning_rate": 6.433413425273851e-05, + "loss": 0.8992, + "step": 4000 + }, + { + "epoch": 1.813689936536718, + "grad_norm": 0.6438140435114073, + "learning_rate": 6.432408618750188e-05, + "loss": 0.9078, + "step": 4001 + }, + { + "epoch": 1.814143245693563, + "grad_norm": 0.6927700403637734, + "learning_rate": 6.431403568613654e-05, + "loss": 0.8883, + "step": 4002 + }, + { + "epoch": 1.814596554850408, + "grad_norm": 0.7466344306303481, + "learning_rate": 6.430398274964904e-05, + "loss": 0.8973, + "step": 4003 + }, + { + "epoch": 1.815049864007253, + "grad_norm": 0.7719106751957889, + "learning_rate": 6.429392737904628e-05, + "loss": 0.8942, + "step": 4004 + }, + { + "epoch": 1.8155031731640978, + "grad_norm": 0.7181594316748676, + "learning_rate": 6.428386957533528e-05, + "loss": 0.8778, + "step": 4005 + }, + { + "epoch": 1.8159564823209429, + "grad_norm": 0.6930944332429062, + "learning_rate": 6.427380933952336e-05, + "loss": 0.8868, + "step": 4006 + }, + { + "epoch": 1.816409791477788, + "grad_norm": 0.7235743861483859, + "learning_rate": 6.426374667261812e-05, + "loss": 0.9238, + "step": 4007 + }, + { + "epoch": 1.8168631006346327, + "grad_norm": 0.78843477642288, + "learning_rate": 6.425368157562731e-05, + "loss": 0.8862, + "step": 4008 + }, + { + "epoch": 1.8173164097914778, + "grad_norm": 0.8281171456660277, + "learning_rate": 6.424361404955904e-05, + "loss": 0.9034, + "step": 4009 + }, + { + "epoch": 1.8177697189483228, + "grad_norm": 0.8155061703253222, + "learning_rate": 6.423354409542157e-05, + "loss": 0.8941, + "step": 4010 + }, + { + "epoch": 1.8182230281051677, + "grad_norm": 0.6832736705955715, + "learning_rate": 6.422347171422343e-05, + "loss": 0.8951, + "step": 4011 + }, + { + "epoch": 1.8186763372620127, + "grad_norm": 0.5639723098737546, + "learning_rate": 6.42133969069734e-05, + "loss": 0.8893, + "step": 4012 + }, + { + "epoch": 1.8191296464188578, + "grad_norm": 0.4989781086249519, + "learning_rate": 6.420331967468051e-05, + "loss": 0.8901, + "step": 4013 + }, + { + "epoch": 1.8195829555757026, + "grad_norm": 0.5303575396065485, + "learning_rate": 6.419324001835403e-05, + "loss": 0.8893, + "step": 4014 + }, + { + "epoch": 1.8200362647325476, + "grad_norm": 0.4845481292112108, + "learning_rate": 6.418315793900345e-05, + "loss": 0.8833, + "step": 4015 + }, + { + "epoch": 1.8204895738893927, + "grad_norm": 0.421696371270079, + "learning_rate": 6.417307343763852e-05, + "loss": 0.8843, + "step": 4016 + }, + { + "epoch": 1.8209428830462375, + "grad_norm": 0.3890377689463516, + "learning_rate": 6.416298651526927e-05, + "loss": 0.8927, + "step": 4017 + }, + { + "epoch": 1.8213961922030824, + "grad_norm": 0.407188576784572, + "learning_rate": 6.415289717290589e-05, + "loss": 0.8928, + "step": 4018 + }, + { + "epoch": 1.8218495013599276, + "grad_norm": 0.4133410852250945, + "learning_rate": 6.414280541155888e-05, + "loss": 0.9053, + "step": 4019 + }, + { + "epoch": 1.8223028105167725, + "grad_norm": 0.5344800695513124, + "learning_rate": 6.413271123223894e-05, + "loss": 0.9182, + "step": 4020 + }, + { + "epoch": 1.8227561196736173, + "grad_norm": 0.6843912186681955, + "learning_rate": 6.412261463595706e-05, + "loss": 0.8926, + "step": 4021 + }, + { + "epoch": 1.8232094288304623, + "grad_norm": 0.787716722225233, + "learning_rate": 6.411251562372442e-05, + "loss": 0.9089, + "step": 4022 + }, + { + "epoch": 1.8236627379873074, + "grad_norm": 0.9102667090831926, + "learning_rate": 6.41024141965525e-05, + "loss": 0.9184, + "step": 4023 + }, + { + "epoch": 1.8241160471441522, + "grad_norm": 1.0049653720687963, + "learning_rate": 6.409231035545292e-05, + "loss": 0.8875, + "step": 4024 + }, + { + "epoch": 1.8245693563009973, + "grad_norm": 1.024374646820244, + "learning_rate": 6.408220410143768e-05, + "loss": 0.9186, + "step": 4025 + }, + { + "epoch": 1.8250226654578423, + "grad_norm": 0.9469995507119913, + "learning_rate": 6.407209543551892e-05, + "loss": 0.9217, + "step": 4026 + }, + { + "epoch": 1.8254759746146871, + "grad_norm": 0.8226472453651121, + "learning_rate": 6.406198435870905e-05, + "loss": 0.8806, + "step": 4027 + }, + { + "epoch": 1.8259292837715322, + "grad_norm": 0.698420575060796, + "learning_rate": 6.405187087202074e-05, + "loss": 0.892, + "step": 4028 + }, + { + "epoch": 1.8263825929283772, + "grad_norm": 0.5428562348751574, + "learning_rate": 6.404175497646686e-05, + "loss": 0.8863, + "step": 4029 + }, + { + "epoch": 1.826835902085222, + "grad_norm": 0.4956474861110213, + "learning_rate": 6.403163667306057e-05, + "loss": 0.8984, + "step": 4030 + }, + { + "epoch": 1.8272892112420671, + "grad_norm": 0.475712966491124, + "learning_rate": 6.402151596281524e-05, + "loss": 0.8926, + "step": 4031 + }, + { + "epoch": 1.8277425203989122, + "grad_norm": 0.4445182669393753, + "learning_rate": 6.401139284674448e-05, + "loss": 0.8843, + "step": 4032 + }, + { + "epoch": 1.828195829555757, + "grad_norm": 0.44391687738554175, + "learning_rate": 6.400126732586217e-05, + "loss": 0.8832, + "step": 4033 + }, + { + "epoch": 1.828649138712602, + "grad_norm": 0.47538269588346627, + "learning_rate": 6.39911394011824e-05, + "loss": 0.869, + "step": 4034 + }, + { + "epoch": 1.829102447869447, + "grad_norm": 0.49111409393546696, + "learning_rate": 6.39810090737195e-05, + "loss": 0.8998, + "step": 4035 + }, + { + "epoch": 1.829555757026292, + "grad_norm": 0.44058060834510626, + "learning_rate": 6.397087634448806e-05, + "loss": 0.9127, + "step": 4036 + }, + { + "epoch": 1.8300090661831367, + "grad_norm": 0.3570800723094871, + "learning_rate": 6.396074121450292e-05, + "loss": 0.8955, + "step": 4037 + }, + { + "epoch": 1.830462375339982, + "grad_norm": 0.4227033251185345, + "learning_rate": 6.395060368477912e-05, + "loss": 0.9199, + "step": 4038 + }, + { + "epoch": 1.8309156844968268, + "grad_norm": 0.43747116863567637, + "learning_rate": 6.394046375633196e-05, + "loss": 0.8944, + "step": 4039 + }, + { + "epoch": 1.8313689936536717, + "grad_norm": 0.33313756401294037, + "learning_rate": 6.393032143017701e-05, + "loss": 0.862, + "step": 4040 + }, + { + "epoch": 1.8318223028105167, + "grad_norm": 0.3702399352543698, + "learning_rate": 6.392017670733003e-05, + "loss": 0.8666, + "step": 4041 + }, + { + "epoch": 1.8322756119673618, + "grad_norm": 0.43761074050401927, + "learning_rate": 6.391002958880707e-05, + "loss": 0.8656, + "step": 4042 + }, + { + "epoch": 1.8327289211242066, + "grad_norm": 0.40209711318489816, + "learning_rate": 6.389988007562435e-05, + "loss": 0.8875, + "step": 4043 + }, + { + "epoch": 1.8331822302810517, + "grad_norm": 0.46532609912568224, + "learning_rate": 6.388972816879841e-05, + "loss": 0.8992, + "step": 4044 + }, + { + "epoch": 1.8336355394378967, + "grad_norm": 0.5425773163951751, + "learning_rate": 6.3879573869346e-05, + "loss": 0.8707, + "step": 4045 + }, + { + "epoch": 1.8340888485947415, + "grad_norm": 0.4820382744975377, + "learning_rate": 6.386941717828407e-05, + "loss": 0.8757, + "step": 4046 + }, + { + "epoch": 1.8345421577515866, + "grad_norm": 0.5818563067246578, + "learning_rate": 6.385925809662987e-05, + "loss": 0.8915, + "step": 4047 + }, + { + "epoch": 1.8349954669084316, + "grad_norm": 0.6938761165412745, + "learning_rate": 6.384909662540084e-05, + "loss": 0.8848, + "step": 4048 + }, + { + "epoch": 1.8354487760652765, + "grad_norm": 0.6473807163681281, + "learning_rate": 6.38389327656147e-05, + "loss": 0.8981, + "step": 4049 + }, + { + "epoch": 1.8359020852221215, + "grad_norm": 0.6329374696086039, + "learning_rate": 6.382876651828938e-05, + "loss": 0.87, + "step": 4050 + }, + { + "epoch": 1.8363553943789666, + "grad_norm": 0.6947342830614672, + "learning_rate": 6.381859788444305e-05, + "loss": 0.8948, + "step": 4051 + }, + { + "epoch": 1.8368087035358114, + "grad_norm": 0.751601463875255, + "learning_rate": 6.380842686509414e-05, + "loss": 0.8951, + "step": 4052 + }, + { + "epoch": 1.8372620126926564, + "grad_norm": 0.8946998005947113, + "learning_rate": 6.379825346126133e-05, + "loss": 0.8921, + "step": 4053 + }, + { + "epoch": 1.8377153218495015, + "grad_norm": 0.9941562268817906, + "learning_rate": 6.378807767396347e-05, + "loss": 0.872, + "step": 4054 + }, + { + "epoch": 1.8381686310063463, + "grad_norm": 1.0472902983085444, + "learning_rate": 6.377789950421972e-05, + "loss": 0.8805, + "step": 4055 + }, + { + "epoch": 1.8386219401631911, + "grad_norm": 0.8809094161040311, + "learning_rate": 6.376771895304948e-05, + "loss": 0.9043, + "step": 4056 + }, + { + "epoch": 1.8390752493200364, + "grad_norm": 0.7751679773248243, + "learning_rate": 6.375753602147232e-05, + "loss": 0.9096, + "step": 4057 + }, + { + "epoch": 1.8395285584768812, + "grad_norm": 0.5577774732327065, + "learning_rate": 6.374735071050809e-05, + "loss": 0.8965, + "step": 4058 + }, + { + "epoch": 1.839981867633726, + "grad_norm": 0.4919990441430391, + "learning_rate": 6.373716302117691e-05, + "loss": 0.9028, + "step": 4059 + }, + { + "epoch": 1.8404351767905711, + "grad_norm": 0.5017142979905649, + "learning_rate": 6.372697295449909e-05, + "loss": 0.9224, + "step": 4060 + }, + { + "epoch": 1.8408884859474162, + "grad_norm": 0.5382197372855064, + "learning_rate": 6.37167805114952e-05, + "loss": 0.8874, + "step": 4061 + }, + { + "epoch": 1.841341795104261, + "grad_norm": 0.47775932587519826, + "learning_rate": 6.370658569318602e-05, + "loss": 0.9048, + "step": 4062 + }, + { + "epoch": 1.841795104261106, + "grad_norm": 0.41785250529954937, + "learning_rate": 6.369638850059263e-05, + "loss": 0.878, + "step": 4063 + }, + { + "epoch": 1.842248413417951, + "grad_norm": 0.5508545956692398, + "learning_rate": 6.368618893473629e-05, + "loss": 0.9026, + "step": 4064 + }, + { + "epoch": 1.842701722574796, + "grad_norm": 0.4395094233407126, + "learning_rate": 6.367598699663852e-05, + "loss": 0.8814, + "step": 4065 + }, + { + "epoch": 1.843155031731641, + "grad_norm": 0.42291348854895067, + "learning_rate": 6.366578268732104e-05, + "loss": 0.9039, + "step": 4066 + }, + { + "epoch": 1.843608340888486, + "grad_norm": 0.4517971092596902, + "learning_rate": 6.365557600780589e-05, + "loss": 0.8733, + "step": 4067 + }, + { + "epoch": 1.8440616500453308, + "grad_norm": 0.42547289245694814, + "learning_rate": 6.36453669591153e-05, + "loss": 0.8638, + "step": 4068 + }, + { + "epoch": 1.844514959202176, + "grad_norm": 0.540508620769893, + "learning_rate": 6.363515554227169e-05, + "loss": 0.89, + "step": 4069 + }, + { + "epoch": 1.844968268359021, + "grad_norm": 0.6533314298535922, + "learning_rate": 6.362494175829778e-05, + "loss": 0.8924, + "step": 4070 + }, + { + "epoch": 1.8454215775158658, + "grad_norm": 0.7386236500467155, + "learning_rate": 6.361472560821656e-05, + "loss": 0.8894, + "step": 4071 + }, + { + "epoch": 1.8458748866727108, + "grad_norm": 0.7699710023062802, + "learning_rate": 6.360450709305113e-05, + "loss": 0.8943, + "step": 4072 + }, + { + "epoch": 1.8463281958295559, + "grad_norm": 0.858233340806284, + "learning_rate": 6.359428621382495e-05, + "loss": 0.8934, + "step": 4073 + }, + { + "epoch": 1.8467815049864007, + "grad_norm": 0.9797970337084024, + "learning_rate": 6.358406297156169e-05, + "loss": 0.8992, + "step": 4074 + }, + { + "epoch": 1.8472348141432455, + "grad_norm": 1.0347782972689834, + "learning_rate": 6.35738373672852e-05, + "loss": 0.8917, + "step": 4075 + }, + { + "epoch": 1.8476881233000908, + "grad_norm": 0.9712485973767905, + "learning_rate": 6.35636094020196e-05, + "loss": 0.8907, + "step": 4076 + }, + { + "epoch": 1.8481414324569356, + "grad_norm": 0.9601693709074832, + "learning_rate": 6.355337907678927e-05, + "loss": 0.8769, + "step": 4077 + }, + { + "epoch": 1.8485947416137805, + "grad_norm": 0.79826809137413, + "learning_rate": 6.354314639261881e-05, + "loss": 0.8672, + "step": 4078 + }, + { + "epoch": 1.8490480507706257, + "grad_norm": 0.6729959237535736, + "learning_rate": 6.353291135053304e-05, + "loss": 0.8902, + "step": 4079 + }, + { + "epoch": 1.8495013599274706, + "grad_norm": 0.5871278593760876, + "learning_rate": 6.352267395155703e-05, + "loss": 0.8761, + "step": 4080 + }, + { + "epoch": 1.8499546690843154, + "grad_norm": 0.4612683591332279, + "learning_rate": 6.351243419671611e-05, + "loss": 0.8873, + "step": 4081 + }, + { + "epoch": 1.8504079782411604, + "grad_norm": 0.5546342804051085, + "learning_rate": 6.35021920870358e-05, + "loss": 0.8869, + "step": 4082 + }, + { + "epoch": 1.8508612873980055, + "grad_norm": 0.5252835622007773, + "learning_rate": 6.349194762354187e-05, + "loss": 0.8905, + "step": 4083 + }, + { + "epoch": 1.8513145965548503, + "grad_norm": 0.6521462851736936, + "learning_rate": 6.348170080726035e-05, + "loss": 0.8905, + "step": 4084 + }, + { + "epoch": 1.8517679057116954, + "grad_norm": 0.684383622330021, + "learning_rate": 6.34714516392175e-05, + "loss": 0.8774, + "step": 4085 + }, + { + "epoch": 1.8522212148685404, + "grad_norm": 0.7419957387621071, + "learning_rate": 6.346120012043976e-05, + "loss": 0.9064, + "step": 4086 + }, + { + "epoch": 1.8526745240253852, + "grad_norm": 0.7139043092854056, + "learning_rate": 6.345094625195389e-05, + "loss": 0.8819, + "step": 4087 + }, + { + "epoch": 1.8531278331822303, + "grad_norm": 0.8093266030050058, + "learning_rate": 6.344069003478683e-05, + "loss": 0.9009, + "step": 4088 + }, + { + "epoch": 1.8535811423390753, + "grad_norm": 0.8647752045540145, + "learning_rate": 6.343043146996577e-05, + "loss": 0.91, + "step": 4089 + }, + { + "epoch": 1.8540344514959202, + "grad_norm": 0.8886768566573897, + "learning_rate": 6.342017055851814e-05, + "loss": 0.8904, + "step": 4090 + }, + { + "epoch": 1.8544877606527652, + "grad_norm": 0.8766134205964513, + "learning_rate": 6.340990730147159e-05, + "loss": 0.892, + "step": 4091 + }, + { + "epoch": 1.8549410698096103, + "grad_norm": 0.7986308770737333, + "learning_rate": 6.339964169985403e-05, + "loss": 0.8793, + "step": 4092 + }, + { + "epoch": 1.855394378966455, + "grad_norm": 0.6747796642476526, + "learning_rate": 6.338937375469359e-05, + "loss": 0.8998, + "step": 4093 + }, + { + "epoch": 1.8558476881233001, + "grad_norm": 0.6429973597318264, + "learning_rate": 6.337910346701862e-05, + "loss": 0.8951, + "step": 4094 + }, + { + "epoch": 1.8563009972801452, + "grad_norm": 0.6343248615393162, + "learning_rate": 6.336883083785773e-05, + "loss": 0.8906, + "step": 4095 + }, + { + "epoch": 1.85675430643699, + "grad_norm": 0.5938451541122313, + "learning_rate": 6.335855586823976e-05, + "loss": 0.8775, + "step": 4096 + }, + { + "epoch": 1.8572076155938348, + "grad_norm": 0.5511146992139071, + "learning_rate": 6.334827855919375e-05, + "loss": 0.9071, + "step": 4097 + }, + { + "epoch": 1.8576609247506801, + "grad_norm": 0.4784759966541587, + "learning_rate": 6.333799891174905e-05, + "loss": 0.9097, + "step": 4098 + }, + { + "epoch": 1.858114233907525, + "grad_norm": 0.43914737942104065, + "learning_rate": 6.332771692693515e-05, + "loss": 0.8961, + "step": 4099 + }, + { + "epoch": 1.8585675430643698, + "grad_norm": 0.406110022516985, + "learning_rate": 6.331743260578184e-05, + "loss": 0.9183, + "step": 4100 + }, + { + "epoch": 1.8590208522212148, + "grad_norm": 0.5429753823063206, + "learning_rate": 6.330714594931913e-05, + "loss": 0.9121, + "step": 4101 + }, + { + "epoch": 1.8594741613780599, + "grad_norm": 0.6871950007371797, + "learning_rate": 6.329685695857726e-05, + "loss": 0.9198, + "step": 4102 + }, + { + "epoch": 1.8599274705349047, + "grad_norm": 0.6717183090388308, + "learning_rate": 6.328656563458668e-05, + "loss": 0.9146, + "step": 4103 + }, + { + "epoch": 1.8603807796917498, + "grad_norm": 0.5740461762305292, + "learning_rate": 6.327627197837814e-05, + "loss": 0.8923, + "step": 4104 + }, + { + "epoch": 1.8608340888485948, + "grad_norm": 0.48804160834297444, + "learning_rate": 6.326597599098254e-05, + "loss": 0.8797, + "step": 4105 + }, + { + "epoch": 1.8612873980054396, + "grad_norm": 0.541993031295347, + "learning_rate": 6.325567767343106e-05, + "loss": 0.9005, + "step": 4106 + }, + { + "epoch": 1.8617407071622847, + "grad_norm": 0.5812577122013832, + "learning_rate": 6.324537702675512e-05, + "loss": 0.8786, + "step": 4107 + }, + { + "epoch": 1.8621940163191297, + "grad_norm": 0.5512877364050504, + "learning_rate": 6.323507405198636e-05, + "loss": 0.8914, + "step": 4108 + }, + { + "epoch": 1.8626473254759746, + "grad_norm": 0.6095368961478793, + "learning_rate": 6.322476875015663e-05, + "loss": 0.8826, + "step": 4109 + }, + { + "epoch": 1.8631006346328196, + "grad_norm": 0.7043681715200577, + "learning_rate": 6.321446112229808e-05, + "loss": 0.8842, + "step": 4110 + }, + { + "epoch": 1.8635539437896647, + "grad_norm": 0.8401566217015518, + "learning_rate": 6.320415116944301e-05, + "loss": 0.8867, + "step": 4111 + }, + { + "epoch": 1.8640072529465095, + "grad_norm": 0.8743835914449349, + "learning_rate": 6.319383889262399e-05, + "loss": 0.8788, + "step": 4112 + }, + { + "epoch": 1.8644605621033545, + "grad_norm": 0.868300313749438, + "learning_rate": 6.318352429287387e-05, + "loss": 0.8871, + "step": 4113 + }, + { + "epoch": 1.8649138712601996, + "grad_norm": 0.8510918845349745, + "learning_rate": 6.317320737122565e-05, + "loss": 0.879, + "step": 4114 + }, + { + "epoch": 1.8653671804170444, + "grad_norm": 0.8016918761891219, + "learning_rate": 6.316288812871258e-05, + "loss": 0.8876, + "step": 4115 + }, + { + "epoch": 1.8658204895738892, + "grad_norm": 0.673972999657549, + "learning_rate": 6.315256656636821e-05, + "loss": 0.8905, + "step": 4116 + }, + { + "epoch": 1.8662737987307345, + "grad_norm": 0.6305804606509811, + "learning_rate": 6.314224268522624e-05, + "loss": 0.9045, + "step": 4117 + }, + { + "epoch": 1.8667271078875793, + "grad_norm": 0.6220573310745142, + "learning_rate": 6.313191648632067e-05, + "loss": 0.9067, + "step": 4118 + }, + { + "epoch": 1.8671804170444242, + "grad_norm": 0.6226966931583, + "learning_rate": 6.312158797068567e-05, + "loss": 0.8976, + "step": 4119 + }, + { + "epoch": 1.8676337262012692, + "grad_norm": 0.5655378787800734, + "learning_rate": 6.311125713935569e-05, + "loss": 0.886, + "step": 4120 + }, + { + "epoch": 1.8680870353581143, + "grad_norm": 0.528000777501247, + "learning_rate": 6.310092399336537e-05, + "loss": 0.8897, + "step": 4121 + }, + { + "epoch": 1.868540344514959, + "grad_norm": 0.5666141494314196, + "learning_rate": 6.309058853374963e-05, + "loss": 0.9114, + "step": 4122 + }, + { + "epoch": 1.8689936536718041, + "grad_norm": 0.5363004619968242, + "learning_rate": 6.308025076154358e-05, + "loss": 0.9001, + "step": 4123 + }, + { + "epoch": 1.8694469628286492, + "grad_norm": 0.4356187038939563, + "learning_rate": 6.30699106777826e-05, + "loss": 0.8883, + "step": 4124 + }, + { + "epoch": 1.869900271985494, + "grad_norm": 0.414889775620737, + "learning_rate": 6.305956828350224e-05, + "loss": 0.8995, + "step": 4125 + }, + { + "epoch": 1.870353581142339, + "grad_norm": 0.4322731374612673, + "learning_rate": 6.304922357973837e-05, + "loss": 0.9046, + "step": 4126 + }, + { + "epoch": 1.8708068902991841, + "grad_norm": 0.4668489990958511, + "learning_rate": 6.303887656752701e-05, + "loss": 0.8641, + "step": 4127 + }, + { + "epoch": 1.871260199456029, + "grad_norm": 0.4960658713108177, + "learning_rate": 6.302852724790444e-05, + "loss": 0.898, + "step": 4128 + }, + { + "epoch": 1.871713508612874, + "grad_norm": 0.5717444253938594, + "learning_rate": 6.30181756219072e-05, + "loss": 0.8887, + "step": 4129 + }, + { + "epoch": 1.872166817769719, + "grad_norm": 0.5845325712944254, + "learning_rate": 6.300782169057202e-05, + "loss": 0.9025, + "step": 4130 + }, + { + "epoch": 1.8726201269265639, + "grad_norm": 0.5062744504234511, + "learning_rate": 6.299746545493587e-05, + "loss": 0.8981, + "step": 4131 + }, + { + "epoch": 1.873073436083409, + "grad_norm": 0.41102953821964666, + "learning_rate": 6.298710691603597e-05, + "loss": 0.8789, + "step": 4132 + }, + { + "epoch": 1.873526745240254, + "grad_norm": 0.3039689982001319, + "learning_rate": 6.297674607490977e-05, + "loss": 0.9131, + "step": 4133 + }, + { + "epoch": 1.8739800543970988, + "grad_norm": 0.32089866194592365, + "learning_rate": 6.29663829325949e-05, + "loss": 0.8968, + "step": 4134 + }, + { + "epoch": 1.8744333635539436, + "grad_norm": 0.4604856098964121, + "learning_rate": 6.29560174901293e-05, + "loss": 0.9017, + "step": 4135 + }, + { + "epoch": 1.874886672710789, + "grad_norm": 0.5523610786462327, + "learning_rate": 6.294564974855108e-05, + "loss": 0.9042, + "step": 4136 + }, + { + "epoch": 1.8753399818676337, + "grad_norm": 0.5527145208836146, + "learning_rate": 6.293527970889859e-05, + "loss": 0.8923, + "step": 4137 + }, + { + "epoch": 1.8757932910244786, + "grad_norm": 0.590473624662427, + "learning_rate": 6.292490737221043e-05, + "loss": 0.9003, + "step": 4138 + }, + { + "epoch": 1.8762466001813236, + "grad_norm": 0.6178258083868079, + "learning_rate": 6.291453273952544e-05, + "loss": 0.8914, + "step": 4139 + }, + { + "epoch": 1.8766999093381687, + "grad_norm": 0.5564945028463888, + "learning_rate": 6.290415581188264e-05, + "loss": 0.8988, + "step": 4140 + }, + { + "epoch": 1.8771532184950135, + "grad_norm": 0.4336021283986337, + "learning_rate": 6.289377659032133e-05, + "loss": 0.8712, + "step": 4141 + }, + { + "epoch": 1.8776065276518585, + "grad_norm": 0.422135143551568, + "learning_rate": 6.2883395075881e-05, + "loss": 0.9105, + "step": 4142 + }, + { + "epoch": 1.8780598368087036, + "grad_norm": 0.4610779728660864, + "learning_rate": 6.28730112696014e-05, + "loss": 0.8995, + "step": 4143 + }, + { + "epoch": 1.8785131459655484, + "grad_norm": 0.4426598163732573, + "learning_rate": 6.286262517252251e-05, + "loss": 0.8886, + "step": 4144 + }, + { + "epoch": 1.8789664551223935, + "grad_norm": 0.4878034624739042, + "learning_rate": 6.28522367856845e-05, + "loss": 0.874, + "step": 4145 + }, + { + "epoch": 1.8794197642792385, + "grad_norm": 0.4811249425591701, + "learning_rate": 6.284184611012783e-05, + "loss": 0.8617, + "step": 4146 + }, + { + "epoch": 1.8798730734360833, + "grad_norm": 0.398523401312176, + "learning_rate": 6.283145314689313e-05, + "loss": 0.9018, + "step": 4147 + }, + { + "epoch": 1.8803263825929284, + "grad_norm": 0.43831392720761364, + "learning_rate": 6.282105789702131e-05, + "loss": 0.8819, + "step": 4148 + }, + { + "epoch": 1.8807796917497734, + "grad_norm": 0.43299163163530446, + "learning_rate": 6.281066036155346e-05, + "loss": 0.884, + "step": 4149 + }, + { + "epoch": 1.8812330009066183, + "grad_norm": 0.46560587410786475, + "learning_rate": 6.280026054153094e-05, + "loss": 0.881, + "step": 4150 + }, + { + "epoch": 1.8816863100634633, + "grad_norm": 0.5837526950583559, + "learning_rate": 6.278985843799531e-05, + "loss": 0.8905, + "step": 4151 + }, + { + "epoch": 1.8821396192203084, + "grad_norm": 0.6177439838163441, + "learning_rate": 6.27794540519884e-05, + "loss": 0.8972, + "step": 4152 + }, + { + "epoch": 1.8825929283771532, + "grad_norm": 0.49614646628841147, + "learning_rate": 6.27690473845522e-05, + "loss": 0.8962, + "step": 4153 + }, + { + "epoch": 1.883046237533998, + "grad_norm": 0.3765039661768695, + "learning_rate": 6.275863843672899e-05, + "loss": 0.8727, + "step": 4154 + }, + { + "epoch": 1.8834995466908433, + "grad_norm": 0.3845067191840651, + "learning_rate": 6.274822720956125e-05, + "loss": 0.8825, + "step": 4155 + }, + { + "epoch": 1.8839528558476881, + "grad_norm": 0.4332445548073036, + "learning_rate": 6.273781370409172e-05, + "loss": 0.893, + "step": 4156 + }, + { + "epoch": 1.884406165004533, + "grad_norm": 0.5329190760778711, + "learning_rate": 6.272739792136331e-05, + "loss": 0.8901, + "step": 4157 + }, + { + "epoch": 1.8848594741613782, + "grad_norm": 0.6886289420698846, + "learning_rate": 6.27169798624192e-05, + "loss": 0.9041, + "step": 4158 + }, + { + "epoch": 1.885312783318223, + "grad_norm": 0.8082198737585714, + "learning_rate": 6.270655952830279e-05, + "loss": 0.8887, + "step": 4159 + }, + { + "epoch": 1.8857660924750679, + "grad_norm": 0.9434601235162842, + "learning_rate": 6.269613692005771e-05, + "loss": 0.9077, + "step": 4160 + }, + { + "epoch": 1.886219401631913, + "grad_norm": 1.030492716659545, + "learning_rate": 6.268571203872784e-05, + "loss": 0.8736, + "step": 4161 + }, + { + "epoch": 1.886672710788758, + "grad_norm": 0.9585947284791836, + "learning_rate": 6.267528488535721e-05, + "loss": 0.8848, + "step": 4162 + }, + { + "epoch": 1.8871260199456028, + "grad_norm": 0.9294775200152561, + "learning_rate": 6.266485546099017e-05, + "loss": 0.9126, + "step": 4163 + }, + { + "epoch": 1.8875793291024479, + "grad_norm": 0.8593137125795962, + "learning_rate": 6.265442376667124e-05, + "loss": 0.8898, + "step": 4164 + }, + { + "epoch": 1.888032638259293, + "grad_norm": 0.8332270135654987, + "learning_rate": 6.264398980344518e-05, + "loss": 0.895, + "step": 4165 + }, + { + "epoch": 1.8884859474161377, + "grad_norm": 0.8866029922853187, + "learning_rate": 6.2633553572357e-05, + "loss": 0.8819, + "step": 4166 + }, + { + "epoch": 1.8889392565729828, + "grad_norm": 0.9203849273986358, + "learning_rate": 6.262311507445191e-05, + "loss": 0.9137, + "step": 4167 + }, + { + "epoch": 1.8893925657298278, + "grad_norm": 0.748997669861131, + "learning_rate": 6.261267431077537e-05, + "loss": 0.9119, + "step": 4168 + }, + { + "epoch": 1.8898458748866727, + "grad_norm": 0.63560688651663, + "learning_rate": 6.260223128237302e-05, + "loss": 0.9064, + "step": 4169 + }, + { + "epoch": 1.8902991840435177, + "grad_norm": 0.6152824901534124, + "learning_rate": 6.259178599029078e-05, + "loss": 0.9011, + "step": 4170 + }, + { + "epoch": 1.8907524932003628, + "grad_norm": 0.7140844205364775, + "learning_rate": 6.25813384355748e-05, + "loss": 0.871, + "step": 4171 + }, + { + "epoch": 1.8912058023572076, + "grad_norm": 0.7270944417343485, + "learning_rate": 6.257088861927139e-05, + "loss": 0.873, + "step": 4172 + }, + { + "epoch": 1.8916591115140526, + "grad_norm": 0.5855639378814469, + "learning_rate": 6.256043654242716e-05, + "loss": 0.8876, + "step": 4173 + }, + { + "epoch": 1.8921124206708977, + "grad_norm": 0.5317384835387086, + "learning_rate": 6.254998220608891e-05, + "loss": 0.911, + "step": 4174 + }, + { + "epoch": 1.8925657298277425, + "grad_norm": 0.6507004649219627, + "learning_rate": 6.253952561130366e-05, + "loss": 0.8879, + "step": 4175 + }, + { + "epoch": 1.8930190389845873, + "grad_norm": 0.6599048308500733, + "learning_rate": 6.252906675911868e-05, + "loss": 0.8947, + "step": 4176 + }, + { + "epoch": 1.8934723481414326, + "grad_norm": 0.7503908320017902, + "learning_rate": 6.251860565058146e-05, + "loss": 0.885, + "step": 4177 + }, + { + "epoch": 1.8939256572982774, + "grad_norm": 0.8322102804263362, + "learning_rate": 6.250814228673969e-05, + "loss": 0.8987, + "step": 4178 + }, + { + "epoch": 1.8943789664551223, + "grad_norm": 0.8224214780524243, + "learning_rate": 6.249767666864132e-05, + "loss": 0.9099, + "step": 4179 + }, + { + "epoch": 1.8948322756119673, + "grad_norm": 0.7650630859008393, + "learning_rate": 6.248720879733452e-05, + "loss": 0.8991, + "step": 4180 + }, + { + "epoch": 1.8952855847688124, + "grad_norm": 0.7851325785906426, + "learning_rate": 6.247673867386767e-05, + "loss": 0.8754, + "step": 4181 + }, + { + "epoch": 1.8957388939256572, + "grad_norm": 0.7834666143782496, + "learning_rate": 6.246626629928938e-05, + "loss": 0.8827, + "step": 4182 + }, + { + "epoch": 1.8961922030825022, + "grad_norm": 0.8234970821669675, + "learning_rate": 6.245579167464848e-05, + "loss": 0.8752, + "step": 4183 + }, + { + "epoch": 1.8966455122393473, + "grad_norm": 0.7661353689848013, + "learning_rate": 6.244531480099409e-05, + "loss": 0.9094, + "step": 4184 + }, + { + "epoch": 1.8970988213961921, + "grad_norm": 0.6730115779965113, + "learning_rate": 6.243483567937541e-05, + "loss": 0.9202, + "step": 4185 + }, + { + "epoch": 1.8975521305530372, + "grad_norm": 0.5434158028316058, + "learning_rate": 6.242435431084202e-05, + "loss": 0.8924, + "step": 4186 + }, + { + "epoch": 1.8980054397098822, + "grad_norm": 0.4261984678422516, + "learning_rate": 6.241387069644366e-05, + "loss": 0.8626, + "step": 4187 + }, + { + "epoch": 1.898458748866727, + "grad_norm": 0.43787705953311673, + "learning_rate": 6.240338483723024e-05, + "loss": 0.8905, + "step": 4188 + }, + { + "epoch": 1.898912058023572, + "grad_norm": 0.46208921219168625, + "learning_rate": 6.239289673425202e-05, + "loss": 0.8969, + "step": 4189 + }, + { + "epoch": 1.8993653671804172, + "grad_norm": 0.46883229910730434, + "learning_rate": 6.238240638855936e-05, + "loss": 0.8888, + "step": 4190 + }, + { + "epoch": 1.899818676337262, + "grad_norm": 0.494551565410381, + "learning_rate": 6.237191380120292e-05, + "loss": 0.8963, + "step": 4191 + }, + { + "epoch": 1.900271985494107, + "grad_norm": 0.4086803674661086, + "learning_rate": 6.236141897323357e-05, + "loss": 0.8943, + "step": 4192 + }, + { + "epoch": 1.900725294650952, + "grad_norm": 0.3815528307662305, + "learning_rate": 6.23509219057024e-05, + "loss": 0.8979, + "step": 4193 + }, + { + "epoch": 1.901178603807797, + "grad_norm": 0.41735612443239656, + "learning_rate": 6.23404225996607e-05, + "loss": 0.8861, + "step": 4194 + }, + { + "epoch": 1.9016319129646417, + "grad_norm": 0.4702989006319909, + "learning_rate": 6.232992105616002e-05, + "loss": 0.881, + "step": 4195 + }, + { + "epoch": 1.902085222121487, + "grad_norm": 0.4907959850258766, + "learning_rate": 6.231941727625211e-05, + "loss": 0.8844, + "step": 4196 + }, + { + "epoch": 1.9025385312783318, + "grad_norm": 0.49557956459676966, + "learning_rate": 6.230891126098899e-05, + "loss": 0.8872, + "step": 4197 + }, + { + "epoch": 1.9029918404351767, + "grad_norm": 0.3907808297848098, + "learning_rate": 6.229840301142282e-05, + "loss": 0.878, + "step": 4198 + }, + { + "epoch": 1.9034451495920217, + "grad_norm": 0.3932589724162242, + "learning_rate": 6.228789252860607e-05, + "loss": 0.901, + "step": 4199 + }, + { + "epoch": 1.9038984587488668, + "grad_norm": 0.4450864409442323, + "learning_rate": 6.227737981359139e-05, + "loss": 0.8829, + "step": 4200 + }, + { + "epoch": 1.9043517679057116, + "grad_norm": 0.5187179840271785, + "learning_rate": 6.226686486743162e-05, + "loss": 0.883, + "step": 4201 + }, + { + "epoch": 1.9048050770625566, + "grad_norm": 0.7101165329861685, + "learning_rate": 6.22563476911799e-05, + "loss": 0.8771, + "step": 4202 + }, + { + "epoch": 1.9052583862194017, + "grad_norm": 0.9210282316246935, + "learning_rate": 6.224582828588955e-05, + "loss": 0.8782, + "step": 4203 + }, + { + "epoch": 1.9057116953762465, + "grad_norm": 1.0089247581137875, + "learning_rate": 6.223530665261413e-05, + "loss": 0.9046, + "step": 4204 + }, + { + "epoch": 1.9061650045330916, + "grad_norm": 1.117157083243214, + "learning_rate": 6.22247827924074e-05, + "loss": 0.8911, + "step": 4205 + }, + { + "epoch": 1.9066183136899366, + "grad_norm": 0.8106417434587577, + "learning_rate": 6.221425670632334e-05, + "loss": 0.9235, + "step": 4206 + }, + { + "epoch": 1.9070716228467814, + "grad_norm": 0.5645741649106086, + "learning_rate": 6.220372839541618e-05, + "loss": 0.8944, + "step": 4207 + }, + { + "epoch": 1.9075249320036265, + "grad_norm": 0.44934431989659546, + "learning_rate": 6.219319786074038e-05, + "loss": 0.8908, + "step": 4208 + }, + { + "epoch": 1.9079782411604715, + "grad_norm": 0.5115030856649944, + "learning_rate": 6.21826651033506e-05, + "loss": 0.9028, + "step": 4209 + }, + { + "epoch": 1.9084315503173164, + "grad_norm": 0.6118699726729472, + "learning_rate": 6.21721301243017e-05, + "loss": 0.8968, + "step": 4210 + }, + { + "epoch": 1.9088848594741614, + "grad_norm": 0.7139862763025705, + "learning_rate": 6.216159292464881e-05, + "loss": 0.8951, + "step": 4211 + }, + { + "epoch": 1.9093381686310065, + "grad_norm": 0.762435112433903, + "learning_rate": 6.215105350544726e-05, + "loss": 0.9202, + "step": 4212 + }, + { + "epoch": 1.9097914777878513, + "grad_norm": 0.8309807016657387, + "learning_rate": 6.21405118677526e-05, + "loss": 0.901, + "step": 4213 + }, + { + "epoch": 1.9102447869446961, + "grad_norm": 0.9241390629783984, + "learning_rate": 6.21299680126206e-05, + "loss": 0.8808, + "step": 4214 + }, + { + "epoch": 1.9106980961015414, + "grad_norm": 0.8677850520252168, + "learning_rate": 6.211942194110726e-05, + "loss": 0.8997, + "step": 4215 + }, + { + "epoch": 1.9111514052583862, + "grad_norm": 0.754971119554309, + "learning_rate": 6.210887365426881e-05, + "loss": 0.9019, + "step": 4216 + }, + { + "epoch": 1.911604714415231, + "grad_norm": 0.610379566895126, + "learning_rate": 6.20983231531617e-05, + "loss": 0.8829, + "step": 4217 + }, + { + "epoch": 1.912058023572076, + "grad_norm": 0.5797734008758453, + "learning_rate": 6.208777043884257e-05, + "loss": 0.8755, + "step": 4218 + }, + { + "epoch": 1.9125113327289212, + "grad_norm": 0.5391142619880548, + "learning_rate": 6.207721551236832e-05, + "loss": 0.8813, + "step": 4219 + }, + { + "epoch": 1.912964641885766, + "grad_norm": 0.5285568247382887, + "learning_rate": 6.206665837479606e-05, + "loss": 0.8815, + "step": 4220 + }, + { + "epoch": 1.913417951042611, + "grad_norm": 0.5879813087687527, + "learning_rate": 6.205609902718309e-05, + "loss": 0.9352, + "step": 4221 + }, + { + "epoch": 1.913871260199456, + "grad_norm": 0.6244821710446665, + "learning_rate": 6.204553747058699e-05, + "loss": 0.8966, + "step": 4222 + }, + { + "epoch": 1.914324569356301, + "grad_norm": 0.4929697504688915, + "learning_rate": 6.203497370606552e-05, + "loss": 0.9061, + "step": 4223 + }, + { + "epoch": 1.914777878513146, + "grad_norm": 0.38858812445006863, + "learning_rate": 6.202440773467667e-05, + "loss": 0.8831, + "step": 4224 + }, + { + "epoch": 1.915231187669991, + "grad_norm": 0.43935886854912526, + "learning_rate": 6.201383955747866e-05, + "loss": 0.8866, + "step": 4225 + }, + { + "epoch": 1.9156844968268358, + "grad_norm": 0.519688884581569, + "learning_rate": 6.200326917552992e-05, + "loss": 0.8717, + "step": 4226 + }, + { + "epoch": 1.9161378059836809, + "grad_norm": 0.5730348796289453, + "learning_rate": 6.199269658988912e-05, + "loss": 0.8964, + "step": 4227 + }, + { + "epoch": 1.916591115140526, + "grad_norm": 0.5451261287093784, + "learning_rate": 6.198212180161512e-05, + "loss": 0.8755, + "step": 4228 + }, + { + "epoch": 1.9170444242973708, + "grad_norm": 0.43997480091704416, + "learning_rate": 6.1971544811767e-05, + "loss": 0.8979, + "step": 4229 + }, + { + "epoch": 1.9174977334542158, + "grad_norm": 0.4583970602723826, + "learning_rate": 6.19609656214041e-05, + "loss": 0.8799, + "step": 4230 + }, + { + "epoch": 1.9179510426110609, + "grad_norm": 0.6090966381987081, + "learning_rate": 6.195038423158596e-05, + "loss": 0.882, + "step": 4231 + }, + { + "epoch": 1.9184043517679057, + "grad_norm": 0.7456466894632745, + "learning_rate": 6.193980064337232e-05, + "loss": 0.8787, + "step": 4232 + }, + { + "epoch": 1.9188576609247505, + "grad_norm": 0.7985269670056172, + "learning_rate": 6.192921485782317e-05, + "loss": 0.9163, + "step": 4233 + }, + { + "epoch": 1.9193109700815958, + "grad_norm": 0.7860100532664257, + "learning_rate": 6.19186268759987e-05, + "loss": 0.8835, + "step": 4234 + }, + { + "epoch": 1.9197642792384406, + "grad_norm": 0.8511662831708099, + "learning_rate": 6.190803669895932e-05, + "loss": 0.8867, + "step": 4235 + }, + { + "epoch": 1.9202175883952854, + "grad_norm": 0.9065250908434862, + "learning_rate": 6.18974443277657e-05, + "loss": 0.8833, + "step": 4236 + }, + { + "epoch": 1.9206708975521307, + "grad_norm": 0.9192455841588082, + "learning_rate": 6.188684976347866e-05, + "loss": 0.9109, + "step": 4237 + }, + { + "epoch": 1.9211242067089755, + "grad_norm": 0.8932796101073472, + "learning_rate": 6.187625300715929e-05, + "loss": 0.8898, + "step": 4238 + }, + { + "epoch": 1.9215775158658204, + "grad_norm": 0.8907145526040002, + "learning_rate": 6.18656540598689e-05, + "loss": 0.915, + "step": 4239 + }, + { + "epoch": 1.9220308250226654, + "grad_norm": 0.7688288842594142, + "learning_rate": 6.185505292266898e-05, + "loss": 0.8812, + "step": 4240 + }, + { + "epoch": 1.9224841341795105, + "grad_norm": 0.6152322424720795, + "learning_rate": 6.18444495966213e-05, + "loss": 0.8946, + "step": 4241 + }, + { + "epoch": 1.9229374433363553, + "grad_norm": 0.5241064429080163, + "learning_rate": 6.183384408278777e-05, + "loss": 0.9042, + "step": 4242 + }, + { + "epoch": 1.9233907524932004, + "grad_norm": 0.46394896121888046, + "learning_rate": 6.182323638223061e-05, + "loss": 0.8865, + "step": 4243 + }, + { + "epoch": 1.9238440616500454, + "grad_norm": 0.4332491344178371, + "learning_rate": 6.181262649601216e-05, + "loss": 0.8876, + "step": 4244 + }, + { + "epoch": 1.9242973708068902, + "grad_norm": 0.4321025460354482, + "learning_rate": 6.180201442519508e-05, + "loss": 0.8868, + "step": 4245 + }, + { + "epoch": 1.9247506799637353, + "grad_norm": 0.4541990926865616, + "learning_rate": 6.179140017084219e-05, + "loss": 0.8837, + "step": 4246 + }, + { + "epoch": 1.9252039891205803, + "grad_norm": 0.4196768094800348, + "learning_rate": 6.178078373401651e-05, + "loss": 0.8735, + "step": 4247 + }, + { + "epoch": 1.9256572982774252, + "grad_norm": 0.3668710470401487, + "learning_rate": 6.177016511578133e-05, + "loss": 0.9012, + "step": 4248 + }, + { + "epoch": 1.9261106074342702, + "grad_norm": 0.4167138737924348, + "learning_rate": 6.175954431720013e-05, + "loss": 0.891, + "step": 4249 + }, + { + "epoch": 1.9265639165911153, + "grad_norm": 0.4543816015944773, + "learning_rate": 6.174892133933663e-05, + "loss": 0.8825, + "step": 4250 + }, + { + "epoch": 1.92701722574796, + "grad_norm": 0.4538716692970924, + "learning_rate": 6.173829618325473e-05, + "loss": 0.9037, + "step": 4251 + }, + { + "epoch": 1.927470534904805, + "grad_norm": 0.36583079523660045, + "learning_rate": 6.172766885001857e-05, + "loss": 0.9003, + "step": 4252 + }, + { + "epoch": 1.9279238440616502, + "grad_norm": 0.41013768325084593, + "learning_rate": 6.171703934069253e-05, + "loss": 0.8928, + "step": 4253 + }, + { + "epoch": 1.928377153218495, + "grad_norm": 0.5253556854343688, + "learning_rate": 6.170640765634117e-05, + "loss": 0.8918, + "step": 4254 + }, + { + "epoch": 1.9288304623753398, + "grad_norm": 0.6185291190824481, + "learning_rate": 6.16957737980293e-05, + "loss": 0.9169, + "step": 4255 + }, + { + "epoch": 1.929283771532185, + "grad_norm": 0.7171671339456588, + "learning_rate": 6.16851377668219e-05, + "loss": 0.888, + "step": 4256 + }, + { + "epoch": 1.92973708068903, + "grad_norm": 0.731011155423579, + "learning_rate": 6.167449956378425e-05, + "loss": 0.8899, + "step": 4257 + }, + { + "epoch": 1.9301903898458748, + "grad_norm": 0.8035207532873982, + "learning_rate": 6.166385918998176e-05, + "loss": 0.8983, + "step": 4258 + }, + { + "epoch": 1.9306436990027198, + "grad_norm": 0.8580529674619894, + "learning_rate": 6.16532166464801e-05, + "loss": 0.9002, + "step": 4259 + }, + { + "epoch": 1.9310970081595649, + "grad_norm": 0.8741394044748001, + "learning_rate": 6.164257193434514e-05, + "loss": 0.9041, + "step": 4260 + }, + { + "epoch": 1.9315503173164097, + "grad_norm": 0.8678009011934474, + "learning_rate": 6.1631925054643e-05, + "loss": 0.8903, + "step": 4261 + }, + { + "epoch": 1.9320036264732547, + "grad_norm": 0.8963606588481672, + "learning_rate": 6.162127600844e-05, + "loss": 0.8857, + "step": 4262 + }, + { + "epoch": 1.9324569356300998, + "grad_norm": 0.9076901439935136, + "learning_rate": 6.161062479680266e-05, + "loss": 0.8634, + "step": 4263 + }, + { + "epoch": 1.9329102447869446, + "grad_norm": 0.9589739700886213, + "learning_rate": 6.159997142079774e-05, + "loss": 0.892, + "step": 4264 + }, + { + "epoch": 1.9333635539437897, + "grad_norm": 0.7675740676866123, + "learning_rate": 6.15893158814922e-05, + "loss": 0.8842, + "step": 4265 + }, + { + "epoch": 1.9338168631006347, + "grad_norm": 0.7165229454887396, + "learning_rate": 6.15786581799532e-05, + "loss": 0.873, + "step": 4266 + }, + { + "epoch": 1.9342701722574795, + "grad_norm": 0.6413095972986829, + "learning_rate": 6.15679983172482e-05, + "loss": 0.9065, + "step": 4267 + }, + { + "epoch": 1.9347234814143246, + "grad_norm": 0.5959507575614662, + "learning_rate": 6.155733629444477e-05, + "loss": 0.8781, + "step": 4268 + }, + { + "epoch": 1.9351767905711696, + "grad_norm": 0.5757730072599968, + "learning_rate": 6.154667211261075e-05, + "loss": 0.8805, + "step": 4269 + }, + { + "epoch": 1.9356300997280145, + "grad_norm": 0.5506610567390939, + "learning_rate": 6.15360057728142e-05, + "loss": 0.8763, + "step": 4270 + }, + { + "epoch": 1.9360834088848595, + "grad_norm": 0.5045933130357635, + "learning_rate": 6.152533727612338e-05, + "loss": 0.8995, + "step": 4271 + }, + { + "epoch": 1.9365367180417046, + "grad_norm": 0.43833407688861176, + "learning_rate": 6.151466662360675e-05, + "loss": 0.89, + "step": 4272 + }, + { + "epoch": 1.9369900271985494, + "grad_norm": 0.3984819308937653, + "learning_rate": 6.150399381633303e-05, + "loss": 0.886, + "step": 4273 + }, + { + "epoch": 1.9374433363553942, + "grad_norm": 0.3908058319224183, + "learning_rate": 6.149331885537115e-05, + "loss": 0.9102, + "step": 4274 + }, + { + "epoch": 1.9378966455122395, + "grad_norm": 0.30568774209697946, + "learning_rate": 6.14826417417902e-05, + "loss": 0.8777, + "step": 4275 + }, + { + "epoch": 1.9383499546690843, + "grad_norm": 0.38274764126330535, + "learning_rate": 6.147196247665956e-05, + "loss": 0.8949, + "step": 4276 + }, + { + "epoch": 1.9388032638259292, + "grad_norm": 0.42288933643246535, + "learning_rate": 6.146128106104877e-05, + "loss": 0.92, + "step": 4277 + }, + { + "epoch": 1.9392565729827742, + "grad_norm": 0.37369819576539826, + "learning_rate": 6.14505974960276e-05, + "loss": 0.9016, + "step": 4278 + }, + { + "epoch": 1.9397098821396193, + "grad_norm": 0.4772975526805561, + "learning_rate": 6.143991178266606e-05, + "loss": 0.8918, + "step": 4279 + }, + { + "epoch": 1.940163191296464, + "grad_norm": 0.6056403923211979, + "learning_rate": 6.142922392203433e-05, + "loss": 0.8858, + "step": 4280 + }, + { + "epoch": 1.9406165004533091, + "grad_norm": 0.6425484770118037, + "learning_rate": 6.141853391520286e-05, + "loss": 0.9016, + "step": 4281 + }, + { + "epoch": 1.9410698096101542, + "grad_norm": 0.6282528323041735, + "learning_rate": 6.140784176324226e-05, + "loss": 0.8852, + "step": 4282 + }, + { + "epoch": 1.941523118766999, + "grad_norm": 0.6627161550766829, + "learning_rate": 6.139714746722339e-05, + "loss": 0.8761, + "step": 4283 + }, + { + "epoch": 1.941976427923844, + "grad_norm": 0.6518880817874622, + "learning_rate": 6.138645102821732e-05, + "loss": 0.9157, + "step": 4284 + }, + { + "epoch": 1.9424297370806891, + "grad_norm": 0.5407958395608782, + "learning_rate": 6.137575244729532e-05, + "loss": 0.8815, + "step": 4285 + }, + { + "epoch": 1.942883046237534, + "grad_norm": 0.4812199562122479, + "learning_rate": 6.136505172552891e-05, + "loss": 0.8698, + "step": 4286 + }, + { + "epoch": 1.943336355394379, + "grad_norm": 0.468502137267378, + "learning_rate": 6.135434886398976e-05, + "loss": 0.904, + "step": 4287 + }, + { + "epoch": 1.943789664551224, + "grad_norm": 0.4057491381982324, + "learning_rate": 6.134364386374982e-05, + "loss": 0.9068, + "step": 4288 + }, + { + "epoch": 1.9442429737080689, + "grad_norm": 0.4490928839588027, + "learning_rate": 6.13329367258812e-05, + "loss": 0.9075, + "step": 4289 + }, + { + "epoch": 1.944696282864914, + "grad_norm": 0.5002788335352963, + "learning_rate": 6.132222745145629e-05, + "loss": 0.8932, + "step": 4290 + }, + { + "epoch": 1.945149592021759, + "grad_norm": 0.39113823697383554, + "learning_rate": 6.131151604154764e-05, + "loss": 0.8972, + "step": 4291 + }, + { + "epoch": 1.9456029011786038, + "grad_norm": 0.3742315931682835, + "learning_rate": 6.130080249722801e-05, + "loss": 0.8982, + "step": 4292 + }, + { + "epoch": 1.9460562103354486, + "grad_norm": 0.4253567428388246, + "learning_rate": 6.129008681957042e-05, + "loss": 0.8892, + "step": 4293 + }, + { + "epoch": 1.946509519492294, + "grad_norm": 0.541038769577248, + "learning_rate": 6.127936900964805e-05, + "loss": 0.9047, + "step": 4294 + }, + { + "epoch": 1.9469628286491387, + "grad_norm": 0.5184678848203109, + "learning_rate": 6.126864906853435e-05, + "loss": 0.9037, + "step": 4295 + }, + { + "epoch": 1.9474161378059835, + "grad_norm": 0.46337343194061537, + "learning_rate": 6.125792699730295e-05, + "loss": 0.8932, + "step": 4296 + }, + { + "epoch": 1.9478694469628286, + "grad_norm": 0.4822340390616215, + "learning_rate": 6.124720279702766e-05, + "loss": 0.8785, + "step": 4297 + }, + { + "epoch": 1.9483227561196736, + "grad_norm": 0.523358748970124, + "learning_rate": 6.123647646878257e-05, + "loss": 0.8937, + "step": 4298 + }, + { + "epoch": 1.9487760652765185, + "grad_norm": 0.6472220989973186, + "learning_rate": 6.122574801364195e-05, + "loss": 0.8972, + "step": 4299 + }, + { + "epoch": 1.9492293744333635, + "grad_norm": 0.7964114888844782, + "learning_rate": 6.121501743268029e-05, + "loss": 0.8984, + "step": 4300 + }, + { + "epoch": 1.9496826835902086, + "grad_norm": 0.9023237364327655, + "learning_rate": 6.120428472697228e-05, + "loss": 0.8904, + "step": 4301 + }, + { + "epoch": 1.9501359927470534, + "grad_norm": 0.9864911577695431, + "learning_rate": 6.119354989759283e-05, + "loss": 0.889, + "step": 4302 + }, + { + "epoch": 1.9505893019038985, + "grad_norm": 0.9994298592943544, + "learning_rate": 6.118281294561709e-05, + "loss": 0.8791, + "step": 4303 + }, + { + "epoch": 1.9510426110607435, + "grad_norm": 0.8935174897219039, + "learning_rate": 6.117207387212037e-05, + "loss": 0.9057, + "step": 4304 + }, + { + "epoch": 1.9514959202175883, + "grad_norm": 0.7789634876236665, + "learning_rate": 6.116133267817822e-05, + "loss": 0.9046, + "step": 4305 + }, + { + "epoch": 1.9519492293744334, + "grad_norm": 0.8086335673337979, + "learning_rate": 6.115058936486641e-05, + "loss": 0.8952, + "step": 4306 + }, + { + "epoch": 1.9524025385312784, + "grad_norm": 0.739189957777655, + "learning_rate": 6.113984393326092e-05, + "loss": 0.8984, + "step": 4307 + }, + { + "epoch": 1.9528558476881233, + "grad_norm": 0.5209563626200889, + "learning_rate": 6.112909638443795e-05, + "loss": 0.8878, + "step": 4308 + }, + { + "epoch": 1.9533091568449683, + "grad_norm": 0.39730948585516196, + "learning_rate": 6.111834671947386e-05, + "loss": 0.8991, + "step": 4309 + }, + { + "epoch": 1.9537624660018134, + "grad_norm": 0.5284957214339676, + "learning_rate": 6.110759493944528e-05, + "loss": 0.8777, + "step": 4310 + }, + { + "epoch": 1.9542157751586582, + "grad_norm": 0.6021405774055241, + "learning_rate": 6.109684104542902e-05, + "loss": 0.8954, + "step": 4311 + }, + { + "epoch": 1.954669084315503, + "grad_norm": 0.5793785787506259, + "learning_rate": 6.108608503850215e-05, + "loss": 0.8957, + "step": 4312 + }, + { + "epoch": 1.9551223934723483, + "grad_norm": 0.5801819438491267, + "learning_rate": 6.107532691974188e-05, + "loss": 0.8882, + "step": 4313 + }, + { + "epoch": 1.9555757026291931, + "grad_norm": 0.5602207481624712, + "learning_rate": 6.10645666902257e-05, + "loss": 0.8867, + "step": 4314 + }, + { + "epoch": 1.956029011786038, + "grad_norm": 0.5624375196923395, + "learning_rate": 6.105380435103124e-05, + "loss": 0.884, + "step": 4315 + }, + { + "epoch": 1.956482320942883, + "grad_norm": 0.5344124754639994, + "learning_rate": 6.10430399032364e-05, + "loss": 0.8887, + "step": 4316 + }, + { + "epoch": 1.956935630099728, + "grad_norm": 0.5755244449833603, + "learning_rate": 6.1032273347919265e-05, + "loss": 0.8902, + "step": 4317 + }, + { + "epoch": 1.9573889392565729, + "grad_norm": 0.7243443814110515, + "learning_rate": 6.1021504686158144e-05, + "loss": 0.8828, + "step": 4318 + }, + { + "epoch": 1.957842248413418, + "grad_norm": 0.9171033450072539, + "learning_rate": 6.101073391903155e-05, + "loss": 0.8989, + "step": 4319 + }, + { + "epoch": 1.958295557570263, + "grad_norm": 0.9691417315231818, + "learning_rate": 6.09999610476182e-05, + "loss": 0.9081, + "step": 4320 + }, + { + "epoch": 1.9587488667271078, + "grad_norm": 0.9149549555820755, + "learning_rate": 6.0989186072997046e-05, + "loss": 0.8896, + "step": 4321 + }, + { + "epoch": 1.9592021758839528, + "grad_norm": 0.8287504370187602, + "learning_rate": 6.0978408996247214e-05, + "loss": 0.8973, + "step": 4322 + }, + { + "epoch": 1.959655485040798, + "grad_norm": 0.8478279247024751, + "learning_rate": 6.096762981844806e-05, + "loss": 0.9147, + "step": 4323 + }, + { + "epoch": 1.9601087941976427, + "grad_norm": 0.8234002259546819, + "learning_rate": 6.095684854067918e-05, + "loss": 0.8915, + "step": 4324 + }, + { + "epoch": 1.9605621033544878, + "grad_norm": 0.7232125209730338, + "learning_rate": 6.09460651640203e-05, + "loss": 0.906, + "step": 4325 + }, + { + "epoch": 1.9610154125113328, + "grad_norm": 0.5416930816437728, + "learning_rate": 6.093527968955144e-05, + "loss": 0.8866, + "step": 4326 + }, + { + "epoch": 1.9614687216681777, + "grad_norm": 0.44836489380888045, + "learning_rate": 6.09244921183528e-05, + "loss": 0.9044, + "step": 4327 + }, + { + "epoch": 1.9619220308250227, + "grad_norm": 0.40526191343460977, + "learning_rate": 6.091370245150477e-05, + "loss": 0.9007, + "step": 4328 + }, + { + "epoch": 1.9623753399818678, + "grad_norm": 0.4490722379696141, + "learning_rate": 6.090291069008798e-05, + "loss": 0.8976, + "step": 4329 + }, + { + "epoch": 1.9628286491387126, + "grad_norm": 0.5305722806181034, + "learning_rate": 6.089211683518325e-05, + "loss": 0.8918, + "step": 4330 + }, + { + "epoch": 1.9632819582955574, + "grad_norm": 0.703041415054904, + "learning_rate": 6.088132088787161e-05, + "loss": 0.9025, + "step": 4331 + }, + { + "epoch": 1.9637352674524027, + "grad_norm": 0.9014554261444366, + "learning_rate": 6.087052284923433e-05, + "loss": 0.8925, + "step": 4332 + }, + { + "epoch": 1.9641885766092475, + "grad_norm": 1.0449274285748011, + "learning_rate": 6.085972272035284e-05, + "loss": 0.8991, + "step": 4333 + }, + { + "epoch": 1.9646418857660923, + "grad_norm": 0.9775773966024461, + "learning_rate": 6.084892050230883e-05, + "loss": 0.8835, + "step": 4334 + }, + { + "epoch": 1.9650951949229376, + "grad_norm": 0.8607541142786397, + "learning_rate": 6.083811619618414e-05, + "loss": 0.8899, + "step": 4335 + }, + { + "epoch": 1.9655485040797824, + "grad_norm": 0.6191432399866257, + "learning_rate": 6.082730980306088e-05, + "loss": 0.8744, + "step": 4336 + }, + { + "epoch": 1.9660018132366273, + "grad_norm": 0.3239165100473134, + "learning_rate": 6.081650132402132e-05, + "loss": 0.9067, + "step": 4337 + }, + { + "epoch": 1.9664551223934723, + "grad_norm": 0.38564787457188243, + "learning_rate": 6.080569076014799e-05, + "loss": 0.8803, + "step": 4338 + }, + { + "epoch": 1.9669084315503174, + "grad_norm": 0.6023621804125251, + "learning_rate": 6.0794878112523566e-05, + "loss": 0.872, + "step": 4339 + }, + { + "epoch": 1.9673617407071622, + "grad_norm": 0.6806279127792966, + "learning_rate": 6.0784063382231e-05, + "loss": 0.8854, + "step": 4340 + }, + { + "epoch": 1.9678150498640072, + "grad_norm": 0.7779568404396721, + "learning_rate": 6.07732465703534e-05, + "loss": 0.9002, + "step": 4341 + }, + { + "epoch": 1.9682683590208523, + "grad_norm": 0.8287067866421964, + "learning_rate": 6.076242767797409e-05, + "loss": 0.9057, + "step": 4342 + }, + { + "epoch": 1.9687216681776971, + "grad_norm": 0.840082503668762, + "learning_rate": 6.0751606706176646e-05, + "loss": 0.8911, + "step": 4343 + }, + { + "epoch": 1.9691749773345422, + "grad_norm": 0.7888255286854541, + "learning_rate": 6.0740783656044805e-05, + "loss": 0.9041, + "step": 4344 + }, + { + "epoch": 1.9696282864913872, + "grad_norm": 0.6637200700186041, + "learning_rate": 6.072995852866252e-05, + "loss": 0.8765, + "step": 4345 + }, + { + "epoch": 1.970081595648232, + "grad_norm": 0.4842927185634085, + "learning_rate": 6.071913132511395e-05, + "loss": 0.8936, + "step": 4346 + }, + { + "epoch": 1.970534904805077, + "grad_norm": 0.36107916459304384, + "learning_rate": 6.0708302046483496e-05, + "loss": 0.9073, + "step": 4347 + }, + { + "epoch": 1.9709882139619221, + "grad_norm": 0.4357845923023063, + "learning_rate": 6.069747069385573e-05, + "loss": 0.8816, + "step": 4348 + }, + { + "epoch": 1.971441523118767, + "grad_norm": 0.4317679725227376, + "learning_rate": 6.068663726831546e-05, + "loss": 0.8932, + "step": 4349 + }, + { + "epoch": 1.971894832275612, + "grad_norm": 0.5728366923199912, + "learning_rate": 6.0675801770947646e-05, + "loss": 0.8903, + "step": 4350 + }, + { + "epoch": 1.972348141432457, + "grad_norm": 0.7370377612545607, + "learning_rate": 6.0664964202837534e-05, + "loss": 0.8986, + "step": 4351 + }, + { + "epoch": 1.972801450589302, + "grad_norm": 0.7327223768138144, + "learning_rate": 6.065412456507052e-05, + "loss": 0.8919, + "step": 4352 + }, + { + "epoch": 1.9732547597461467, + "grad_norm": 0.631483668950669, + "learning_rate": 6.0643282858732234e-05, + "loss": 0.8807, + "step": 4353 + }, + { + "epoch": 1.973708068902992, + "grad_norm": 0.5329538238824669, + "learning_rate": 6.063243908490849e-05, + "loss": 0.9025, + "step": 4354 + }, + { + "epoch": 1.9741613780598368, + "grad_norm": 0.5299518406946434, + "learning_rate": 6.062159324468534e-05, + "loss": 0.8881, + "step": 4355 + }, + { + "epoch": 1.9746146872166817, + "grad_norm": 0.5628662526790137, + "learning_rate": 6.061074533914902e-05, + "loss": 0.8613, + "step": 4356 + }, + { + "epoch": 1.9750679963735267, + "grad_norm": 0.5194948531460198, + "learning_rate": 6.059989536938599e-05, + "loss": 0.8883, + "step": 4357 + }, + { + "epoch": 1.9755213055303718, + "grad_norm": 0.5089371081520179, + "learning_rate": 6.058904333648288e-05, + "loss": 0.8977, + "step": 4358 + }, + { + "epoch": 1.9759746146872166, + "grad_norm": 0.5876922503846671, + "learning_rate": 6.057818924152658e-05, + "loss": 0.8967, + "step": 4359 + }, + { + "epoch": 1.9764279238440616, + "grad_norm": 0.6814718776665586, + "learning_rate": 6.056733308560415e-05, + "loss": 0.8767, + "step": 4360 + }, + { + "epoch": 1.9768812330009067, + "grad_norm": 0.6848252663150104, + "learning_rate": 6.055647486980286e-05, + "loss": 0.8937, + "step": 4361 + }, + { + "epoch": 1.9773345421577515, + "grad_norm": 0.6516010710405016, + "learning_rate": 6.0545614595210205e-05, + "loss": 0.9056, + "step": 4362 + }, + { + "epoch": 1.9777878513145966, + "grad_norm": 0.5904775602325584, + "learning_rate": 6.053475226291385e-05, + "loss": 0.916, + "step": 4363 + }, + { + "epoch": 1.9782411604714416, + "grad_norm": 0.6257082925183015, + "learning_rate": 6.052388787400173e-05, + "loss": 0.9027, + "step": 4364 + }, + { + "epoch": 1.9786944696282864, + "grad_norm": 0.6519665930887986, + "learning_rate": 6.05130214295619e-05, + "loss": 0.8863, + "step": 4365 + }, + { + "epoch": 1.9791477787851315, + "grad_norm": 0.6409833796074436, + "learning_rate": 6.0502152930682696e-05, + "loss": 0.8875, + "step": 4366 + }, + { + "epoch": 1.9796010879419765, + "grad_norm": 0.6571912483287786, + "learning_rate": 6.049128237845262e-05, + "loss": 0.8843, + "step": 4367 + }, + { + "epoch": 1.9800543970988214, + "grad_norm": 0.684373355520645, + "learning_rate": 6.0480409773960386e-05, + "loss": 0.8912, + "step": 4368 + }, + { + "epoch": 1.9805077062556664, + "grad_norm": 0.7074984985386064, + "learning_rate": 6.046953511829493e-05, + "loss": 0.8873, + "step": 4369 + }, + { + "epoch": 1.9809610154125115, + "grad_norm": 0.6579066286838406, + "learning_rate": 6.045865841254536e-05, + "loss": 0.8801, + "step": 4370 + }, + { + "epoch": 1.9814143245693563, + "grad_norm": 0.5334619814751139, + "learning_rate": 6.044777965780103e-05, + "loss": 0.8811, + "step": 4371 + }, + { + "epoch": 1.9818676337262011, + "grad_norm": 0.47795245848301976, + "learning_rate": 6.0436898855151486e-05, + "loss": 0.8961, + "step": 4372 + }, + { + "epoch": 1.9823209428830464, + "grad_norm": 0.5781913519452522, + "learning_rate": 6.0426016005686455e-05, + "loss": 0.8863, + "step": 4373 + }, + { + "epoch": 1.9827742520398912, + "grad_norm": 0.5339137050202987, + "learning_rate": 6.041513111049588e-05, + "loss": 0.9075, + "step": 4374 + }, + { + "epoch": 1.983227561196736, + "grad_norm": 0.5195707578549125, + "learning_rate": 6.040424417066993e-05, + "loss": 0.8982, + "step": 4375 + }, + { + "epoch": 1.983680870353581, + "grad_norm": 0.47675457568619184, + "learning_rate": 6.039335518729895e-05, + "loss": 0.8806, + "step": 4376 + }, + { + "epoch": 1.9841341795104261, + "grad_norm": 0.5063223286377632, + "learning_rate": 6.0382464161473524e-05, + "loss": 0.902, + "step": 4377 + }, + { + "epoch": 1.984587488667271, + "grad_norm": 0.5468461629496405, + "learning_rate": 6.037157109428441e-05, + "loss": 0.8764, + "step": 4378 + }, + { + "epoch": 1.985040797824116, + "grad_norm": 0.3895310105963255, + "learning_rate": 6.0360675986822574e-05, + "loss": 0.8952, + "step": 4379 + }, + { + "epoch": 1.985494106980961, + "grad_norm": 0.40156797770779484, + "learning_rate": 6.03497788401792e-05, + "loss": 0.8834, + "step": 4380 + }, + { + "epoch": 1.985947416137806, + "grad_norm": 0.5177577821827013, + "learning_rate": 6.0338879655445664e-05, + "loss": 0.8814, + "step": 4381 + }, + { + "epoch": 1.986400725294651, + "grad_norm": 0.5460472322401857, + "learning_rate": 6.0327978433713574e-05, + "loss": 0.8779, + "step": 4382 + }, + { + "epoch": 1.986854034451496, + "grad_norm": 0.6678568574875138, + "learning_rate": 6.031707517607469e-05, + "loss": 0.9062, + "step": 4383 + }, + { + "epoch": 1.9873073436083408, + "grad_norm": 0.7610842535823695, + "learning_rate": 6.030616988362101e-05, + "loss": 0.9046, + "step": 4384 + }, + { + "epoch": 1.9877606527651859, + "grad_norm": 0.8968038739715484, + "learning_rate": 6.029526255744474e-05, + "loss": 0.8915, + "step": 4385 + }, + { + "epoch": 1.988213961922031, + "grad_norm": 0.9513851187766068, + "learning_rate": 6.028435319863827e-05, + "loss": 0.8856, + "step": 4386 + }, + { + "epoch": 1.9886672710788758, + "grad_norm": 0.9826235998098587, + "learning_rate": 6.0273441808294224e-05, + "loss": 0.8718, + "step": 4387 + }, + { + "epoch": 1.9891205802357208, + "grad_norm": 1.0368441666957495, + "learning_rate": 6.02625283875054e-05, + "loss": 0.8789, + "step": 4388 + }, + { + "epoch": 1.9895738893925659, + "grad_norm": 0.8479499367125268, + "learning_rate": 6.02516129373648e-05, + "loss": 0.8801, + "step": 4389 + }, + { + "epoch": 1.9900271985494107, + "grad_norm": 4.479442943840953, + "learning_rate": 6.0240695458965663e-05, + "loss": 0.9343, + "step": 4390 + }, + { + "epoch": 1.9904805077062555, + "grad_norm": 1.3043542923666624, + "learning_rate": 6.0229775953401375e-05, + "loss": 0.8863, + "step": 4391 + }, + { + "epoch": 1.9909338168631008, + "grad_norm": 1.4486937196014096, + "learning_rate": 6.021885442176556e-05, + "loss": 0.8928, + "step": 4392 + }, + { + "epoch": 1.9913871260199456, + "grad_norm": 0.6123235250524953, + "learning_rate": 6.0207930865152085e-05, + "loss": 0.8938, + "step": 4393 + }, + { + "epoch": 1.9918404351767904, + "grad_norm": 1.9088506495194186, + "learning_rate": 6.0197005284654916e-05, + "loss": 0.8906, + "step": 4394 + }, + { + "epoch": 1.9922937443336355, + "grad_norm": 0.8559287457673491, + "learning_rate": 6.018607768136832e-05, + "loss": 0.8874, + "step": 4395 + }, + { + "epoch": 1.9927470534904805, + "grad_norm": 2.337706410645413, + "learning_rate": 6.017514805638671e-05, + "loss": 0.9166, + "step": 4396 + }, + { + "epoch": 1.9932003626473254, + "grad_norm": 1.8922433633379754, + "learning_rate": 6.016421641080474e-05, + "loss": 0.8989, + "step": 4397 + }, + { + "epoch": 1.9936536718041704, + "grad_norm": 1.603650773808297, + "learning_rate": 6.0153282745717225e-05, + "loss": 0.9009, + "step": 4398 + }, + { + "epoch": 1.9941069809610155, + "grad_norm": 1.848741165073692, + "learning_rate": 6.014234706221922e-05, + "loss": 0.9149, + "step": 4399 + }, + { + "epoch": 1.9945602901178603, + "grad_norm": 1.4699498518903622, + "learning_rate": 6.0131409361405956e-05, + "loss": 0.9043, + "step": 4400 + }, + { + "epoch": 1.9950135992747053, + "grad_norm": 1.435666173090582, + "learning_rate": 6.012046964437289e-05, + "loss": 0.8911, + "step": 4401 + }, + { + "epoch": 1.9954669084315504, + "grad_norm": 1.2952290581290675, + "learning_rate": 6.010952791221565e-05, + "loss": 0.9041, + "step": 4402 + }, + { + "epoch": 1.9959202175883952, + "grad_norm": 1.3585897821350794, + "learning_rate": 6.009858416603008e-05, + "loss": 0.8762, + "step": 4403 + }, + { + "epoch": 1.9963735267452403, + "grad_norm": 1.0644695951903798, + "learning_rate": 6.008763840691224e-05, + "loss": 0.8829, + "step": 4404 + }, + { + "epoch": 1.9968268359020853, + "grad_norm": 1.5751656808472843, + "learning_rate": 6.007669063595838e-05, + "loss": 0.9139, + "step": 4405 + }, + { + "epoch": 1.9972801450589301, + "grad_norm": 0.991250865004389, + "learning_rate": 6.0065740854264955e-05, + "loss": 0.8996, + "step": 4406 + }, + { + "epoch": 1.9977334542157752, + "grad_norm": 1.8587232556057705, + "learning_rate": 6.00547890629286e-05, + "loss": 0.8834, + "step": 4407 + }, + { + "epoch": 1.9981867633726202, + "grad_norm": 1.4481550775776828, + "learning_rate": 6.004383526304619e-05, + "loss": 0.9087, + "step": 4408 + }, + { + "epoch": 1.998640072529465, + "grad_norm": 1.5981815406500834, + "learning_rate": 6.003287945571476e-05, + "loss": 0.8787, + "step": 4409 + }, + { + "epoch": 1.99909338168631, + "grad_norm": 1.2621367186773798, + "learning_rate": 6.0021921642031593e-05, + "loss": 0.8962, + "step": 4410 + }, + { + "epoch": 1.9995466908431552, + "grad_norm": 1.422988961298709, + "learning_rate": 6.001096182309412e-05, + "loss": 0.859, + "step": 4411 + }, + { + "epoch": 2.0, + "grad_norm": 1.0579517603299193, + "learning_rate": 6.000000000000001e-05, + "loss": 0.9021, + "step": 4412 + }, + { + "epoch": 2.000453309156845, + "grad_norm": 1.3693503507380156, + "learning_rate": 5.998903617384712e-05, + "loss": 0.8959, + "step": 4413 + }, + { + "epoch": 2.00090661831369, + "grad_norm": 0.8828831754594394, + "learning_rate": 5.9978070345733505e-05, + "loss": 0.8846, + "step": 4414 + }, + { + "epoch": 2.001359927470535, + "grad_norm": 1.3912387595878701, + "learning_rate": 5.996710251675744e-05, + "loss": 0.8893, + "step": 4415 + }, + { + "epoch": 2.0018132366273798, + "grad_norm": 0.8915105445974222, + "learning_rate": 5.995613268801737e-05, + "loss": 0.9044, + "step": 4416 + }, + { + "epoch": 2.002266545784225, + "grad_norm": 1.3371715963272475, + "learning_rate": 5.994516086061197e-05, + "loss": 0.885, + "step": 4417 + }, + { + "epoch": 2.00271985494107, + "grad_norm": 0.966143456009693, + "learning_rate": 5.9934187035640074e-05, + "loss": 0.8871, + "step": 4418 + }, + { + "epoch": 2.0031731640979147, + "grad_norm": 1.4217342810261684, + "learning_rate": 5.992321121420079e-05, + "loss": 0.8639, + "step": 4419 + }, + { + "epoch": 2.00362647325476, + "grad_norm": 1.0808071833658555, + "learning_rate": 5.991223339739332e-05, + "loss": 0.8754, + "step": 4420 + }, + { + "epoch": 2.004079782411605, + "grad_norm": 1.292437578602291, + "learning_rate": 5.9901253586317175e-05, + "loss": 0.8632, + "step": 4421 + }, + { + "epoch": 2.0045330915684496, + "grad_norm": 1.1796723571980072, + "learning_rate": 5.989027178207199e-05, + "loss": 0.864, + "step": 4422 + }, + { + "epoch": 2.0049864007252944, + "grad_norm": 1.0903977369284323, + "learning_rate": 5.9879287985757624e-05, + "loss": 0.8658, + "step": 4423 + }, + { + "epoch": 2.0054397098821397, + "grad_norm": 1.087617383554328, + "learning_rate": 5.986830219847414e-05, + "loss": 0.8636, + "step": 4424 + }, + { + "epoch": 2.0058930190389845, + "grad_norm": 0.8642163672687267, + "learning_rate": 5.98573144213218e-05, + "loss": 0.8799, + "step": 4425 + }, + { + "epoch": 2.0063463281958294, + "grad_norm": 0.8183342862442501, + "learning_rate": 5.984632465540107e-05, + "loss": 0.8581, + "step": 4426 + }, + { + "epoch": 2.0067996373526746, + "grad_norm": 0.7812570514830235, + "learning_rate": 5.983533290181259e-05, + "loss": 0.8925, + "step": 4427 + }, + { + "epoch": 2.0072529465095195, + "grad_norm": 0.8043295712983869, + "learning_rate": 5.9824339161657216e-05, + "loss": 0.8847, + "step": 4428 + }, + { + "epoch": 2.0077062556663643, + "grad_norm": 0.7235935773280668, + "learning_rate": 5.981334343603603e-05, + "loss": 0.8665, + "step": 4429 + }, + { + "epoch": 2.0081595648232096, + "grad_norm": 0.8315540172254899, + "learning_rate": 5.980234572605026e-05, + "loss": 0.8974, + "step": 4430 + }, + { + "epoch": 2.0086128739800544, + "grad_norm": 0.7302624377976467, + "learning_rate": 5.9791346032801375e-05, + "loss": 0.8799, + "step": 4431 + }, + { + "epoch": 2.009066183136899, + "grad_norm": 0.5946408202164674, + "learning_rate": 5.978034435739101e-05, + "loss": 0.852, + "step": 4432 + }, + { + "epoch": 2.0095194922937445, + "grad_norm": 0.7630621354511823, + "learning_rate": 5.976934070092103e-05, + "loss": 0.8832, + "step": 4433 + }, + { + "epoch": 2.0099728014505893, + "grad_norm": 0.6691143664065395, + "learning_rate": 5.9758335064493484e-05, + "loss": 0.8737, + "step": 4434 + }, + { + "epoch": 2.010426110607434, + "grad_norm": 0.585357783431062, + "learning_rate": 5.97473274492106e-05, + "loss": 0.8762, + "step": 4435 + }, + { + "epoch": 2.0108794197642794, + "grad_norm": 0.61182036773268, + "learning_rate": 5.9736317856174844e-05, + "loss": 0.8713, + "step": 4436 + }, + { + "epoch": 2.0113327289211242, + "grad_norm": 0.4714387909412953, + "learning_rate": 5.972530628648884e-05, + "loss": 0.8585, + "step": 4437 + }, + { + "epoch": 2.011786038077969, + "grad_norm": 0.4888272816253385, + "learning_rate": 5.971429274125545e-05, + "loss": 0.8555, + "step": 4438 + }, + { + "epoch": 2.0122393472348143, + "grad_norm": 0.4310515494877047, + "learning_rate": 5.97032772215777e-05, + "loss": 0.8752, + "step": 4439 + }, + { + "epoch": 2.012692656391659, + "grad_norm": 0.5748180008778407, + "learning_rate": 5.9692259728558834e-05, + "loss": 0.872, + "step": 4440 + }, + { + "epoch": 2.013145965548504, + "grad_norm": 0.5335792435495892, + "learning_rate": 5.9681240263302275e-05, + "loss": 0.8985, + "step": 4441 + }, + { + "epoch": 2.013599274705349, + "grad_norm": 0.4553141987300091, + "learning_rate": 5.967021882691166e-05, + "loss": 0.8668, + "step": 4442 + }, + { + "epoch": 2.014052583862194, + "grad_norm": 0.35294536290185724, + "learning_rate": 5.965919542049083e-05, + "loss": 0.8832, + "step": 4443 + }, + { + "epoch": 2.014505893019039, + "grad_norm": 0.4033452405015096, + "learning_rate": 5.964817004514379e-05, + "loss": 0.8698, + "step": 4444 + }, + { + "epoch": 2.0149592021758838, + "grad_norm": 0.4946055258597081, + "learning_rate": 5.9637142701974774e-05, + "loss": 0.8668, + "step": 4445 + }, + { + "epoch": 2.015412511332729, + "grad_norm": 0.561570902098644, + "learning_rate": 5.9626113392088214e-05, + "loss": 0.8994, + "step": 4446 + }, + { + "epoch": 2.015865820489574, + "grad_norm": 0.5382797939207867, + "learning_rate": 5.9615082116588714e-05, + "loss": 0.8791, + "step": 4447 + }, + { + "epoch": 2.0163191296464187, + "grad_norm": 0.42622687977317536, + "learning_rate": 5.96040488765811e-05, + "loss": 0.8857, + "step": 4448 + }, + { + "epoch": 2.016772438803264, + "grad_norm": 0.35887571588245143, + "learning_rate": 5.959301367317037e-05, + "loss": 0.8666, + "step": 4449 + }, + { + "epoch": 2.017225747960109, + "grad_norm": 0.3617734286220932, + "learning_rate": 5.958197650746173e-05, + "loss": 0.872, + "step": 4450 + }, + { + "epoch": 2.0176790571169536, + "grad_norm": 0.511402810323289, + "learning_rate": 5.95709373805606e-05, + "loss": 0.8583, + "step": 4451 + }, + { + "epoch": 2.018132366273799, + "grad_norm": 0.5827522234559757, + "learning_rate": 5.955989629357256e-05, + "loss": 0.8885, + "step": 4452 + }, + { + "epoch": 2.0185856754306437, + "grad_norm": 0.530827789283048, + "learning_rate": 5.954885324760344e-05, + "loss": 0.871, + "step": 4453 + }, + { + "epoch": 2.0190389845874885, + "grad_norm": 0.41666005056817096, + "learning_rate": 5.95378082437592e-05, + "loss": 0.876, + "step": 4454 + }, + { + "epoch": 2.019492293744334, + "grad_norm": 0.3727834656143721, + "learning_rate": 5.952676128314605e-05, + "loss": 0.8761, + "step": 4455 + }, + { + "epoch": 2.0199456029011786, + "grad_norm": 0.4116798078709951, + "learning_rate": 5.951571236687036e-05, + "loss": 0.8677, + "step": 4456 + }, + { + "epoch": 2.0203989120580235, + "grad_norm": 0.41371495894507987, + "learning_rate": 5.9504661496038736e-05, + "loss": 0.8715, + "step": 4457 + }, + { + "epoch": 2.0208522212148687, + "grad_norm": 0.4358248234375833, + "learning_rate": 5.9493608671757935e-05, + "loss": 0.8833, + "step": 4458 + }, + { + "epoch": 2.0213055303717136, + "grad_norm": 0.42466589843191915, + "learning_rate": 5.9482553895134935e-05, + "loss": 0.8752, + "step": 4459 + }, + { + "epoch": 2.0217588395285584, + "grad_norm": 0.38756646913486437, + "learning_rate": 5.9471497167276905e-05, + "loss": 0.8747, + "step": 4460 + }, + { + "epoch": 2.0222121486854037, + "grad_norm": 0.44194452611547214, + "learning_rate": 5.9460438489291206e-05, + "loss": 0.8779, + "step": 4461 + }, + { + "epoch": 2.0226654578422485, + "grad_norm": 0.4579894239854562, + "learning_rate": 5.94493778622854e-05, + "loss": 0.881, + "step": 4462 + }, + { + "epoch": 2.0231187669990933, + "grad_norm": 0.4158393656748826, + "learning_rate": 5.9438315287367234e-05, + "loss": 0.8745, + "step": 4463 + }, + { + "epoch": 2.023572076155938, + "grad_norm": 0.39296016585011584, + "learning_rate": 5.942725076564468e-05, + "loss": 0.9005, + "step": 4464 + }, + { + "epoch": 2.0240253853127834, + "grad_norm": 0.41853273507182476, + "learning_rate": 5.941618429822585e-05, + "loss": 0.8694, + "step": 4465 + }, + { + "epoch": 2.0244786944696282, + "grad_norm": 0.4928133618014884, + "learning_rate": 5.940511588621911e-05, + "loss": 0.8762, + "step": 4466 + }, + { + "epoch": 2.024932003626473, + "grad_norm": 0.5410015683251687, + "learning_rate": 5.939404553073297e-05, + "loss": 0.8739, + "step": 4467 + }, + { + "epoch": 2.0253853127833183, + "grad_norm": 0.5260626023718433, + "learning_rate": 5.938297323287618e-05, + "loss": 0.8718, + "step": 4468 + }, + { + "epoch": 2.025838621940163, + "grad_norm": 0.346935712879825, + "learning_rate": 5.937189899375766e-05, + "loss": 0.8768, + "step": 4469 + }, + { + "epoch": 2.026291931097008, + "grad_norm": 0.3165950763072655, + "learning_rate": 5.936082281448652e-05, + "loss": 0.8653, + "step": 4470 + }, + { + "epoch": 2.0267452402538533, + "grad_norm": 0.46830031206037454, + "learning_rate": 5.9349744696172075e-05, + "loss": 0.8905, + "step": 4471 + }, + { + "epoch": 2.027198549410698, + "grad_norm": 0.44010011454346676, + "learning_rate": 5.933866463992383e-05, + "loss": 0.8709, + "step": 4472 + }, + { + "epoch": 2.027651858567543, + "grad_norm": 0.38155558889248486, + "learning_rate": 5.932758264685148e-05, + "loss": 0.8494, + "step": 4473 + }, + { + "epoch": 2.028105167724388, + "grad_norm": 0.41948133521653275, + "learning_rate": 5.9316498718064925e-05, + "loss": 0.8879, + "step": 4474 + }, + { + "epoch": 2.028558476881233, + "grad_norm": 0.4553330394118193, + "learning_rate": 5.930541285467427e-05, + "loss": 0.8626, + "step": 4475 + }, + { + "epoch": 2.029011786038078, + "grad_norm": 0.3168597042113871, + "learning_rate": 5.9294325057789766e-05, + "loss": 0.8602, + "step": 4476 + }, + { + "epoch": 2.029465095194923, + "grad_norm": 0.3315163937698171, + "learning_rate": 5.92832353285219e-05, + "loss": 0.8734, + "step": 4477 + }, + { + "epoch": 2.029918404351768, + "grad_norm": 0.38138125461256345, + "learning_rate": 5.927214366798136e-05, + "loss": 0.8968, + "step": 4478 + }, + { + "epoch": 2.030371713508613, + "grad_norm": 0.4901611012766882, + "learning_rate": 5.9261050077278974e-05, + "loss": 0.8633, + "step": 4479 + }, + { + "epoch": 2.030825022665458, + "grad_norm": 0.5627471632609867, + "learning_rate": 5.924995455752582e-05, + "loss": 0.856, + "step": 4480 + }, + { + "epoch": 2.031278331822303, + "grad_norm": 0.620028015855765, + "learning_rate": 5.9238857109833145e-05, + "loss": 0.851, + "step": 4481 + }, + { + "epoch": 2.0317316409791477, + "grad_norm": 0.6008454023799015, + "learning_rate": 5.922775773531239e-05, + "loss": 0.8945, + "step": 4482 + }, + { + "epoch": 2.0321849501359925, + "grad_norm": 0.5085172928748548, + "learning_rate": 5.9216656435075185e-05, + "loss": 0.9039, + "step": 4483 + }, + { + "epoch": 2.032638259292838, + "grad_norm": 0.4428819720239636, + "learning_rate": 5.920555321023336e-05, + "loss": 0.8826, + "step": 4484 + }, + { + "epoch": 2.0330915684496826, + "grad_norm": 0.4339254649445022, + "learning_rate": 5.919444806189895e-05, + "loss": 0.8829, + "step": 4485 + }, + { + "epoch": 2.0335448776065275, + "grad_norm": 0.48909281783407094, + "learning_rate": 5.9183340991184134e-05, + "loss": 0.8844, + "step": 4486 + }, + { + "epoch": 2.0339981867633727, + "grad_norm": 0.6278517793850551, + "learning_rate": 5.917223199920134e-05, + "loss": 0.8864, + "step": 4487 + }, + { + "epoch": 2.0344514959202176, + "grad_norm": 0.41668848176498907, + "learning_rate": 5.916112108706319e-05, + "loss": 0.8727, + "step": 4488 + }, + { + "epoch": 2.0349048050770624, + "grad_norm": 0.34233275945803454, + "learning_rate": 5.915000825588243e-05, + "loss": 0.8899, + "step": 4489 + }, + { + "epoch": 2.0353581142339077, + "grad_norm": 0.3126256014391702, + "learning_rate": 5.913889350677207e-05, + "loss": 0.8679, + "step": 4490 + }, + { + "epoch": 2.0358114233907525, + "grad_norm": 0.318959347516722, + "learning_rate": 5.9127776840845284e-05, + "loss": 0.876, + "step": 4491 + }, + { + "epoch": 2.0362647325475973, + "grad_norm": 0.3008174222198692, + "learning_rate": 5.9116658259215414e-05, + "loss": 0.8501, + "step": 4492 + }, + { + "epoch": 2.0367180417044426, + "grad_norm": 0.3225087480982042, + "learning_rate": 5.910553776299605e-05, + "loss": 0.8736, + "step": 4493 + }, + { + "epoch": 2.0371713508612874, + "grad_norm": 0.3727342053793816, + "learning_rate": 5.9094415353300914e-05, + "loss": 0.8937, + "step": 4494 + }, + { + "epoch": 2.0376246600181322, + "grad_norm": 0.4665973772767037, + "learning_rate": 5.908329103124397e-05, + "loss": 0.8741, + "step": 4495 + }, + { + "epoch": 2.0380779691749775, + "grad_norm": 0.4551070652251744, + "learning_rate": 5.9072164797939346e-05, + "loss": 0.8715, + "step": 4496 + }, + { + "epoch": 2.0385312783318223, + "grad_norm": 0.44583425885419264, + "learning_rate": 5.9061036654501384e-05, + "loss": 0.8852, + "step": 4497 + }, + { + "epoch": 2.038984587488667, + "grad_norm": 0.3586962098261952, + "learning_rate": 5.9049906602044546e-05, + "loss": 0.8886, + "step": 4498 + }, + { + "epoch": 2.0394378966455124, + "grad_norm": 0.2851000208853214, + "learning_rate": 5.9038774641683596e-05, + "loss": 0.8712, + "step": 4499 + }, + { + "epoch": 2.0398912058023573, + "grad_norm": 0.28402188052151345, + "learning_rate": 5.902764077453341e-05, + "loss": 0.8754, + "step": 4500 + }, + { + "epoch": 2.040344514959202, + "grad_norm": 0.41447315822263037, + "learning_rate": 5.901650500170908e-05, + "loss": 0.8782, + "step": 4501 + }, + { + "epoch": 2.040797824116047, + "grad_norm": 0.5275540876483987, + "learning_rate": 5.900536732432588e-05, + "loss": 0.8809, + "step": 4502 + }, + { + "epoch": 2.041251133272892, + "grad_norm": 0.5930230498546067, + "learning_rate": 5.899422774349929e-05, + "loss": 0.8755, + "step": 4503 + }, + { + "epoch": 2.041704442429737, + "grad_norm": 0.6592702871977485, + "learning_rate": 5.898308626034498e-05, + "loss": 0.8702, + "step": 4504 + }, + { + "epoch": 2.042157751586582, + "grad_norm": 0.6660992728523727, + "learning_rate": 5.897194287597877e-05, + "loss": 0.8868, + "step": 4505 + }, + { + "epoch": 2.042611060743427, + "grad_norm": 0.6563698092532015, + "learning_rate": 5.8960797591516736e-05, + "loss": 0.8779, + "step": 4506 + }, + { + "epoch": 2.043064369900272, + "grad_norm": 0.566987573178707, + "learning_rate": 5.8949650408075096e-05, + "loss": 0.8832, + "step": 4507 + }, + { + "epoch": 2.043517679057117, + "grad_norm": 0.5160006553631354, + "learning_rate": 5.8938501326770276e-05, + "loss": 0.8711, + "step": 4508 + }, + { + "epoch": 2.043970988213962, + "grad_norm": 0.4475010487591147, + "learning_rate": 5.892735034871889e-05, + "loss": 0.8772, + "step": 4509 + }, + { + "epoch": 2.044424297370807, + "grad_norm": 0.32709976595335793, + "learning_rate": 5.8916197475037736e-05, + "loss": 0.8884, + "step": 4510 + }, + { + "epoch": 2.0448776065276517, + "grad_norm": 0.304998318082167, + "learning_rate": 5.890504270684381e-05, + "loss": 0.8757, + "step": 4511 + }, + { + "epoch": 2.045330915684497, + "grad_norm": 0.323451296570382, + "learning_rate": 5.88938860452543e-05, + "loss": 0.8992, + "step": 4512 + }, + { + "epoch": 2.045784224841342, + "grad_norm": 0.31718422595184015, + "learning_rate": 5.888272749138657e-05, + "loss": 0.8556, + "step": 4513 + }, + { + "epoch": 2.0462375339981866, + "grad_norm": 0.40201172582155775, + "learning_rate": 5.88715670463582e-05, + "loss": 0.8986, + "step": 4514 + }, + { + "epoch": 2.046690843155032, + "grad_norm": 0.4448065920248286, + "learning_rate": 5.8860404711286906e-05, + "loss": 0.8737, + "step": 4515 + }, + { + "epoch": 2.0471441523118767, + "grad_norm": 0.4557609138938328, + "learning_rate": 5.884924048729066e-05, + "loss": 0.8832, + "step": 4516 + }, + { + "epoch": 2.0475974614687216, + "grad_norm": 0.5649482858837344, + "learning_rate": 5.8838074375487595e-05, + "loss": 0.8755, + "step": 4517 + }, + { + "epoch": 2.048050770625567, + "grad_norm": 0.6288555815903284, + "learning_rate": 5.8826906376996e-05, + "loss": 0.8758, + "step": 4518 + }, + { + "epoch": 2.0485040797824117, + "grad_norm": 0.6355960053124403, + "learning_rate": 5.881573649293442e-05, + "loss": 0.8774, + "step": 4519 + }, + { + "epoch": 2.0489573889392565, + "grad_norm": 0.5451789924822715, + "learning_rate": 5.880456472442151e-05, + "loss": 0.8719, + "step": 4520 + }, + { + "epoch": 2.0494106980961013, + "grad_norm": 0.4695490292029792, + "learning_rate": 5.879339107257619e-05, + "loss": 0.8543, + "step": 4521 + }, + { + "epoch": 2.0498640072529466, + "grad_norm": 0.3985693126750887, + "learning_rate": 5.8782215538517516e-05, + "loss": 0.901, + "step": 4522 + }, + { + "epoch": 2.0503173164097914, + "grad_norm": 0.48176067179448256, + "learning_rate": 5.877103812336476e-05, + "loss": 0.8675, + "step": 4523 + }, + { + "epoch": 2.0507706255666363, + "grad_norm": 0.3725372458333377, + "learning_rate": 5.8759858828237366e-05, + "loss": 0.8896, + "step": 4524 + }, + { + "epoch": 2.0512239347234815, + "grad_norm": 0.3102898357844386, + "learning_rate": 5.8748677654254976e-05, + "loss": 0.8862, + "step": 4525 + }, + { + "epoch": 2.0516772438803264, + "grad_norm": 0.30308879746017403, + "learning_rate": 5.8737494602537424e-05, + "loss": 0.8711, + "step": 4526 + }, + { + "epoch": 2.052130553037171, + "grad_norm": 0.3502308720026267, + "learning_rate": 5.87263096742047e-05, + "loss": 0.8519, + "step": 4527 + }, + { + "epoch": 2.0525838621940165, + "grad_norm": 0.4163613137660871, + "learning_rate": 5.871512287037704e-05, + "loss": 0.8768, + "step": 4528 + }, + { + "epoch": 2.0530371713508613, + "grad_norm": 0.458452895508648, + "learning_rate": 5.870393419217483e-05, + "loss": 0.8873, + "step": 4529 + }, + { + "epoch": 2.053490480507706, + "grad_norm": 0.5357598546782473, + "learning_rate": 5.869274364071863e-05, + "loss": 0.866, + "step": 4530 + }, + { + "epoch": 2.0539437896645514, + "grad_norm": 0.5779117525894155, + "learning_rate": 5.8681551217129215e-05, + "loss": 0.8579, + "step": 4531 + }, + { + "epoch": 2.054397098821396, + "grad_norm": 0.5900039415773937, + "learning_rate": 5.867035692252756e-05, + "loss": 0.8805, + "step": 4532 + }, + { + "epoch": 2.054850407978241, + "grad_norm": 0.5554949210045693, + "learning_rate": 5.865916075803476e-05, + "loss": 0.8751, + "step": 4533 + }, + { + "epoch": 2.0553037171350863, + "grad_norm": 0.5418874602314095, + "learning_rate": 5.864796272477219e-05, + "loss": 0.883, + "step": 4534 + }, + { + "epoch": 2.055757026291931, + "grad_norm": 0.47730641272836577, + "learning_rate": 5.863676282386134e-05, + "loss": 0.8667, + "step": 4535 + }, + { + "epoch": 2.056210335448776, + "grad_norm": 0.39539587062452264, + "learning_rate": 5.862556105642394e-05, + "loss": 0.8763, + "step": 4536 + }, + { + "epoch": 2.0566636446056212, + "grad_norm": 0.3475106432553535, + "learning_rate": 5.8614357423581846e-05, + "loss": 0.9027, + "step": 4537 + }, + { + "epoch": 2.057116953762466, + "grad_norm": 0.3489274705736764, + "learning_rate": 5.860315192645714e-05, + "loss": 0.8864, + "step": 4538 + }, + { + "epoch": 2.057570262919311, + "grad_norm": 0.45610203095586327, + "learning_rate": 5.859194456617211e-05, + "loss": 0.8716, + "step": 4539 + }, + { + "epoch": 2.0580235720761557, + "grad_norm": 0.4847082788744679, + "learning_rate": 5.858073534384917e-05, + "loss": 0.8822, + "step": 4540 + }, + { + "epoch": 2.058476881233001, + "grad_norm": 0.5350081328670659, + "learning_rate": 5.856952426061099e-05, + "loss": 0.8854, + "step": 4541 + }, + { + "epoch": 2.058930190389846, + "grad_norm": 0.5486716458468374, + "learning_rate": 5.855831131758039e-05, + "loss": 0.8608, + "step": 4542 + }, + { + "epoch": 2.0593834995466906, + "grad_norm": 0.5801203772822727, + "learning_rate": 5.8547096515880364e-05, + "loss": 0.8826, + "step": 4543 + }, + { + "epoch": 2.059836808703536, + "grad_norm": 0.5841033495289709, + "learning_rate": 5.85358798566341e-05, + "loss": 0.867, + "step": 4544 + }, + { + "epoch": 2.0602901178603807, + "grad_norm": 0.5792692586404584, + "learning_rate": 5.8524661340965014e-05, + "loss": 0.8786, + "step": 4545 + }, + { + "epoch": 2.0607434270172256, + "grad_norm": 0.5488692148355424, + "learning_rate": 5.851344096999664e-05, + "loss": 0.886, + "step": 4546 + }, + { + "epoch": 2.061196736174071, + "grad_norm": 0.497182685645821, + "learning_rate": 5.8502218744852754e-05, + "loss": 0.8873, + "step": 4547 + }, + { + "epoch": 2.0616500453309157, + "grad_norm": 0.42427438262862494, + "learning_rate": 5.849099466665728e-05, + "loss": 0.8835, + "step": 4548 + }, + { + "epoch": 2.0621033544877605, + "grad_norm": 0.39151906444581347, + "learning_rate": 5.847976873653433e-05, + "loss": 0.8772, + "step": 4549 + }, + { + "epoch": 2.0625566636446058, + "grad_norm": 0.3814635855210955, + "learning_rate": 5.846854095560824e-05, + "loss": 0.8766, + "step": 4550 + }, + { + "epoch": 2.0630099728014506, + "grad_norm": 0.4476564703672927, + "learning_rate": 5.8457311325003496e-05, + "loss": 0.8735, + "step": 4551 + }, + { + "epoch": 2.0634632819582954, + "grad_norm": 0.5167545301764351, + "learning_rate": 5.8446079845844775e-05, + "loss": 0.8775, + "step": 4552 + }, + { + "epoch": 2.0639165911151407, + "grad_norm": 0.5448111244516555, + "learning_rate": 5.8434846519256944e-05, + "loss": 0.8711, + "step": 4553 + }, + { + "epoch": 2.0643699002719855, + "grad_norm": 0.6327374305619716, + "learning_rate": 5.842361134636505e-05, + "loss": 0.8804, + "step": 4554 + }, + { + "epoch": 2.0648232094288304, + "grad_norm": 0.7069176773001967, + "learning_rate": 5.841237432829436e-05, + "loss": 0.8935, + "step": 4555 + }, + { + "epoch": 2.0652765185856756, + "grad_norm": 0.7818359421915334, + "learning_rate": 5.840113546617025e-05, + "loss": 0.8945, + "step": 4556 + }, + { + "epoch": 2.0657298277425205, + "grad_norm": 0.811134944073197, + "learning_rate": 5.838989476111834e-05, + "loss": 0.8745, + "step": 4557 + }, + { + "epoch": 2.0661831368993653, + "grad_norm": 0.6668574978698705, + "learning_rate": 5.837865221426443e-05, + "loss": 0.8939, + "step": 4558 + }, + { + "epoch": 2.0666364460562106, + "grad_norm": 0.5070878676460848, + "learning_rate": 5.836740782673447e-05, + "loss": 0.872, + "step": 4559 + }, + { + "epoch": 2.0670897552130554, + "grad_norm": 0.39768941522801304, + "learning_rate": 5.835616159965466e-05, + "loss": 0.8739, + "step": 4560 + }, + { + "epoch": 2.0675430643699, + "grad_norm": 0.4090047027500746, + "learning_rate": 5.83449135341513e-05, + "loss": 0.8731, + "step": 4561 + }, + { + "epoch": 2.067996373526745, + "grad_norm": 0.5616795952439557, + "learning_rate": 5.833366363135095e-05, + "loss": 0.8616, + "step": 4562 + }, + { + "epoch": 2.0684496826835903, + "grad_norm": 0.5731586609990232, + "learning_rate": 5.832241189238029e-05, + "loss": 0.8825, + "step": 4563 + }, + { + "epoch": 2.068902991840435, + "grad_norm": 0.6027908315821942, + "learning_rate": 5.8311158318366234e-05, + "loss": 0.8754, + "step": 4564 + }, + { + "epoch": 2.06935630099728, + "grad_norm": 0.6650342007694588, + "learning_rate": 5.829990291043586e-05, + "loss": 0.8769, + "step": 4565 + }, + { + "epoch": 2.0698096101541252, + "grad_norm": 0.6310788320956693, + "learning_rate": 5.828864566971644e-05, + "loss": 0.9011, + "step": 4566 + }, + { + "epoch": 2.07026291931097, + "grad_norm": 0.5442854482003806, + "learning_rate": 5.8277386597335394e-05, + "loss": 0.9013, + "step": 4567 + }, + { + "epoch": 2.070716228467815, + "grad_norm": 0.44439252470105706, + "learning_rate": 5.826612569442036e-05, + "loss": 0.8673, + "step": 4568 + }, + { + "epoch": 2.07116953762466, + "grad_norm": 0.3099995976809445, + "learning_rate": 5.8254862962099154e-05, + "loss": 0.8653, + "step": 4569 + }, + { + "epoch": 2.071622846781505, + "grad_norm": 0.309574362217333, + "learning_rate": 5.824359840149978e-05, + "loss": 0.8806, + "step": 4570 + }, + { + "epoch": 2.07207615593835, + "grad_norm": 0.3287322182327067, + "learning_rate": 5.8232332013750396e-05, + "loss": 0.8717, + "step": 4571 + }, + { + "epoch": 2.072529465095195, + "grad_norm": 0.35129000567021723, + "learning_rate": 5.822106379997939e-05, + "loss": 0.8911, + "step": 4572 + }, + { + "epoch": 2.07298277425204, + "grad_norm": 0.4563554889733326, + "learning_rate": 5.82097937613153e-05, + "loss": 0.8821, + "step": 4573 + }, + { + "epoch": 2.0734360834088847, + "grad_norm": 0.5430085354224156, + "learning_rate": 5.819852189888684e-05, + "loss": 0.8709, + "step": 4574 + }, + { + "epoch": 2.07388939256573, + "grad_norm": 0.5662771215757916, + "learning_rate": 5.818724821382292e-05, + "loss": 0.8762, + "step": 4575 + }, + { + "epoch": 2.074342701722575, + "grad_norm": 0.5102759691754044, + "learning_rate": 5.817597270725266e-05, + "loss": 0.8716, + "step": 4576 + }, + { + "epoch": 2.0747960108794197, + "grad_norm": 0.4655079205590319, + "learning_rate": 5.816469538030529e-05, + "loss": 0.8911, + "step": 4577 + }, + { + "epoch": 2.075249320036265, + "grad_norm": 0.40470728407168965, + "learning_rate": 5.81534162341103e-05, + "loss": 0.861, + "step": 4578 + }, + { + "epoch": 2.0757026291931098, + "grad_norm": 0.3589165193458115, + "learning_rate": 5.814213526979733e-05, + "loss": 0.8765, + "step": 4579 + }, + { + "epoch": 2.0761559383499546, + "grad_norm": 0.35522142436602666, + "learning_rate": 5.813085248849619e-05, + "loss": 0.8459, + "step": 4580 + }, + { + "epoch": 2.0766092475067994, + "grad_norm": 0.38097401187354657, + "learning_rate": 5.811956789133689e-05, + "loss": 0.8758, + "step": 4581 + }, + { + "epoch": 2.0770625566636447, + "grad_norm": 0.3828436982725671, + "learning_rate": 5.8108281479449605e-05, + "loss": 0.877, + "step": 4582 + }, + { + "epoch": 2.0775158658204895, + "grad_norm": 0.4578799673155848, + "learning_rate": 5.809699325396471e-05, + "loss": 0.8897, + "step": 4583 + }, + { + "epoch": 2.0779691749773344, + "grad_norm": 0.5057619807450817, + "learning_rate": 5.8085703216012775e-05, + "loss": 0.8876, + "step": 4584 + }, + { + "epoch": 2.0784224841341796, + "grad_norm": 0.453504883334568, + "learning_rate": 5.807441136672449e-05, + "loss": 0.8814, + "step": 4585 + }, + { + "epoch": 2.0788757932910245, + "grad_norm": 0.48533420181974285, + "learning_rate": 5.80631177072308e-05, + "loss": 0.8984, + "step": 4586 + }, + { + "epoch": 2.0793291024478693, + "grad_norm": 0.577738069182516, + "learning_rate": 5.8051822238662774e-05, + "loss": 0.8873, + "step": 4587 + }, + { + "epoch": 2.0797824116047146, + "grad_norm": 0.6316913766348288, + "learning_rate": 5.804052496215169e-05, + "loss": 0.8903, + "step": 4588 + }, + { + "epoch": 2.0802357207615594, + "grad_norm": 0.6148074297108407, + "learning_rate": 5.802922587882903e-05, + "loss": 0.8961, + "step": 4589 + }, + { + "epoch": 2.080689029918404, + "grad_norm": 0.556805303827392, + "learning_rate": 5.80179249898264e-05, + "loss": 0.858, + "step": 4590 + }, + { + "epoch": 2.0811423390752495, + "grad_norm": 0.502156930135495, + "learning_rate": 5.800662229627564e-05, + "loss": 0.8633, + "step": 4591 + }, + { + "epoch": 2.0815956482320943, + "grad_norm": 0.516861659447645, + "learning_rate": 5.799531779930872e-05, + "loss": 0.8878, + "step": 4592 + }, + { + "epoch": 2.082048957388939, + "grad_norm": 0.3959355591829808, + "learning_rate": 5.798401150005785e-05, + "loss": 0.8729, + "step": 4593 + }, + { + "epoch": 2.0825022665457844, + "grad_norm": 0.4152885924222527, + "learning_rate": 5.7972703399655376e-05, + "loss": 0.8773, + "step": 4594 + }, + { + "epoch": 2.0829555757026292, + "grad_norm": 0.45398682943708396, + "learning_rate": 5.796139349923382e-05, + "loss": 0.8679, + "step": 4595 + }, + { + "epoch": 2.083408884859474, + "grad_norm": 0.45204155280820224, + "learning_rate": 5.795008179992593e-05, + "loss": 0.8909, + "step": 4596 + }, + { + "epoch": 2.0838621940163193, + "grad_norm": 0.3683640763279323, + "learning_rate": 5.793876830286458e-05, + "loss": 0.8709, + "step": 4597 + }, + { + "epoch": 2.084315503173164, + "grad_norm": 0.3162672056126719, + "learning_rate": 5.792745300918287e-05, + "loss": 0.8735, + "step": 4598 + }, + { + "epoch": 2.084768812330009, + "grad_norm": 0.38447366146164946, + "learning_rate": 5.791613592001405e-05, + "loss": 0.8826, + "step": 4599 + }, + { + "epoch": 2.0852221214868543, + "grad_norm": 0.4657804155198531, + "learning_rate": 5.790481703649157e-05, + "loss": 0.8696, + "step": 4600 + }, + { + "epoch": 2.085675430643699, + "grad_norm": 0.46632116164070136, + "learning_rate": 5.789349635974905e-05, + "loss": 0.866, + "step": 4601 + }, + { + "epoch": 2.086128739800544, + "grad_norm": 0.40981533788970287, + "learning_rate": 5.788217389092027e-05, + "loss": 0.8738, + "step": 4602 + }, + { + "epoch": 2.0865820489573887, + "grad_norm": 0.4637229966897603, + "learning_rate": 5.787084963113923e-05, + "loss": 0.8736, + "step": 4603 + }, + { + "epoch": 2.087035358114234, + "grad_norm": 0.5656475493216265, + "learning_rate": 5.785952358154007e-05, + "loss": 0.8701, + "step": 4604 + }, + { + "epoch": 2.087488667271079, + "grad_norm": 0.6493699059029064, + "learning_rate": 5.784819574325713e-05, + "loss": 0.8663, + "step": 4605 + }, + { + "epoch": 2.0879419764279237, + "grad_norm": 0.6854208112126271, + "learning_rate": 5.783686611742494e-05, + "loss": 0.859, + "step": 4606 + }, + { + "epoch": 2.088395285584769, + "grad_norm": 0.5986992075995311, + "learning_rate": 5.782553470517818e-05, + "loss": 0.873, + "step": 4607 + }, + { + "epoch": 2.0888485947416138, + "grad_norm": 0.6034118560035834, + "learning_rate": 5.781420150765174e-05, + "loss": 0.8725, + "step": 4608 + }, + { + "epoch": 2.0893019038984586, + "grad_norm": 0.6086667079890786, + "learning_rate": 5.780286652598065e-05, + "loss": 0.8945, + "step": 4609 + }, + { + "epoch": 2.089755213055304, + "grad_norm": 0.5666511613107728, + "learning_rate": 5.7791529761300165e-05, + "loss": 0.8737, + "step": 4610 + }, + { + "epoch": 2.0902085222121487, + "grad_norm": 0.5312290562054499, + "learning_rate": 5.778019121474568e-05, + "loss": 0.8783, + "step": 4611 + }, + { + "epoch": 2.0906618313689935, + "grad_norm": 0.5310565173988439, + "learning_rate": 5.77688508874528e-05, + "loss": 0.8684, + "step": 4612 + }, + { + "epoch": 2.091115140525839, + "grad_norm": 0.4973459647449567, + "learning_rate": 5.7757508780557286e-05, + "loss": 0.8831, + "step": 4613 + }, + { + "epoch": 2.0915684496826836, + "grad_norm": 0.4386380950762032, + "learning_rate": 5.774616489519507e-05, + "loss": 0.8649, + "step": 4614 + }, + { + "epoch": 2.0920217588395285, + "grad_norm": 0.38225375114037674, + "learning_rate": 5.7734819232502284e-05, + "loss": 0.8629, + "step": 4615 + }, + { + "epoch": 2.0924750679963737, + "grad_norm": 0.329672259113762, + "learning_rate": 5.772347179361523e-05, + "loss": 0.8973, + "step": 4616 + }, + { + "epoch": 2.0929283771532186, + "grad_norm": 0.45749460914365775, + "learning_rate": 5.77121225796704e-05, + "loss": 0.8658, + "step": 4617 + }, + { + "epoch": 2.0933816863100634, + "grad_norm": 0.6076392942989817, + "learning_rate": 5.7700771591804426e-05, + "loss": 0.8696, + "step": 4618 + }, + { + "epoch": 2.093834995466908, + "grad_norm": 0.7130098202113795, + "learning_rate": 5.768941883115415e-05, + "loss": 0.8613, + "step": 4619 + }, + { + "epoch": 2.0942883046237535, + "grad_norm": 0.7070175174528951, + "learning_rate": 5.76780642988566e-05, + "loss": 0.8834, + "step": 4620 + }, + { + "epoch": 2.0947416137805983, + "grad_norm": 0.7106169698148995, + "learning_rate": 5.7666707996048954e-05, + "loss": 0.8893, + "step": 4621 + }, + { + "epoch": 2.095194922937443, + "grad_norm": 0.7414541704479877, + "learning_rate": 5.7655349923868584e-05, + "loss": 0.8788, + "step": 4622 + }, + { + "epoch": 2.0956482320942884, + "grad_norm": 1.0242800133771288, + "learning_rate": 5.7643990083453025e-05, + "loss": 0.863, + "step": 4623 + }, + { + "epoch": 2.0961015412511332, + "grad_norm": 0.7302534872712886, + "learning_rate": 5.763262847594e-05, + "loss": 0.8744, + "step": 4624 + }, + { + "epoch": 2.096554850407978, + "grad_norm": 0.6456197575490655, + "learning_rate": 5.762126510246741e-05, + "loss": 0.8872, + "step": 4625 + }, + { + "epoch": 2.0970081595648233, + "grad_norm": 0.5871330229244505, + "learning_rate": 5.760989996417335e-05, + "loss": 0.865, + "step": 4626 + }, + { + "epoch": 2.097461468721668, + "grad_norm": 0.5192944047824262, + "learning_rate": 5.759853306219604e-05, + "loss": 0.8751, + "step": 4627 + }, + { + "epoch": 2.097914777878513, + "grad_norm": 0.3839679482229879, + "learning_rate": 5.758716439767392e-05, + "loss": 0.8725, + "step": 4628 + }, + { + "epoch": 2.0983680870353583, + "grad_norm": 0.39742051042731424, + "learning_rate": 5.757579397174561e-05, + "loss": 0.8744, + "step": 4629 + }, + { + "epoch": 2.098821396192203, + "grad_norm": 0.4224738640286207, + "learning_rate": 5.756442178554988e-05, + "loss": 0.8713, + "step": 4630 + }, + { + "epoch": 2.099274705349048, + "grad_norm": 0.47316581023903964, + "learning_rate": 5.755304784022568e-05, + "loss": 0.8921, + "step": 4631 + }, + { + "epoch": 2.099728014505893, + "grad_norm": 0.5350488058782479, + "learning_rate": 5.7541672136912164e-05, + "loss": 0.8673, + "step": 4632 + }, + { + "epoch": 2.100181323662738, + "grad_norm": 0.552030295331295, + "learning_rate": 5.7530294676748605e-05, + "loss": 0.8879, + "step": 4633 + }, + { + "epoch": 2.100634632819583, + "grad_norm": 0.5863362137388385, + "learning_rate": 5.751891546087453e-05, + "loss": 0.8792, + "step": 4634 + }, + { + "epoch": 2.101087941976428, + "grad_norm": 0.6127320480122495, + "learning_rate": 5.7507534490429574e-05, + "loss": 0.8868, + "step": 4635 + }, + { + "epoch": 2.101541251133273, + "grad_norm": 0.6424313048407116, + "learning_rate": 5.7496151766553595e-05, + "loss": 0.8887, + "step": 4636 + }, + { + "epoch": 2.1019945602901178, + "grad_norm": 0.6566501264031506, + "learning_rate": 5.7484767290386596e-05, + "loss": 0.8797, + "step": 4637 + }, + { + "epoch": 2.102447869446963, + "grad_norm": 0.7176836088906126, + "learning_rate": 5.747338106306876e-05, + "loss": 0.881, + "step": 4638 + }, + { + "epoch": 2.102901178603808, + "grad_norm": 0.6994836306358369, + "learning_rate": 5.746199308574046e-05, + "loss": 0.8751, + "step": 4639 + }, + { + "epoch": 2.1033544877606527, + "grad_norm": 0.7719690254384831, + "learning_rate": 5.745060335954223e-05, + "loss": 0.9086, + "step": 4640 + }, + { + "epoch": 2.1038077969174975, + "grad_norm": 0.8030776339876178, + "learning_rate": 5.7439211885614804e-05, + "loss": 0.8799, + "step": 4641 + }, + { + "epoch": 2.104261106074343, + "grad_norm": 0.8267811702230194, + "learning_rate": 5.7427818665099034e-05, + "loss": 0.8832, + "step": 4642 + }, + { + "epoch": 2.1047144152311876, + "grad_norm": 0.7852534695405192, + "learning_rate": 5.741642369913601e-05, + "loss": 0.889, + "step": 4643 + }, + { + "epoch": 2.1051677243880325, + "grad_norm": 0.7826247686699569, + "learning_rate": 5.740502698886697e-05, + "loss": 0.8749, + "step": 4644 + }, + { + "epoch": 2.1056210335448777, + "grad_norm": 0.7573024335328822, + "learning_rate": 5.739362853543333e-05, + "loss": 0.8777, + "step": 4645 + }, + { + "epoch": 2.1060743427017226, + "grad_norm": 0.673825880883513, + "learning_rate": 5.738222833997667e-05, + "loss": 0.8491, + "step": 4646 + }, + { + "epoch": 2.1065276518585674, + "grad_norm": 0.5259398585553748, + "learning_rate": 5.7370826403638746e-05, + "loss": 0.8815, + "step": 4647 + }, + { + "epoch": 2.1069809610154127, + "grad_norm": 0.755604763073318, + "learning_rate": 5.735942272756151e-05, + "loss": 0.9475, + "step": 4648 + }, + { + "epoch": 2.1074342701722575, + "grad_norm": 0.5010554695851565, + "learning_rate": 5.734801731288707e-05, + "loss": 0.8791, + "step": 4649 + }, + { + "epoch": 2.1078875793291023, + "grad_norm": 0.5142385752072223, + "learning_rate": 5.733661016075772e-05, + "loss": 0.8888, + "step": 4650 + }, + { + "epoch": 2.1083408884859476, + "grad_norm": 0.5351909018832351, + "learning_rate": 5.73252012723159e-05, + "loss": 0.884, + "step": 4651 + }, + { + "epoch": 2.1087941976427924, + "grad_norm": 0.6233419691585887, + "learning_rate": 5.731379064870426e-05, + "loss": 0.873, + "step": 4652 + }, + { + "epoch": 2.1092475067996372, + "grad_norm": 0.748858692651273, + "learning_rate": 5.730237829106561e-05, + "loss": 0.8815, + "step": 4653 + }, + { + "epoch": 2.1097008159564825, + "grad_norm": 0.8036776360522795, + "learning_rate": 5.729096420054291e-05, + "loss": 0.8892, + "step": 4654 + }, + { + "epoch": 2.1101541251133273, + "grad_norm": 0.8288316422191436, + "learning_rate": 5.7279548378279335e-05, + "loss": 0.8806, + "step": 4655 + }, + { + "epoch": 2.110607434270172, + "grad_norm": 0.7699178052207398, + "learning_rate": 5.72681308254182e-05, + "loss": 0.8842, + "step": 4656 + }, + { + "epoch": 2.1110607434270174, + "grad_norm": 0.6384327740083723, + "learning_rate": 5.725671154310302e-05, + "loss": 0.8911, + "step": 4657 + }, + { + "epoch": 2.1115140525838623, + "grad_norm": 0.46565787022588595, + "learning_rate": 5.724529053247747e-05, + "loss": 0.8864, + "step": 4658 + }, + { + "epoch": 2.111967361740707, + "grad_norm": 0.3737511507919331, + "learning_rate": 5.7233867794685367e-05, + "loss": 0.8767, + "step": 4659 + }, + { + "epoch": 2.112420670897552, + "grad_norm": 0.3841361612981083, + "learning_rate": 5.7222443330870775e-05, + "loss": 0.8526, + "step": 4660 + }, + { + "epoch": 2.112873980054397, + "grad_norm": 0.5335958615519514, + "learning_rate": 5.7211017142177866e-05, + "loss": 0.8915, + "step": 4661 + }, + { + "epoch": 2.113327289211242, + "grad_norm": 0.715796301500934, + "learning_rate": 5.7199589229750996e-05, + "loss": 0.8949, + "step": 4662 + }, + { + "epoch": 2.113780598368087, + "grad_norm": 0.7890551427276133, + "learning_rate": 5.718815959473472e-05, + "loss": 0.841, + "step": 4663 + }, + { + "epoch": 2.114233907524932, + "grad_norm": 0.8191190002768579, + "learning_rate": 5.717672823827375e-05, + "loss": 0.8743, + "step": 4664 + }, + { + "epoch": 2.114687216681777, + "grad_norm": 0.754121626944105, + "learning_rate": 5.716529516151296e-05, + "loss": 0.8856, + "step": 4665 + }, + { + "epoch": 2.1151405258386218, + "grad_norm": 0.7140645830670148, + "learning_rate": 5.715386036559741e-05, + "loss": 0.8963, + "step": 4666 + }, + { + "epoch": 2.115593834995467, + "grad_norm": 0.5428496539405008, + "learning_rate": 5.714242385167234e-05, + "loss": 0.8556, + "step": 4667 + }, + { + "epoch": 2.116047144152312, + "grad_norm": 0.4631952403846576, + "learning_rate": 5.713098562088313e-05, + "loss": 0.8875, + "step": 4668 + }, + { + "epoch": 2.1165004533091567, + "grad_norm": 0.34090913920558635, + "learning_rate": 5.711954567437537e-05, + "loss": 0.8707, + "step": 4669 + }, + { + "epoch": 2.116953762466002, + "grad_norm": 0.41471938971692585, + "learning_rate": 5.7108104013294806e-05, + "loss": 0.877, + "step": 4670 + }, + { + "epoch": 2.117407071622847, + "grad_norm": 0.5315284875166865, + "learning_rate": 5.709666063878733e-05, + "loss": 0.8824, + "step": 4671 + }, + { + "epoch": 2.1178603807796916, + "grad_norm": 0.5788461797531296, + "learning_rate": 5.7085215551999054e-05, + "loss": 0.9043, + "step": 4672 + }, + { + "epoch": 2.118313689936537, + "grad_norm": 0.543368707929793, + "learning_rate": 5.7073768754076226e-05, + "loss": 0.8804, + "step": 4673 + }, + { + "epoch": 2.1187669990933817, + "grad_norm": 0.5464838963053923, + "learning_rate": 5.7062320246165276e-05, + "loss": 0.8869, + "step": 4674 + }, + { + "epoch": 2.1192203082502266, + "grad_norm": 0.5916976530787661, + "learning_rate": 5.70508700294128e-05, + "loss": 0.9147, + "step": 4675 + }, + { + "epoch": 2.119673617407072, + "grad_norm": 0.5264703572517389, + "learning_rate": 5.7039418104965594e-05, + "loss": 0.8507, + "step": 4676 + }, + { + "epoch": 2.1201269265639167, + "grad_norm": 0.4226319399308346, + "learning_rate": 5.702796447397058e-05, + "loss": 0.8832, + "step": 4677 + }, + { + "epoch": 2.1205802357207615, + "grad_norm": 0.3720407034145489, + "learning_rate": 5.701650913757488e-05, + "loss": 0.8806, + "step": 4678 + }, + { + "epoch": 2.1210335448776063, + "grad_norm": 0.3282733693813455, + "learning_rate": 5.700505209692578e-05, + "loss": 0.8687, + "step": 4679 + }, + { + "epoch": 2.1214868540344516, + "grad_norm": 0.3826340974845139, + "learning_rate": 5.6993593353170736e-05, + "loss": 0.8874, + "step": 4680 + }, + { + "epoch": 2.1219401631912964, + "grad_norm": 0.39252324899980395, + "learning_rate": 5.698213290745737e-05, + "loss": 0.8718, + "step": 4681 + }, + { + "epoch": 2.1223934723481412, + "grad_norm": 0.45171116029539343, + "learning_rate": 5.6970670760933496e-05, + "loss": 0.8841, + "step": 4682 + }, + { + "epoch": 2.1228467815049865, + "grad_norm": 0.5291102940020158, + "learning_rate": 5.6959206914747056e-05, + "loss": 0.8965, + "step": 4683 + }, + { + "epoch": 2.1233000906618313, + "grad_norm": 0.5191171476061135, + "learning_rate": 5.694774137004621e-05, + "loss": 0.883, + "step": 4684 + }, + { + "epoch": 2.123753399818676, + "grad_norm": 0.43743325851989967, + "learning_rate": 5.6936274127979246e-05, + "loss": 0.8867, + "step": 4685 + }, + { + "epoch": 2.1242067089755214, + "grad_norm": 0.4022423752788162, + "learning_rate": 5.692480518969467e-05, + "loss": 0.8739, + "step": 4686 + }, + { + "epoch": 2.1246600181323663, + "grad_norm": 0.3709371077010507, + "learning_rate": 5.6913334556341095e-05, + "loss": 0.877, + "step": 4687 + }, + { + "epoch": 2.125113327289211, + "grad_norm": 0.37825014117279887, + "learning_rate": 5.690186222906737e-05, + "loss": 0.8816, + "step": 4688 + }, + { + "epoch": 2.1255666364460564, + "grad_norm": 0.3487995961534407, + "learning_rate": 5.689038820902247e-05, + "loss": 0.8581, + "step": 4689 + }, + { + "epoch": 2.126019945602901, + "grad_norm": 0.31771199042000287, + "learning_rate": 5.687891249735554e-05, + "loss": 0.9051, + "step": 4690 + }, + { + "epoch": 2.126473254759746, + "grad_norm": 0.3504111789053097, + "learning_rate": 5.686743509521592e-05, + "loss": 0.8874, + "step": 4691 + }, + { + "epoch": 2.1269265639165913, + "grad_norm": 0.4908591847308177, + "learning_rate": 5.685595600375311e-05, + "loss": 0.8733, + "step": 4692 + }, + { + "epoch": 2.127379873073436, + "grad_norm": 0.6694949441011244, + "learning_rate": 5.684447522411676e-05, + "loss": 0.8862, + "step": 4693 + }, + { + "epoch": 2.127833182230281, + "grad_norm": 0.7598317466381515, + "learning_rate": 5.683299275745672e-05, + "loss": 0.8714, + "step": 4694 + }, + { + "epoch": 2.1282864913871262, + "grad_norm": 0.7715605612079511, + "learning_rate": 5.6821508604922965e-05, + "loss": 0.8677, + "step": 4695 + }, + { + "epoch": 2.128739800543971, + "grad_norm": 0.824680887745544, + "learning_rate": 5.68100227676657e-05, + "loss": 0.891, + "step": 4696 + }, + { + "epoch": 2.129193109700816, + "grad_norm": 0.6423339657145106, + "learning_rate": 5.679853524683525e-05, + "loss": 0.8658, + "step": 4697 + }, + { + "epoch": 2.1296464188576607, + "grad_norm": 0.5633415149290758, + "learning_rate": 5.6787046043582125e-05, + "loss": 0.8953, + "step": 4698 + }, + { + "epoch": 2.130099728014506, + "grad_norm": 0.5071142237217281, + "learning_rate": 5.677555515905701e-05, + "loss": 0.8711, + "step": 4699 + }, + { + "epoch": 2.130553037171351, + "grad_norm": 0.5596001505108458, + "learning_rate": 5.6764062594410734e-05, + "loss": 0.8566, + "step": 4700 + }, + { + "epoch": 2.1310063463281956, + "grad_norm": 0.5493285937010377, + "learning_rate": 5.6752568350794326e-05, + "loss": 0.8596, + "step": 4701 + }, + { + "epoch": 2.131459655485041, + "grad_norm": 0.5230061026785852, + "learning_rate": 5.674107242935896e-05, + "loss": 0.9018, + "step": 4702 + }, + { + "epoch": 2.1319129646418857, + "grad_norm": 0.5465203435438957, + "learning_rate": 5.672957483125599e-05, + "loss": 0.8721, + "step": 4703 + }, + { + "epoch": 2.1323662737987306, + "grad_norm": 0.5954743975335948, + "learning_rate": 5.6718075557636924e-05, + "loss": 0.8737, + "step": 4704 + }, + { + "epoch": 2.132819582955576, + "grad_norm": 0.6693179218705259, + "learning_rate": 5.670657460965347e-05, + "loss": 0.8993, + "step": 4705 + }, + { + "epoch": 2.1332728921124207, + "grad_norm": 0.748755955699696, + "learning_rate": 5.669507198845747e-05, + "loss": 0.8875, + "step": 4706 + }, + { + "epoch": 2.1337262012692655, + "grad_norm": 0.7707863937291888, + "learning_rate": 5.668356769520093e-05, + "loss": 0.8701, + "step": 4707 + }, + { + "epoch": 2.1341795104261108, + "grad_norm": 0.7309538630188778, + "learning_rate": 5.667206173103607e-05, + "loss": 0.9016, + "step": 4708 + }, + { + "epoch": 2.1346328195829556, + "grad_norm": 0.8455954425669935, + "learning_rate": 5.666055409711522e-05, + "loss": 0.9047, + "step": 4709 + }, + { + "epoch": 2.1350861287398004, + "grad_norm": 0.48456231383610604, + "learning_rate": 5.6649044794590914e-05, + "loss": 0.8918, + "step": 4710 + }, + { + "epoch": 2.1355394378966457, + "grad_norm": 0.4401363000421173, + "learning_rate": 5.6637533824615846e-05, + "loss": 0.8708, + "step": 4711 + }, + { + "epoch": 2.1359927470534905, + "grad_norm": 0.4391245453627379, + "learning_rate": 5.662602118834286e-05, + "loss": 0.8848, + "step": 4712 + }, + { + "epoch": 2.1364460562103353, + "grad_norm": 0.45205828876205834, + "learning_rate": 5.661450688692498e-05, + "loss": 0.8519, + "step": 4713 + }, + { + "epoch": 2.1368993653671806, + "grad_norm": 0.3871553815812103, + "learning_rate": 5.660299092151542e-05, + "loss": 0.8629, + "step": 4714 + }, + { + "epoch": 2.1373526745240254, + "grad_norm": 0.34404314281632276, + "learning_rate": 5.659147329326754e-05, + "loss": 0.8647, + "step": 4715 + }, + { + "epoch": 2.1378059836808703, + "grad_norm": 0.41403930625900875, + "learning_rate": 5.657995400333482e-05, + "loss": 0.8922, + "step": 4716 + }, + { + "epoch": 2.1382592928377155, + "grad_norm": 0.5328621123691513, + "learning_rate": 5.656843305287099e-05, + "loss": 0.8777, + "step": 4717 + }, + { + "epoch": 2.1387126019945604, + "grad_norm": 0.6036552244247506, + "learning_rate": 5.655691044302991e-05, + "loss": 0.8755, + "step": 4718 + }, + { + "epoch": 2.139165911151405, + "grad_norm": 0.6036307864365794, + "learning_rate": 5.654538617496557e-05, + "loss": 0.8608, + "step": 4719 + }, + { + "epoch": 2.13961922030825, + "grad_norm": 0.5761002532993573, + "learning_rate": 5.6533860249832186e-05, + "loss": 0.8692, + "step": 4720 + }, + { + "epoch": 2.1400725294650953, + "grad_norm": 0.5656994633887272, + "learning_rate": 5.6522332668784106e-05, + "loss": 0.9026, + "step": 4721 + }, + { + "epoch": 2.14052583862194, + "grad_norm": 0.6040761073323289, + "learning_rate": 5.651080343297584e-05, + "loss": 0.8958, + "step": 4722 + }, + { + "epoch": 2.140979147778785, + "grad_norm": 0.6034345732453592, + "learning_rate": 5.6499272543562096e-05, + "loss": 0.8965, + "step": 4723 + }, + { + "epoch": 2.1414324569356302, + "grad_norm": 0.8640570450195241, + "learning_rate": 5.648774000169772e-05, + "loss": 0.8948, + "step": 4724 + }, + { + "epoch": 2.141885766092475, + "grad_norm": 0.5756907102983703, + "learning_rate": 5.647620580853772e-05, + "loss": 0.8694, + "step": 4725 + }, + { + "epoch": 2.14233907524932, + "grad_norm": 0.5894778696563956, + "learning_rate": 5.646466996523728e-05, + "loss": 0.8839, + "step": 4726 + }, + { + "epoch": 2.142792384406165, + "grad_norm": 0.5771008803546869, + "learning_rate": 5.645313247295176e-05, + "loss": 0.9251, + "step": 4727 + }, + { + "epoch": 2.14324569356301, + "grad_norm": 0.5353704086233648, + "learning_rate": 5.6441593332836676e-05, + "loss": 0.8801, + "step": 4728 + }, + { + "epoch": 2.143699002719855, + "grad_norm": 0.49326795424486186, + "learning_rate": 5.6430052546047686e-05, + "loss": 0.8939, + "step": 4729 + }, + { + "epoch": 2.1441523118767, + "grad_norm": 0.35613806476637405, + "learning_rate": 5.6418510113740664e-05, + "loss": 0.8879, + "step": 4730 + }, + { + "epoch": 2.144605621033545, + "grad_norm": 0.323313611906575, + "learning_rate": 5.640696603707159e-05, + "loss": 0.884, + "step": 4731 + }, + { + "epoch": 2.1450589301903897, + "grad_norm": 0.35678861587255445, + "learning_rate": 5.639542031719664e-05, + "loss": 0.8797, + "step": 4732 + }, + { + "epoch": 2.145512239347235, + "grad_norm": 0.34830692925084633, + "learning_rate": 5.6383872955272175e-05, + "loss": 0.8992, + "step": 4733 + }, + { + "epoch": 2.14596554850408, + "grad_norm": 0.38182615973434697, + "learning_rate": 5.637232395245467e-05, + "loss": 0.9134, + "step": 4734 + }, + { + "epoch": 2.1464188576609247, + "grad_norm": 0.48956836206613963, + "learning_rate": 5.6360773309900804e-05, + "loss": 0.8857, + "step": 4735 + }, + { + "epoch": 2.1468721668177695, + "grad_norm": 0.5512286064207162, + "learning_rate": 5.634922102876741e-05, + "loss": 0.8804, + "step": 4736 + }, + { + "epoch": 2.1473254759746148, + "grad_norm": 0.6452385131548105, + "learning_rate": 5.633766711021149e-05, + "loss": 0.9008, + "step": 4737 + }, + { + "epoch": 2.1477787851314596, + "grad_norm": 0.6446461657389668, + "learning_rate": 5.6326111555390186e-05, + "loss": 0.8821, + "step": 4738 + }, + { + "epoch": 2.1482320942883044, + "grad_norm": 0.651388281279698, + "learning_rate": 5.631455436546082e-05, + "loss": 0.8885, + "step": 4739 + }, + { + "epoch": 2.1486854034451497, + "grad_norm": 0.6616127483005828, + "learning_rate": 5.63029955415809e-05, + "loss": 0.8664, + "step": 4740 + }, + { + "epoch": 2.1491387126019945, + "grad_norm": 0.7127597259487963, + "learning_rate": 5.629143508490806e-05, + "loss": 0.8584, + "step": 4741 + }, + { + "epoch": 2.1495920217588393, + "grad_norm": 0.752114531040743, + "learning_rate": 5.6279872996600114e-05, + "loss": 0.874, + "step": 4742 + }, + { + "epoch": 2.1500453309156846, + "grad_norm": 0.7936736970466485, + "learning_rate": 5.6268309277815046e-05, + "loss": 0.8675, + "step": 4743 + }, + { + "epoch": 2.1504986400725294, + "grad_norm": 0.7522376111075707, + "learning_rate": 5.625674392971099e-05, + "loss": 0.869, + "step": 4744 + }, + { + "epoch": 2.1509519492293743, + "grad_norm": 0.6510091707473514, + "learning_rate": 5.624517695344624e-05, + "loss": 0.8658, + "step": 4745 + }, + { + "epoch": 2.1514052583862195, + "grad_norm": 0.49174446368378294, + "learning_rate": 5.623360835017929e-05, + "loss": 0.8914, + "step": 4746 + }, + { + "epoch": 2.1518585675430644, + "grad_norm": 0.45264892295694853, + "learning_rate": 5.622203812106875e-05, + "loss": 0.8801, + "step": 4747 + }, + { + "epoch": 2.152311876699909, + "grad_norm": 0.4636267984067917, + "learning_rate": 5.621046626727342e-05, + "loss": 0.8917, + "step": 4748 + }, + { + "epoch": 2.1527651858567545, + "grad_norm": 0.42840887927397775, + "learning_rate": 5.6198892789952244e-05, + "loss": 0.8686, + "step": 4749 + }, + { + "epoch": 2.1532184950135993, + "grad_norm": 0.4070906925043849, + "learning_rate": 5.618731769026434e-05, + "loss": 0.874, + "step": 4750 + }, + { + "epoch": 2.153671804170444, + "grad_norm": 0.3696458534859685, + "learning_rate": 5.617574096936901e-05, + "loss": 0.8614, + "step": 4751 + }, + { + "epoch": 2.1541251133272894, + "grad_norm": 0.30376194921682864, + "learning_rate": 5.616416262842568e-05, + "loss": 0.8793, + "step": 4752 + }, + { + "epoch": 2.1545784224841342, + "grad_norm": 0.2715216726244174, + "learning_rate": 5.615258266859396e-05, + "loss": 0.8853, + "step": 4753 + }, + { + "epoch": 2.155031731640979, + "grad_norm": 0.27721574019280637, + "learning_rate": 5.6141001091033605e-05, + "loss": 0.8702, + "step": 4754 + }, + { + "epoch": 2.1554850407978243, + "grad_norm": 0.3040289935158504, + "learning_rate": 5.612941789690455e-05, + "loss": 0.889, + "step": 4755 + }, + { + "epoch": 2.155938349954669, + "grad_norm": 0.2819501916778873, + "learning_rate": 5.6117833087366905e-05, + "loss": 0.8635, + "step": 4756 + }, + { + "epoch": 2.156391659111514, + "grad_norm": 0.2739266064855883, + "learning_rate": 5.61062466635809e-05, + "loss": 0.8815, + "step": 4757 + }, + { + "epoch": 2.1568449682683593, + "grad_norm": 0.33205340164607056, + "learning_rate": 5.609465862670696e-05, + "loss": 0.8859, + "step": 4758 + }, + { + "epoch": 2.157298277425204, + "grad_norm": 0.39087654516880016, + "learning_rate": 5.608306897790565e-05, + "loss": 0.877, + "step": 4759 + }, + { + "epoch": 2.157751586582049, + "grad_norm": 0.447105850088045, + "learning_rate": 5.607147771833772e-05, + "loss": 0.8647, + "step": 4760 + }, + { + "epoch": 2.1582048957388937, + "grad_norm": 0.5320308027987666, + "learning_rate": 5.605988484916405e-05, + "loss": 0.8654, + "step": 4761 + }, + { + "epoch": 2.158658204895739, + "grad_norm": 0.587378137256658, + "learning_rate": 5.6048290371545726e-05, + "loss": 0.8876, + "step": 4762 + }, + { + "epoch": 2.159111514052584, + "grad_norm": 0.6226268668221105, + "learning_rate": 5.6036694286643954e-05, + "loss": 0.8688, + "step": 4763 + }, + { + "epoch": 2.1595648232094287, + "grad_norm": 0.71205327854182, + "learning_rate": 5.602509659562011e-05, + "loss": 0.879, + "step": 4764 + }, + { + "epoch": 2.160018132366274, + "grad_norm": 0.7108317566602127, + "learning_rate": 5.601349729963575e-05, + "loss": 0.8968, + "step": 4765 + }, + { + "epoch": 2.1604714415231188, + "grad_norm": 0.6807788935354908, + "learning_rate": 5.600189639985258e-05, + "loss": 0.8942, + "step": 4766 + }, + { + "epoch": 2.1609247506799636, + "grad_norm": 0.6273927691739125, + "learning_rate": 5.599029389743246e-05, + "loss": 0.8818, + "step": 4767 + }, + { + "epoch": 2.161378059836809, + "grad_norm": 0.5691993445193647, + "learning_rate": 5.59786897935374e-05, + "loss": 0.869, + "step": 4768 + }, + { + "epoch": 2.1618313689936537, + "grad_norm": 0.516323479336695, + "learning_rate": 5.5967084089329603e-05, + "loss": 0.8638, + "step": 4769 + }, + { + "epoch": 2.1622846781504985, + "grad_norm": 0.5024972254295083, + "learning_rate": 5.59554767859714e-05, + "loss": 0.9023, + "step": 4770 + }, + { + "epoch": 2.162737987307344, + "grad_norm": 0.44222544669750213, + "learning_rate": 5.5943867884625314e-05, + "loss": 0.8976, + "step": 4771 + }, + { + "epoch": 2.1631912964641886, + "grad_norm": 0.41875181749299156, + "learning_rate": 5.5932257386454e-05, + "loss": 0.8787, + "step": 4772 + }, + { + "epoch": 2.1636446056210334, + "grad_norm": 0.36381104414288074, + "learning_rate": 5.592064529262028e-05, + "loss": 0.879, + "step": 4773 + }, + { + "epoch": 2.1640979147778787, + "grad_norm": 0.29929662721714306, + "learning_rate": 5.590903160428715e-05, + "loss": 0.8854, + "step": 4774 + }, + { + "epoch": 2.1645512239347235, + "grad_norm": 0.3255872603974135, + "learning_rate": 5.589741632261774e-05, + "loss": 0.8462, + "step": 4775 + }, + { + "epoch": 2.1650045330915684, + "grad_norm": 0.4120698753158708, + "learning_rate": 5.588579944877538e-05, + "loss": 0.8876, + "step": 4776 + }, + { + "epoch": 2.165457842248413, + "grad_norm": 0.472996073614943, + "learning_rate": 5.58741809839235e-05, + "loss": 0.8581, + "step": 4777 + }, + { + "epoch": 2.1659111514052585, + "grad_norm": 0.5114191315501465, + "learning_rate": 5.5862560929225745e-05, + "loss": 0.869, + "step": 4778 + }, + { + "epoch": 2.1663644605621033, + "grad_norm": 0.5954189408330235, + "learning_rate": 5.5850939285845896e-05, + "loss": 0.8769, + "step": 4779 + }, + { + "epoch": 2.166817769718948, + "grad_norm": 0.7019902537388452, + "learning_rate": 5.5839316054947884e-05, + "loss": 0.8768, + "step": 4780 + }, + { + "epoch": 2.1672710788757934, + "grad_norm": 0.7846005442643376, + "learning_rate": 5.582769123769583e-05, + "loss": 0.8828, + "step": 4781 + }, + { + "epoch": 2.1677243880326382, + "grad_norm": 0.7981698768835164, + "learning_rate": 5.581606483525399e-05, + "loss": 0.8763, + "step": 4782 + }, + { + "epoch": 2.168177697189483, + "grad_norm": 0.7057584252342092, + "learning_rate": 5.5804436848786755e-05, + "loss": 0.8695, + "step": 4783 + }, + { + "epoch": 2.1686310063463283, + "grad_norm": 0.5726923723822434, + "learning_rate": 5.579280727945874e-05, + "loss": 0.8958, + "step": 4784 + }, + { + "epoch": 2.169084315503173, + "grad_norm": 0.4723522477538353, + "learning_rate": 5.5781176128434645e-05, + "loss": 0.8849, + "step": 4785 + }, + { + "epoch": 2.169537624660018, + "grad_norm": 0.3680109338250474, + "learning_rate": 5.576954339687938e-05, + "loss": 0.8952, + "step": 4786 + }, + { + "epoch": 2.1699909338168633, + "grad_norm": 0.3588484037761595, + "learning_rate": 5.5757909085958016e-05, + "loss": 0.8959, + "step": 4787 + }, + { + "epoch": 2.170444242973708, + "grad_norm": 0.5205557014332567, + "learning_rate": 5.574627319683573e-05, + "loss": 0.8925, + "step": 4788 + }, + { + "epoch": 2.170897552130553, + "grad_norm": 0.46398384731607817, + "learning_rate": 5.573463573067791e-05, + "loss": 0.8659, + "step": 4789 + }, + { + "epoch": 2.171350861287398, + "grad_norm": 0.5770977296515021, + "learning_rate": 5.5722996688650076e-05, + "loss": 0.8919, + "step": 4790 + }, + { + "epoch": 2.171804170444243, + "grad_norm": 0.5753324509141756, + "learning_rate": 5.571135607191792e-05, + "loss": 0.8856, + "step": 4791 + }, + { + "epoch": 2.172257479601088, + "grad_norm": 0.5193603289690785, + "learning_rate": 5.5699713881647274e-05, + "loss": 0.892, + "step": 4792 + }, + { + "epoch": 2.172710788757933, + "grad_norm": 0.5461966879187565, + "learning_rate": 5.568807011900414e-05, + "loss": 0.8948, + "step": 4793 + }, + { + "epoch": 2.173164097914778, + "grad_norm": 1.3883760874905469, + "learning_rate": 5.5676424785154677e-05, + "loss": 0.8663, + "step": 4794 + }, + { + "epoch": 2.1736174070716228, + "grad_norm": 0.25862892642773355, + "learning_rate": 5.5664777881265214e-05, + "loss": 0.8856, + "step": 4795 + }, + { + "epoch": 2.174070716228468, + "grad_norm": 0.6416874000277034, + "learning_rate": 5.56531294085022e-05, + "loss": 0.8712, + "step": 4796 + }, + { + "epoch": 2.174524025385313, + "grad_norm": 1.107366747261881, + "learning_rate": 5.564147936803226e-05, + "loss": 0.8731, + "step": 4797 + }, + { + "epoch": 2.1749773345421577, + "grad_norm": 1.0369737954831324, + "learning_rate": 5.56298277610222e-05, + "loss": 0.8543, + "step": 4798 + }, + { + "epoch": 2.1754306436990025, + "grad_norm": 0.7235231035832544, + "learning_rate": 5.5618174588638965e-05, + "loss": 0.8755, + "step": 4799 + }, + { + "epoch": 2.175883952855848, + "grad_norm": 0.4854571881133867, + "learning_rate": 5.5606519852049634e-05, + "loss": 0.893, + "step": 4800 + }, + { + "epoch": 2.1763372620126926, + "grad_norm": 0.483701135145525, + "learning_rate": 5.559486355242147e-05, + "loss": 0.8828, + "step": 4801 + }, + { + "epoch": 2.1767905711695374, + "grad_norm": 0.5580550946905651, + "learning_rate": 5.558320569092189e-05, + "loss": 0.8892, + "step": 4802 + }, + { + "epoch": 2.1772438803263827, + "grad_norm": 0.6659739002695799, + "learning_rate": 5.557154626871847e-05, + "loss": 0.8739, + "step": 4803 + }, + { + "epoch": 2.1776971894832275, + "grad_norm": 0.7555627234308688, + "learning_rate": 5.5559885286978925e-05, + "loss": 0.8856, + "step": 4804 + }, + { + "epoch": 2.1781504986400724, + "grad_norm": 0.7521435613717428, + "learning_rate": 5.5548222746871143e-05, + "loss": 0.8742, + "step": 4805 + }, + { + "epoch": 2.1786038077969176, + "grad_norm": 0.6485901640816881, + "learning_rate": 5.5536558649563145e-05, + "loss": 0.8965, + "step": 4806 + }, + { + "epoch": 2.1790571169537625, + "grad_norm": 0.44045502425760796, + "learning_rate": 5.552489299622314e-05, + "loss": 0.8783, + "step": 4807 + }, + { + "epoch": 2.1795104261106073, + "grad_norm": 0.36068432023509794, + "learning_rate": 5.551322578801948e-05, + "loss": 0.8918, + "step": 4808 + }, + { + "epoch": 2.1799637352674526, + "grad_norm": 0.4422289096087341, + "learning_rate": 5.550155702612065e-05, + "loss": 0.8668, + "step": 4809 + }, + { + "epoch": 2.1804170444242974, + "grad_norm": 0.5130832356770436, + "learning_rate": 5.5489886711695345e-05, + "loss": 0.878, + "step": 4810 + }, + { + "epoch": 2.1808703535811422, + "grad_norm": 0.5740766339924029, + "learning_rate": 5.547821484591235e-05, + "loss": 0.8767, + "step": 4811 + }, + { + "epoch": 2.1813236627379875, + "grad_norm": 0.6407322148816381, + "learning_rate": 5.546654142994065e-05, + "loss": 0.8632, + "step": 4812 + }, + { + "epoch": 2.1817769718948323, + "grad_norm": 0.691986208068822, + "learning_rate": 5.5454866464949385e-05, + "loss": 0.8747, + "step": 4813 + }, + { + "epoch": 2.182230281051677, + "grad_norm": 0.7589218133949989, + "learning_rate": 5.5443189952107806e-05, + "loss": 0.8753, + "step": 4814 + }, + { + "epoch": 2.182683590208522, + "grad_norm": 0.7744665608296675, + "learning_rate": 5.543151189258538e-05, + "loss": 0.8756, + "step": 4815 + }, + { + "epoch": 2.1831368993653673, + "grad_norm": 0.63504006344866, + "learning_rate": 5.5419832287551675e-05, + "loss": 0.8677, + "step": 4816 + }, + { + "epoch": 2.183590208522212, + "grad_norm": 0.484932334984617, + "learning_rate": 5.5408151138176446e-05, + "loss": 0.8879, + "step": 4817 + }, + { + "epoch": 2.184043517679057, + "grad_norm": 0.39851769587198527, + "learning_rate": 5.539646844562961e-05, + "loss": 0.8709, + "step": 4818 + }, + { + "epoch": 2.184496826835902, + "grad_norm": 0.3965952086134999, + "learning_rate": 5.538478421108119e-05, + "loss": 0.8882, + "step": 4819 + }, + { + "epoch": 2.184950135992747, + "grad_norm": 0.4080709324690937, + "learning_rate": 5.5373098435701423e-05, + "loss": 0.8738, + "step": 4820 + }, + { + "epoch": 2.185403445149592, + "grad_norm": 0.4405543627317717, + "learning_rate": 5.5361411120660675e-05, + "loss": 0.8825, + "step": 4821 + }, + { + "epoch": 2.185856754306437, + "grad_norm": 0.415259713788134, + "learning_rate": 5.5349722267129455e-05, + "loss": 0.861, + "step": 4822 + }, + { + "epoch": 2.186310063463282, + "grad_norm": 0.35550124372548864, + "learning_rate": 5.5338031876278436e-05, + "loss": 0.8708, + "step": 4823 + }, + { + "epoch": 2.1867633726201268, + "grad_norm": 0.3594161961836957, + "learning_rate": 5.532633994927845e-05, + "loss": 0.8779, + "step": 4824 + }, + { + "epoch": 2.187216681776972, + "grad_norm": 0.3564011810007226, + "learning_rate": 5.5314646487300473e-05, + "loss": 0.8517, + "step": 4825 + }, + { + "epoch": 2.187669990933817, + "grad_norm": 0.33741359935666837, + "learning_rate": 5.530295149151564e-05, + "loss": 0.8537, + "step": 4826 + }, + { + "epoch": 2.1881233000906617, + "grad_norm": 0.40117564555283325, + "learning_rate": 5.529125496309524e-05, + "loss": 0.8779, + "step": 4827 + }, + { + "epoch": 2.188576609247507, + "grad_norm": 0.46867077935827234, + "learning_rate": 5.5279556903210725e-05, + "loss": 0.8893, + "step": 4828 + }, + { + "epoch": 2.189029918404352, + "grad_norm": 0.39507909804924746, + "learning_rate": 5.526785731303367e-05, + "loss": 0.872, + "step": 4829 + }, + { + "epoch": 2.1894832275611966, + "grad_norm": 0.5596585036162312, + "learning_rate": 5.525615619373583e-05, + "loss": 0.8749, + "step": 4830 + }, + { + "epoch": 2.189936536718042, + "grad_norm": 0.2938861907191448, + "learning_rate": 5.524445354648912e-05, + "loss": 0.8793, + "step": 4831 + }, + { + "epoch": 2.1903898458748867, + "grad_norm": 0.40017601681920695, + "learning_rate": 5.523274937246557e-05, + "loss": 0.8931, + "step": 4832 + }, + { + "epoch": 2.1908431550317315, + "grad_norm": 0.49940516723268236, + "learning_rate": 5.522104367283742e-05, + "loss": 0.9138, + "step": 4833 + }, + { + "epoch": 2.191296464188577, + "grad_norm": 0.5377699275256324, + "learning_rate": 5.520933644877701e-05, + "loss": 0.8768, + "step": 4834 + }, + { + "epoch": 2.1917497733454216, + "grad_norm": 0.5701546305217987, + "learning_rate": 5.5197627701456846e-05, + "loss": 0.8608, + "step": 4835 + }, + { + "epoch": 2.1922030825022665, + "grad_norm": 0.6162881803333637, + "learning_rate": 5.51859174320496e-05, + "loss": 0.875, + "step": 4836 + }, + { + "epoch": 2.1926563916591117, + "grad_norm": 0.6438797643345749, + "learning_rate": 5.51742056417281e-05, + "loss": 0.863, + "step": 4837 + }, + { + "epoch": 2.1931097008159566, + "grad_norm": 0.6033240187624191, + "learning_rate": 5.51624923316653e-05, + "loss": 0.8768, + "step": 4838 + }, + { + "epoch": 2.1935630099728014, + "grad_norm": 0.5494829430286348, + "learning_rate": 5.5150777503034345e-05, + "loss": 0.8801, + "step": 4839 + }, + { + "epoch": 2.1940163191296462, + "grad_norm": 0.4889450959872193, + "learning_rate": 5.513906115700849e-05, + "loss": 0.8785, + "step": 4840 + }, + { + "epoch": 2.1944696282864915, + "grad_norm": 0.39439072312148554, + "learning_rate": 5.512734329476117e-05, + "loss": 0.8995, + "step": 4841 + }, + { + "epoch": 2.1949229374433363, + "grad_norm": 0.38075368928671915, + "learning_rate": 5.511562391746596e-05, + "loss": 0.8731, + "step": 4842 + }, + { + "epoch": 2.195376246600181, + "grad_norm": 0.39419548455520054, + "learning_rate": 5.51039030262966e-05, + "loss": 0.8958, + "step": 4843 + }, + { + "epoch": 2.1958295557570264, + "grad_norm": 0.4589942868912707, + "learning_rate": 5.509218062242694e-05, + "loss": 0.8834, + "step": 4844 + }, + { + "epoch": 2.1962828649138713, + "grad_norm": 0.47731084170935845, + "learning_rate": 5.5080456707031054e-05, + "loss": 0.8706, + "step": 4845 + }, + { + "epoch": 2.196736174070716, + "grad_norm": 0.48985711022665535, + "learning_rate": 5.506873128128309e-05, + "loss": 0.8815, + "step": 4846 + }, + { + "epoch": 2.1971894832275614, + "grad_norm": 0.607660396933598, + "learning_rate": 5.505700434635741e-05, + "loss": 0.8794, + "step": 4847 + }, + { + "epoch": 2.197642792384406, + "grad_norm": 0.6882828726813031, + "learning_rate": 5.50452759034285e-05, + "loss": 0.8919, + "step": 4848 + }, + { + "epoch": 2.198096101541251, + "grad_norm": 0.7007881845207465, + "learning_rate": 5.5033545953670984e-05, + "loss": 0.8722, + "step": 4849 + }, + { + "epoch": 2.1985494106980963, + "grad_norm": 0.6421018821114959, + "learning_rate": 5.502181449825966e-05, + "loss": 0.8809, + "step": 4850 + }, + { + "epoch": 2.199002719854941, + "grad_norm": 0.5989681950406002, + "learning_rate": 5.501008153836946e-05, + "loss": 0.8916, + "step": 4851 + }, + { + "epoch": 2.199456029011786, + "grad_norm": 0.5147421213357903, + "learning_rate": 5.499834707517549e-05, + "loss": 0.8939, + "step": 4852 + }, + { + "epoch": 2.199909338168631, + "grad_norm": 0.3650270761523685, + "learning_rate": 5.498661110985298e-05, + "loss": 0.883, + "step": 4853 + }, + { + "epoch": 2.200362647325476, + "grad_norm": 0.3325836703189744, + "learning_rate": 5.497487364357732e-05, + "loss": 0.8789, + "step": 4854 + }, + { + "epoch": 2.200815956482321, + "grad_norm": 0.5650967544389922, + "learning_rate": 5.496313467752405e-05, + "loss": 0.8475, + "step": 4855 + }, + { + "epoch": 2.2012692656391657, + "grad_norm": 0.6532400366740393, + "learning_rate": 5.495139421286889e-05, + "loss": 0.8645, + "step": 4856 + }, + { + "epoch": 2.201722574796011, + "grad_norm": 0.6105080788171893, + "learning_rate": 5.4939652250787625e-05, + "loss": 0.8839, + "step": 4857 + }, + { + "epoch": 2.202175883952856, + "grad_norm": 0.5501098847107261, + "learning_rate": 5.49279087924563e-05, + "loss": 0.8844, + "step": 4858 + }, + { + "epoch": 2.2026291931097006, + "grad_norm": 0.470705631292352, + "learning_rate": 5.4916163839051024e-05, + "loss": 0.8895, + "step": 4859 + }, + { + "epoch": 2.203082502266546, + "grad_norm": 0.36550366431691733, + "learning_rate": 5.490441739174811e-05, + "loss": 0.8784, + "step": 4860 + }, + { + "epoch": 2.2035358114233907, + "grad_norm": 0.38441660376650016, + "learning_rate": 5.4892669451723984e-05, + "loss": 0.8825, + "step": 4861 + }, + { + "epoch": 2.2039891205802356, + "grad_norm": 0.4284247823276656, + "learning_rate": 5.488092002015526e-05, + "loss": 0.8823, + "step": 4862 + }, + { + "epoch": 2.204442429737081, + "grad_norm": 0.37614690131027845, + "learning_rate": 5.4869169098218656e-05, + "loss": 0.9098, + "step": 4863 + }, + { + "epoch": 2.2048957388939256, + "grad_norm": 0.3365368988372933, + "learning_rate": 5.4857416687091055e-05, + "loss": 0.8842, + "step": 4864 + }, + { + "epoch": 2.2053490480507705, + "grad_norm": 0.28962398927359784, + "learning_rate": 5.484566278794951e-05, + "loss": 0.8915, + "step": 4865 + }, + { + "epoch": 2.2058023572076157, + "grad_norm": 0.29212648277849734, + "learning_rate": 5.483390740197122e-05, + "loss": 0.8767, + "step": 4866 + }, + { + "epoch": 2.2062556663644606, + "grad_norm": 0.3496485369191403, + "learning_rate": 5.482215053033349e-05, + "loss": 0.8591, + "step": 4867 + }, + { + "epoch": 2.2067089755213054, + "grad_norm": 0.38541951551843834, + "learning_rate": 5.481039217421384e-05, + "loss": 0.8641, + "step": 4868 + }, + { + "epoch": 2.2071622846781507, + "grad_norm": 0.36492493924533803, + "learning_rate": 5.4798632334789876e-05, + "loss": 0.8844, + "step": 4869 + }, + { + "epoch": 2.2076155938349955, + "grad_norm": 0.3083565460945788, + "learning_rate": 5.478687101323939e-05, + "loss": 0.8788, + "step": 4870 + }, + { + "epoch": 2.2080689029918403, + "grad_norm": 0.3397278867109425, + "learning_rate": 5.477510821074032e-05, + "loss": 0.8868, + "step": 4871 + }, + { + "epoch": 2.2085222121486856, + "grad_norm": 0.37721535436489784, + "learning_rate": 5.476334392847074e-05, + "loss": 0.8673, + "step": 4872 + }, + { + "epoch": 2.2089755213055304, + "grad_norm": 0.365135371873888, + "learning_rate": 5.475157816760888e-05, + "loss": 0.8431, + "step": 4873 + }, + { + "epoch": 2.2094288304623753, + "grad_norm": 0.3674202507657544, + "learning_rate": 5.4739810929333096e-05, + "loss": 0.8823, + "step": 4874 + }, + { + "epoch": 2.2098821396192205, + "grad_norm": 0.36957693334447705, + "learning_rate": 5.4728042214821945e-05, + "loss": 0.8599, + "step": 4875 + }, + { + "epoch": 2.2103354487760654, + "grad_norm": 0.32938146526170925, + "learning_rate": 5.471627202525407e-05, + "loss": 0.8802, + "step": 4876 + }, + { + "epoch": 2.21078875793291, + "grad_norm": 0.3811534035621776, + "learning_rate": 5.4704500361808305e-05, + "loss": 0.8635, + "step": 4877 + }, + { + "epoch": 2.211242067089755, + "grad_norm": 0.45016829230187694, + "learning_rate": 5.469272722566361e-05, + "loss": 0.8676, + "step": 4878 + }, + { + "epoch": 2.2116953762466003, + "grad_norm": 0.4885957197720917, + "learning_rate": 5.468095261799911e-05, + "loss": 0.8847, + "step": 4879 + }, + { + "epoch": 2.212148685403445, + "grad_norm": 0.5366909491823606, + "learning_rate": 5.466917653999405e-05, + "loss": 0.8701, + "step": 4880 + }, + { + "epoch": 2.21260199456029, + "grad_norm": 0.6627013125417538, + "learning_rate": 5.465739899282787e-05, + "loss": 0.8785, + "step": 4881 + }, + { + "epoch": 2.213055303717135, + "grad_norm": 0.6775193103580031, + "learning_rate": 5.464561997768008e-05, + "loss": 0.8761, + "step": 4882 + }, + { + "epoch": 2.21350861287398, + "grad_norm": 0.6383309297624936, + "learning_rate": 5.4633839495730415e-05, + "loss": 0.8689, + "step": 4883 + }, + { + "epoch": 2.213961922030825, + "grad_norm": 0.5389244039090572, + "learning_rate": 5.462205754815871e-05, + "loss": 0.8713, + "step": 4884 + }, + { + "epoch": 2.21441523118767, + "grad_norm": 0.4703349692916239, + "learning_rate": 5.461027413614497e-05, + "loss": 0.8671, + "step": 4885 + }, + { + "epoch": 2.214868540344515, + "grad_norm": 0.44997515536246147, + "learning_rate": 5.459848926086935e-05, + "loss": 0.8547, + "step": 4886 + }, + { + "epoch": 2.21532184950136, + "grad_norm": 0.4619712800384572, + "learning_rate": 5.458670292351211e-05, + "loss": 0.8902, + "step": 4887 + }, + { + "epoch": 2.215775158658205, + "grad_norm": 0.41683794651439404, + "learning_rate": 5.457491512525371e-05, + "loss": 0.8572, + "step": 4888 + }, + { + "epoch": 2.21622846781505, + "grad_norm": 0.34225815762094686, + "learning_rate": 5.456312586727472e-05, + "loss": 0.88, + "step": 4889 + }, + { + "epoch": 2.2166817769718947, + "grad_norm": 0.30786849634634705, + "learning_rate": 5.455133515075588e-05, + "loss": 0.8894, + "step": 4890 + }, + { + "epoch": 2.21713508612874, + "grad_norm": 0.3309659796576903, + "learning_rate": 5.453954297687806e-05, + "loss": 0.8764, + "step": 4891 + }, + { + "epoch": 2.217588395285585, + "grad_norm": 0.31213172977179493, + "learning_rate": 5.452774934682227e-05, + "loss": 0.87, + "step": 4892 + }, + { + "epoch": 2.2180417044424297, + "grad_norm": 0.3073605637593185, + "learning_rate": 5.451595426176969e-05, + "loss": 0.8751, + "step": 4893 + }, + { + "epoch": 2.2184950135992745, + "grad_norm": 0.3294403843260236, + "learning_rate": 5.450415772290163e-05, + "loss": 0.8619, + "step": 4894 + }, + { + "epoch": 2.2189483227561198, + "grad_norm": 1.2612348005615244, + "learning_rate": 5.4492359731399555e-05, + "loss": 0.8881, + "step": 4895 + }, + { + "epoch": 2.2194016319129646, + "grad_norm": 0.3016473336579799, + "learning_rate": 5.4480560288445055e-05, + "loss": 0.8895, + "step": 4896 + }, + { + "epoch": 2.2198549410698094, + "grad_norm": 0.4156149990700991, + "learning_rate": 5.446875939521988e-05, + "loss": 0.8648, + "step": 4897 + }, + { + "epoch": 2.2203082502266547, + "grad_norm": 1.1396226462424783, + "learning_rate": 5.445695705290594e-05, + "loss": 0.887, + "step": 4898 + }, + { + "epoch": 2.2207615593834995, + "grad_norm": 0.32799083645265864, + "learning_rate": 5.4445153262685256e-05, + "loss": 0.8921, + "step": 4899 + }, + { + "epoch": 2.2212148685403443, + "grad_norm": 0.35448241173431644, + "learning_rate": 5.4433348025740014e-05, + "loss": 0.866, + "step": 4900 + }, + { + "epoch": 2.2216681776971896, + "grad_norm": 0.44753793815185605, + "learning_rate": 5.4421541343252566e-05, + "loss": 0.8883, + "step": 4901 + }, + { + "epoch": 2.2221214868540344, + "grad_norm": 0.5715928967573293, + "learning_rate": 5.4409733216405356e-05, + "loss": 0.8829, + "step": 4902 + }, + { + "epoch": 2.2225747960108793, + "grad_norm": 0.6058075440479654, + "learning_rate": 5.439792364638101e-05, + "loss": 0.8679, + "step": 4903 + }, + { + "epoch": 2.2230281051677245, + "grad_norm": 0.6653213051649463, + "learning_rate": 5.4386112634362304e-05, + "loss": 0.8708, + "step": 4904 + }, + { + "epoch": 2.2234814143245694, + "grad_norm": 0.7344331929762148, + "learning_rate": 5.4374300181532136e-05, + "loss": 0.8672, + "step": 4905 + }, + { + "epoch": 2.223934723481414, + "grad_norm": 0.7616796545505374, + "learning_rate": 5.436248628907355e-05, + "loss": 0.8621, + "step": 4906 + }, + { + "epoch": 2.2243880326382595, + "grad_norm": 0.7064080089532961, + "learning_rate": 5.435067095816976e-05, + "loss": 0.9048, + "step": 4907 + }, + { + "epoch": 2.2248413417951043, + "grad_norm": 0.6728689050858431, + "learning_rate": 5.43388541900041e-05, + "loss": 0.8787, + "step": 4908 + }, + { + "epoch": 2.225294650951949, + "grad_norm": 0.6280895524318776, + "learning_rate": 5.432703598576004e-05, + "loss": 0.8736, + "step": 4909 + }, + { + "epoch": 2.2257479601087944, + "grad_norm": 0.5005726675408083, + "learning_rate": 5.4315216346621246e-05, + "loss": 0.8987, + "step": 4910 + }, + { + "epoch": 2.226201269265639, + "grad_norm": 0.34535303920323235, + "learning_rate": 5.430339527377144e-05, + "loss": 0.8777, + "step": 4911 + }, + { + "epoch": 2.226654578422484, + "grad_norm": 0.32362624671160434, + "learning_rate": 5.429157276839457e-05, + "loss": 0.8681, + "step": 4912 + }, + { + "epoch": 2.2271078875793293, + "grad_norm": 0.3635228370634084, + "learning_rate": 5.427974883167468e-05, + "loss": 0.9012, + "step": 4913 + }, + { + "epoch": 2.227561196736174, + "grad_norm": 0.41319362980813745, + "learning_rate": 5.426792346479598e-05, + "loss": 0.8638, + "step": 4914 + }, + { + "epoch": 2.228014505893019, + "grad_norm": 0.42124915693877135, + "learning_rate": 5.425609666894281e-05, + "loss": 0.8889, + "step": 4915 + }, + { + "epoch": 2.2284678150498642, + "grad_norm": 0.38978539297402287, + "learning_rate": 5.4244268445299665e-05, + "loss": 0.8777, + "step": 4916 + }, + { + "epoch": 2.228921124206709, + "grad_norm": 0.4163926497266152, + "learning_rate": 5.423243879505117e-05, + "loss": 0.8719, + "step": 4917 + }, + { + "epoch": 2.229374433363554, + "grad_norm": 0.44502219985697333, + "learning_rate": 5.42206077193821e-05, + "loss": 0.8751, + "step": 4918 + }, + { + "epoch": 2.2298277425203987, + "grad_norm": 0.4479755397982564, + "learning_rate": 5.420877521947737e-05, + "loss": 0.891, + "step": 4919 + }, + { + "epoch": 2.230281051677244, + "grad_norm": 0.4852826374781826, + "learning_rate": 5.419694129652206e-05, + "loss": 0.8852, + "step": 4920 + }, + { + "epoch": 2.230734360834089, + "grad_norm": 0.607857588408353, + "learning_rate": 5.4185105951701335e-05, + "loss": 0.892, + "step": 4921 + }, + { + "epoch": 2.2311876699909337, + "grad_norm": 0.6924000578649511, + "learning_rate": 5.417326918620056e-05, + "loss": 0.8921, + "step": 4922 + }, + { + "epoch": 2.231640979147779, + "grad_norm": 0.7452563882033776, + "learning_rate": 5.416143100120523e-05, + "loss": 0.8925, + "step": 4923 + }, + { + "epoch": 2.2320942883046238, + "grad_norm": 0.82242660526943, + "learning_rate": 5.414959139790096e-05, + "loss": 0.8821, + "step": 4924 + }, + { + "epoch": 2.2325475974614686, + "grad_norm": 0.8230353651301163, + "learning_rate": 5.4137750377473536e-05, + "loss": 0.8904, + "step": 4925 + }, + { + "epoch": 2.233000906618314, + "grad_norm": 0.7552090297213019, + "learning_rate": 5.412590794110886e-05, + "loss": 0.8871, + "step": 4926 + }, + { + "epoch": 2.2334542157751587, + "grad_norm": 0.7666558954421583, + "learning_rate": 5.411406408999298e-05, + "loss": 0.8866, + "step": 4927 + }, + { + "epoch": 2.2339075249320035, + "grad_norm": 0.7119450769112671, + "learning_rate": 5.41022188253121e-05, + "loss": 0.8682, + "step": 4928 + }, + { + "epoch": 2.234360834088849, + "grad_norm": 0.5447554275183544, + "learning_rate": 5.4090372148252575e-05, + "loss": 0.8736, + "step": 4929 + }, + { + "epoch": 2.2348141432456936, + "grad_norm": 0.3238200196527693, + "learning_rate": 5.407852406000086e-05, + "loss": 0.8749, + "step": 4930 + }, + { + "epoch": 2.2352674524025384, + "grad_norm": 0.3463741627888589, + "learning_rate": 5.406667456174358e-05, + "loss": 0.8747, + "step": 4931 + }, + { + "epoch": 2.2357207615593837, + "grad_norm": 0.5363215260087922, + "learning_rate": 5.40548236546675e-05, + "loss": 0.8891, + "step": 4932 + }, + { + "epoch": 2.2361740707162285, + "grad_norm": 0.5923159149579007, + "learning_rate": 5.4042971339959524e-05, + "loss": 0.8766, + "step": 4933 + }, + { + "epoch": 2.2366273798730734, + "grad_norm": 0.6300206713351357, + "learning_rate": 5.40311176188067e-05, + "loss": 0.8588, + "step": 4934 + }, + { + "epoch": 2.237080689029918, + "grad_norm": 0.6557782332926844, + "learning_rate": 5.401926249239621e-05, + "loss": 0.8921, + "step": 4935 + }, + { + "epoch": 2.2375339981867635, + "grad_norm": 0.6567345814318327, + "learning_rate": 5.400740596191538e-05, + "loss": 0.8785, + "step": 4936 + }, + { + "epoch": 2.2379873073436083, + "grad_norm": 0.5290324369527498, + "learning_rate": 5.399554802855167e-05, + "loss": 0.8859, + "step": 4937 + }, + { + "epoch": 2.238440616500453, + "grad_norm": 0.41477467026654946, + "learning_rate": 5.39836886934927e-05, + "loss": 0.8857, + "step": 4938 + }, + { + "epoch": 2.2388939256572984, + "grad_norm": 0.35049802743413166, + "learning_rate": 5.3971827957926214e-05, + "loss": 0.8774, + "step": 4939 + }, + { + "epoch": 2.239347234814143, + "grad_norm": 0.2967265134297082, + "learning_rate": 5.395996582304009e-05, + "loss": 0.8848, + "step": 4940 + }, + { + "epoch": 2.239800543970988, + "grad_norm": 0.22962253309348635, + "learning_rate": 5.394810229002237e-05, + "loss": 0.8851, + "step": 4941 + }, + { + "epoch": 2.2402538531278333, + "grad_norm": 0.33519773094244487, + "learning_rate": 5.3936237360061205e-05, + "loss": 0.8742, + "step": 4942 + }, + { + "epoch": 2.240707162284678, + "grad_norm": 0.4767309593516099, + "learning_rate": 5.392437103434491e-05, + "loss": 0.8825, + "step": 4943 + }, + { + "epoch": 2.241160471441523, + "grad_norm": 0.5325117996228746, + "learning_rate": 5.391250331406193e-05, + "loss": 0.8933, + "step": 4944 + }, + { + "epoch": 2.2416137805983682, + "grad_norm": 0.6125308357309032, + "learning_rate": 5.3900634200400856e-05, + "loss": 0.8836, + "step": 4945 + }, + { + "epoch": 2.242067089755213, + "grad_norm": 0.6908711113165782, + "learning_rate": 5.388876369455041e-05, + "loss": 0.8816, + "step": 4946 + }, + { + "epoch": 2.242520398912058, + "grad_norm": 0.7569454476668338, + "learning_rate": 5.3876891797699466e-05, + "loss": 0.8945, + "step": 4947 + }, + { + "epoch": 2.242973708068903, + "grad_norm": 0.7544117866772967, + "learning_rate": 5.386501851103702e-05, + "loss": 0.8674, + "step": 4948 + }, + { + "epoch": 2.243427017225748, + "grad_norm": 0.6317782282933646, + "learning_rate": 5.385314383575225e-05, + "loss": 0.8674, + "step": 4949 + }, + { + "epoch": 2.243880326382593, + "grad_norm": 0.4121317320200252, + "learning_rate": 5.384126777303438e-05, + "loss": 0.8844, + "step": 4950 + }, + { + "epoch": 2.244333635539438, + "grad_norm": 0.32722108121801696, + "learning_rate": 5.382939032407288e-05, + "loss": 0.8634, + "step": 4951 + }, + { + "epoch": 2.244786944696283, + "grad_norm": 0.23615664452310747, + "learning_rate": 5.381751149005729e-05, + "loss": 0.8788, + "step": 4952 + }, + { + "epoch": 2.2452402538531278, + "grad_norm": 0.29805853687512585, + "learning_rate": 5.3805631272177304e-05, + "loss": 0.8842, + "step": 4953 + }, + { + "epoch": 2.245693563009973, + "grad_norm": 0.4153851683419297, + "learning_rate": 5.379374967162279e-05, + "loss": 0.8788, + "step": 4954 + }, + { + "epoch": 2.246146872166818, + "grad_norm": 0.4987653761799615, + "learning_rate": 5.378186668958369e-05, + "loss": 0.8688, + "step": 4955 + }, + { + "epoch": 2.2466001813236627, + "grad_norm": 0.5221076784871495, + "learning_rate": 5.3769982327250155e-05, + "loss": 0.8782, + "step": 4956 + }, + { + "epoch": 2.2470534904805075, + "grad_norm": 0.47849718252459017, + "learning_rate": 5.37580965858124e-05, + "loss": 0.8831, + "step": 4957 + }, + { + "epoch": 2.247506799637353, + "grad_norm": 0.41440229592063843, + "learning_rate": 5.374620946646086e-05, + "loss": 0.8818, + "step": 4958 + }, + { + "epoch": 2.2479601087941976, + "grad_norm": 0.415509843055896, + "learning_rate": 5.3734320970386036e-05, + "loss": 0.8804, + "step": 4959 + }, + { + "epoch": 2.2484134179510424, + "grad_norm": 0.416951294917652, + "learning_rate": 5.3722431098778594e-05, + "loss": 0.8831, + "step": 4960 + }, + { + "epoch": 2.2488667271078877, + "grad_norm": 0.4081541359955584, + "learning_rate": 5.371053985282935e-05, + "loss": 0.8703, + "step": 4961 + }, + { + "epoch": 2.2493200362647325, + "grad_norm": 0.3852870745335183, + "learning_rate": 5.3698647233729245e-05, + "loss": 0.8879, + "step": 4962 + }, + { + "epoch": 2.2497733454215774, + "grad_norm": 0.3711964793447073, + "learning_rate": 5.368675324266936e-05, + "loss": 0.9028, + "step": 4963 + }, + { + "epoch": 2.2502266545784226, + "grad_norm": 0.3832133757472111, + "learning_rate": 5.367485788084091e-05, + "loss": 0.8835, + "step": 4964 + }, + { + "epoch": 2.2506799637352675, + "grad_norm": 0.3499184474090866, + "learning_rate": 5.3662961149435265e-05, + "loss": 0.8827, + "step": 4965 + }, + { + "epoch": 2.2511332728921123, + "grad_norm": 0.3505847681953082, + "learning_rate": 5.36510630496439e-05, + "loss": 0.8664, + "step": 4966 + }, + { + "epoch": 2.2515865820489576, + "grad_norm": 0.3875724591996002, + "learning_rate": 5.363916358265846e-05, + "loss": 0.8853, + "step": 4967 + }, + { + "epoch": 2.2520398912058024, + "grad_norm": 0.4423798690626043, + "learning_rate": 5.362726274967069e-05, + "loss": 0.8901, + "step": 4968 + }, + { + "epoch": 2.252493200362647, + "grad_norm": 0.44266919535192817, + "learning_rate": 5.361536055187251e-05, + "loss": 0.8635, + "step": 4969 + }, + { + "epoch": 2.252946509519492, + "grad_norm": 0.3812820252713256, + "learning_rate": 5.3603456990455956e-05, + "loss": 0.8759, + "step": 4970 + }, + { + "epoch": 2.2533998186763373, + "grad_norm": 0.33979678078742415, + "learning_rate": 5.359155206661321e-05, + "loss": 0.875, + "step": 4971 + }, + { + "epoch": 2.253853127833182, + "grad_norm": 2.487346031876852, + "learning_rate": 5.3579645781536566e-05, + "loss": 0.8976, + "step": 4972 + }, + { + "epoch": 2.254306436990027, + "grad_norm": 0.45851618394546884, + "learning_rate": 5.35677381364185e-05, + "loss": 0.8839, + "step": 4973 + }, + { + "epoch": 2.2547597461468722, + "grad_norm": 0.8555872411749581, + "learning_rate": 5.355582913245157e-05, + "loss": 0.869, + "step": 4974 + }, + { + "epoch": 2.255213055303717, + "grad_norm": 1.1815367029521169, + "learning_rate": 5.354391877082853e-05, + "loss": 0.8614, + "step": 4975 + }, + { + "epoch": 2.255666364460562, + "grad_norm": 0.7864897484248808, + "learning_rate": 5.353200705274221e-05, + "loss": 0.8865, + "step": 4976 + }, + { + "epoch": 2.256119673617407, + "grad_norm": 0.5614992132989326, + "learning_rate": 5.3520093979385617e-05, + "loss": 0.883, + "step": 4977 + }, + { + "epoch": 2.256572982774252, + "grad_norm": 0.6492019233691656, + "learning_rate": 5.3508179551951876e-05, + "loss": 0.8888, + "step": 4978 + }, + { + "epoch": 2.257026291931097, + "grad_norm": 1.0574370325620897, + "learning_rate": 5.349626377163425e-05, + "loss": 0.8768, + "step": 4979 + }, + { + "epoch": 2.257479601087942, + "grad_norm": 0.9690763141578795, + "learning_rate": 5.348434663962614e-05, + "loss": 0.8331, + "step": 4980 + }, + { + "epoch": 2.257932910244787, + "grad_norm": 0.6645626032028035, + "learning_rate": 5.347242815712109e-05, + "loss": 0.8781, + "step": 4981 + }, + { + "epoch": 2.2583862194016318, + "grad_norm": 0.5950176446511587, + "learning_rate": 5.346050832531277e-05, + "loss": 0.8603, + "step": 4982 + }, + { + "epoch": 2.258839528558477, + "grad_norm": 0.6330200219431598, + "learning_rate": 5.344858714539499e-05, + "loss": 0.8952, + "step": 4983 + }, + { + "epoch": 2.259292837715322, + "grad_norm": 0.7316350649474501, + "learning_rate": 5.343666461856166e-05, + "loss": 0.8925, + "step": 4984 + }, + { + "epoch": 2.2597461468721667, + "grad_norm": 0.8747452672951347, + "learning_rate": 5.342474074600689e-05, + "loss": 0.8647, + "step": 4985 + }, + { + "epoch": 2.260199456029012, + "grad_norm": 0.8297700718467051, + "learning_rate": 5.341281552892487e-05, + "loss": 0.8622, + "step": 4986 + }, + { + "epoch": 2.260652765185857, + "grad_norm": 0.7267365484923471, + "learning_rate": 5.3400888968509974e-05, + "loss": 0.8739, + "step": 4987 + }, + { + "epoch": 2.2611060743427016, + "grad_norm": 0.5779867988548791, + "learning_rate": 5.338896106595665e-05, + "loss": 0.8661, + "step": 4988 + }, + { + "epoch": 2.261559383499547, + "grad_norm": 0.467972646611008, + "learning_rate": 5.337703182245953e-05, + "loss": 0.8773, + "step": 4989 + }, + { + "epoch": 2.2620126926563917, + "grad_norm": 0.3899872243002075, + "learning_rate": 5.336510123921336e-05, + "loss": 0.8659, + "step": 4990 + }, + { + "epoch": 2.2624660018132365, + "grad_norm": 0.4380264150723595, + "learning_rate": 5.335316931741301e-05, + "loss": 0.8726, + "step": 4991 + }, + { + "epoch": 2.262919310970082, + "grad_norm": 0.6012968192534384, + "learning_rate": 5.334123605825352e-05, + "loss": 0.888, + "step": 4992 + }, + { + "epoch": 2.2633726201269266, + "grad_norm": 1.020586235276103, + "learning_rate": 5.332930146293002e-05, + "loss": 0.9089, + "step": 4993 + }, + { + "epoch": 2.2638259292837715, + "grad_norm": 0.5260590922687005, + "learning_rate": 5.3317365532637814e-05, + "loss": 0.8731, + "step": 4994 + }, + { + "epoch": 2.2642792384406167, + "grad_norm": 0.49590162826902573, + "learning_rate": 5.3305428268572304e-05, + "loss": 0.8688, + "step": 4995 + }, + { + "epoch": 2.2647325475974616, + "grad_norm": 0.5343244651127241, + "learning_rate": 5.329348967192906e-05, + "loss": 0.9023, + "step": 4996 + }, + { + "epoch": 2.2651858567543064, + "grad_norm": 0.5207452556644578, + "learning_rate": 5.3281549743903754e-05, + "loss": 0.8855, + "step": 4997 + }, + { + "epoch": 2.265639165911151, + "grad_norm": 0.5151874219206892, + "learning_rate": 5.3269608485692206e-05, + "loss": 0.8741, + "step": 4998 + }, + { + "epoch": 2.2660924750679965, + "grad_norm": 0.5615767697135656, + "learning_rate": 5.325766589849036e-05, + "loss": 0.8755, + "step": 4999 + }, + { + "epoch": 2.2665457842248413, + "grad_norm": 0.4834146583884921, + "learning_rate": 5.324572198349432e-05, + "loss": 0.8712, + "step": 5000 + }, + { + "epoch": 2.266999093381686, + "grad_norm": 0.39315507840825753, + "learning_rate": 5.3233776741900295e-05, + "loss": 0.8703, + "step": 5001 + }, + { + "epoch": 2.2674524025385314, + "grad_norm": 0.5277356854965118, + "learning_rate": 5.3221830174904634e-05, + "loss": 0.8704, + "step": 5002 + }, + { + "epoch": 2.2679057116953762, + "grad_norm": 0.5633057270396042, + "learning_rate": 5.320988228370382e-05, + "loss": 0.8726, + "step": 5003 + }, + { + "epoch": 2.268359020852221, + "grad_norm": 0.5042271942740735, + "learning_rate": 5.319793306949448e-05, + "loss": 0.8881, + "step": 5004 + }, + { + "epoch": 2.2688123300090663, + "grad_norm": 0.5316450656043279, + "learning_rate": 5.3185982533473344e-05, + "loss": 0.8847, + "step": 5005 + }, + { + "epoch": 2.269265639165911, + "grad_norm": 0.5677382587044464, + "learning_rate": 5.3174030676837314e-05, + "loss": 0.8896, + "step": 5006 + }, + { + "epoch": 2.269718948322756, + "grad_norm": 0.5735441498328504, + "learning_rate": 5.316207750078339e-05, + "loss": 0.8837, + "step": 5007 + }, + { + "epoch": 2.2701722574796013, + "grad_norm": 0.47065583260972177, + "learning_rate": 5.315012300650871e-05, + "loss": 0.8665, + "step": 5008 + }, + { + "epoch": 2.270625566636446, + "grad_norm": 0.39232046690221356, + "learning_rate": 5.3138167195210576e-05, + "loss": 0.8996, + "step": 5009 + }, + { + "epoch": 2.271078875793291, + "grad_norm": 0.38162674304494193, + "learning_rate": 5.3126210068086386e-05, + "loss": 0.8715, + "step": 5010 + }, + { + "epoch": 2.2715321849501358, + "grad_norm": 0.4132927202854033, + "learning_rate": 5.311425162633368e-05, + "loss": 0.8891, + "step": 5011 + }, + { + "epoch": 2.271985494106981, + "grad_norm": 0.4571105658114805, + "learning_rate": 5.3102291871150134e-05, + "loss": 0.8715, + "step": 5012 + }, + { + "epoch": 2.272438803263826, + "grad_norm": 0.5173474999992865, + "learning_rate": 5.309033080373354e-05, + "loss": 0.8817, + "step": 5013 + }, + { + "epoch": 2.2728921124206707, + "grad_norm": 0.417367623325475, + "learning_rate": 5.3078368425281855e-05, + "loss": 0.8755, + "step": 5014 + }, + { + "epoch": 2.273345421577516, + "grad_norm": 0.2595865145838421, + "learning_rate": 5.306640473699313e-05, + "loss": 0.8888, + "step": 5015 + }, + { + "epoch": 2.273798730734361, + "grad_norm": 0.316100029062705, + "learning_rate": 5.305443974006557e-05, + "loss": 0.8963, + "step": 5016 + }, + { + "epoch": 2.2742520398912056, + "grad_norm": 0.3947235730222884, + "learning_rate": 5.304247343569751e-05, + "loss": 0.8683, + "step": 5017 + }, + { + "epoch": 2.274705349048051, + "grad_norm": 0.38590778016413424, + "learning_rate": 5.3030505825087394e-05, + "loss": 0.8741, + "step": 5018 + }, + { + "epoch": 2.2751586582048957, + "grad_norm": 0.42551502453073575, + "learning_rate": 5.301853690943382e-05, + "loss": 0.8907, + "step": 5019 + }, + { + "epoch": 2.2756119673617405, + "grad_norm": 0.4713301597637494, + "learning_rate": 5.3006566689935524e-05, + "loss": 0.8921, + "step": 5020 + }, + { + "epoch": 2.276065276518586, + "grad_norm": 0.4397610984079893, + "learning_rate": 5.299459516779134e-05, + "loss": 0.8688, + "step": 5021 + }, + { + "epoch": 2.2765185856754306, + "grad_norm": 0.417439130092217, + "learning_rate": 5.2982622344200264e-05, + "loss": 0.8835, + "step": 5022 + }, + { + "epoch": 2.2769718948322755, + "grad_norm": 0.4330917656722008, + "learning_rate": 5.297064822036141e-05, + "loss": 0.8806, + "step": 5023 + }, + { + "epoch": 2.2774252039891207, + "grad_norm": 0.4318202238973824, + "learning_rate": 5.295867279747402e-05, + "loss": 0.8832, + "step": 5024 + }, + { + "epoch": 2.2778785131459656, + "grad_norm": 0.42957641773363203, + "learning_rate": 5.294669607673745e-05, + "loss": 0.8717, + "step": 5025 + }, + { + "epoch": 2.2783318223028104, + "grad_norm": 0.44013117780591704, + "learning_rate": 5.2934718059351235e-05, + "loss": 0.869, + "step": 5026 + }, + { + "epoch": 2.2787851314596557, + "grad_norm": 0.38962852284880717, + "learning_rate": 5.292273874651498e-05, + "loss": 0.8769, + "step": 5027 + }, + { + "epoch": 2.2792384406165005, + "grad_norm": 0.31063958437132627, + "learning_rate": 5.2910758139428474e-05, + "loss": 0.8811, + "step": 5028 + }, + { + "epoch": 2.2796917497733453, + "grad_norm": 0.30005275513618057, + "learning_rate": 5.2898776239291575e-05, + "loss": 0.8776, + "step": 5029 + }, + { + "epoch": 2.2801450589301906, + "grad_norm": 0.3319341766369852, + "learning_rate": 5.2886793047304334e-05, + "loss": 0.8769, + "step": 5030 + }, + { + "epoch": 2.2805983680870354, + "grad_norm": 0.36543539944555237, + "learning_rate": 5.2874808564666904e-05, + "loss": 0.8706, + "step": 5031 + }, + { + "epoch": 2.2810516772438802, + "grad_norm": 0.44308961338060404, + "learning_rate": 5.286282279257955e-05, + "loss": 0.8768, + "step": 5032 + }, + { + "epoch": 2.2815049864007255, + "grad_norm": 0.9516246451283887, + "learning_rate": 5.2850835732242684e-05, + "loss": 0.8962, + "step": 5033 + }, + { + "epoch": 2.2819582955575703, + "grad_norm": 0.48689993239890095, + "learning_rate": 5.283884738485686e-05, + "loss": 0.8887, + "step": 5034 + }, + { + "epoch": 2.282411604714415, + "grad_norm": 0.5583619813744088, + "learning_rate": 5.282685775162275e-05, + "loss": 0.876, + "step": 5035 + }, + { + "epoch": 2.2828649138712604, + "grad_norm": 0.6366590301385431, + "learning_rate": 5.2814866833741125e-05, + "loss": 0.865, + "step": 5036 + }, + { + "epoch": 2.2833182230281053, + "grad_norm": 0.6301623118638524, + "learning_rate": 5.280287463241292e-05, + "loss": 0.8816, + "step": 5037 + }, + { + "epoch": 2.28377153218495, + "grad_norm": 0.5670393667917114, + "learning_rate": 5.2790881148839204e-05, + "loss": 0.8532, + "step": 5038 + }, + { + "epoch": 2.284224841341795, + "grad_norm": 0.5403972120161724, + "learning_rate": 5.2778886384221144e-05, + "loss": 0.8797, + "step": 5039 + }, + { + "epoch": 2.28467815049864, + "grad_norm": 0.46459521065202003, + "learning_rate": 5.276689033976006e-05, + "loss": 0.869, + "step": 5040 + }, + { + "epoch": 2.285131459655485, + "grad_norm": 0.39090296406178476, + "learning_rate": 5.2754893016657384e-05, + "loss": 0.8773, + "step": 5041 + }, + { + "epoch": 2.28558476881233, + "grad_norm": 0.2765060102428211, + "learning_rate": 5.274289441611469e-05, + "loss": 0.8746, + "step": 5042 + }, + { + "epoch": 2.286038077969175, + "grad_norm": 0.3069440243159633, + "learning_rate": 5.2730894539333664e-05, + "loss": 0.8835, + "step": 5043 + }, + { + "epoch": 2.28649138712602, + "grad_norm": 0.36096209305210125, + "learning_rate": 5.271889338751614e-05, + "loss": 0.8646, + "step": 5044 + }, + { + "epoch": 2.286944696282865, + "grad_norm": 0.32083005698980943, + "learning_rate": 5.270689096186406e-05, + "loss": 0.8742, + "step": 5045 + }, + { + "epoch": 2.28739800543971, + "grad_norm": 0.3637239871108701, + "learning_rate": 5.269488726357951e-05, + "loss": 0.8841, + "step": 5046 + }, + { + "epoch": 2.287851314596555, + "grad_norm": 0.36993684467670773, + "learning_rate": 5.268288229386468e-05, + "loss": 0.8727, + "step": 5047 + }, + { + "epoch": 2.2883046237533997, + "grad_norm": 0.37134007578105926, + "learning_rate": 5.267087605392192e-05, + "loss": 0.8551, + "step": 5048 + }, + { + "epoch": 2.2887579329102445, + "grad_norm": 0.41088766816396255, + "learning_rate": 5.265886854495368e-05, + "loss": 0.8896, + "step": 5049 + }, + { + "epoch": 2.28921124206709, + "grad_norm": 0.4915116763631163, + "learning_rate": 5.264685976816256e-05, + "loss": 0.8777, + "step": 5050 + }, + { + "epoch": 2.2896645512239346, + "grad_norm": 0.5438113142793051, + "learning_rate": 5.263484972475126e-05, + "loss": 0.8719, + "step": 5051 + }, + { + "epoch": 2.2901178603807795, + "grad_norm": 0.5915494467176431, + "learning_rate": 5.262283841592263e-05, + "loss": 0.8604, + "step": 5052 + }, + { + "epoch": 2.2905711695376247, + "grad_norm": 0.5201504212570348, + "learning_rate": 5.2610825842879636e-05, + "loss": 0.8827, + "step": 5053 + }, + { + "epoch": 2.2910244786944696, + "grad_norm": 0.41428905958052575, + "learning_rate": 5.2598812006825366e-05, + "loss": 0.8794, + "step": 5054 + }, + { + "epoch": 2.2914777878513144, + "grad_norm": 0.3369609611247756, + "learning_rate": 5.2586796908963055e-05, + "loss": 0.8816, + "step": 5055 + }, + { + "epoch": 2.2919310970081597, + "grad_norm": 0.36689500741809444, + "learning_rate": 5.257478055049603e-05, + "loss": 0.8735, + "step": 5056 + }, + { + "epoch": 2.2923844061650045, + "grad_norm": 0.38848821142971757, + "learning_rate": 5.256276293262779e-05, + "loss": 0.8839, + "step": 5057 + }, + { + "epoch": 2.2928377153218493, + "grad_norm": 0.3313864539112403, + "learning_rate": 5.2550744056561904e-05, + "loss": 0.8573, + "step": 5058 + }, + { + "epoch": 2.2932910244786946, + "grad_norm": 0.2910929483517332, + "learning_rate": 5.253872392350213e-05, + "loss": 0.8861, + "step": 5059 + }, + { + "epoch": 2.2937443336355394, + "grad_norm": 0.34233108957687525, + "learning_rate": 5.25267025346523e-05, + "loss": 0.8938, + "step": 5060 + }, + { + "epoch": 2.2941976427923843, + "grad_norm": 0.3713730167492331, + "learning_rate": 5.2514679891216384e-05, + "loss": 0.8771, + "step": 5061 + }, + { + "epoch": 2.2946509519492295, + "grad_norm": 2.9756974546301507, + "learning_rate": 5.250265599439851e-05, + "loss": 0.8766, + "step": 5062 + }, + { + "epoch": 2.2951042611060744, + "grad_norm": 0.6053003089756437, + "learning_rate": 5.249063084540289e-05, + "loss": 0.8586, + "step": 5063 + }, + { + "epoch": 2.295557570262919, + "grad_norm": 1.0882111775977772, + "learning_rate": 5.2478604445433885e-05, + "loss": 0.8959, + "step": 5064 + }, + { + "epoch": 2.2960108794197644, + "grad_norm": 0.9063930065201687, + "learning_rate": 5.2466576795695965e-05, + "loss": 0.9087, + "step": 5065 + }, + { + "epoch": 2.2964641885766093, + "grad_norm": 0.5416927118470921, + "learning_rate": 5.2454547897393744e-05, + "loss": 0.8799, + "step": 5066 + }, + { + "epoch": 2.296917497733454, + "grad_norm": 0.6762791473839218, + "learning_rate": 5.244251775173195e-05, + "loss": 0.8739, + "step": 5067 + }, + { + "epoch": 2.2973708068902994, + "grad_norm": 0.9120377735183902, + "learning_rate": 5.243048635991544e-05, + "loss": 0.8791, + "step": 5068 + }, + { + "epoch": 2.297824116047144, + "grad_norm": 0.7657659791905322, + "learning_rate": 5.241845372314918e-05, + "loss": 0.8824, + "step": 5069 + }, + { + "epoch": 2.298277425203989, + "grad_norm": 0.4942964214378502, + "learning_rate": 5.240641984263828e-05, + "loss": 0.875, + "step": 5070 + }, + { + "epoch": 2.2987307343608343, + "grad_norm": 0.5268891362544752, + "learning_rate": 5.2394384719587974e-05, + "loss": 0.8812, + "step": 5071 + }, + { + "epoch": 2.299184043517679, + "grad_norm": 0.6815317915949027, + "learning_rate": 5.238234835520361e-05, + "loss": 0.8841, + "step": 5072 + }, + { + "epoch": 2.299637352674524, + "grad_norm": 0.653605196429628, + "learning_rate": 5.237031075069068e-05, + "loss": 0.8907, + "step": 5073 + }, + { + "epoch": 2.3000906618313692, + "grad_norm": 0.4980315299456739, + "learning_rate": 5.235827190725475e-05, + "loss": 0.8832, + "step": 5074 + }, + { + "epoch": 2.300543970988214, + "grad_norm": 0.5317021086097149, + "learning_rate": 5.2346231826101586e-05, + "loss": 0.8631, + "step": 5075 + }, + { + "epoch": 2.300997280145059, + "grad_norm": 0.5038684109979833, + "learning_rate": 5.2334190508437006e-05, + "loss": 0.8812, + "step": 5076 + }, + { + "epoch": 2.3014505893019037, + "grad_norm": 0.5189642923644942, + "learning_rate": 5.232214795546701e-05, + "loss": 0.8758, + "step": 5077 + }, + { + "epoch": 2.301903898458749, + "grad_norm": 0.5718839443439196, + "learning_rate": 5.231010416839768e-05, + "loss": 0.8757, + "step": 5078 + }, + { + "epoch": 2.302357207615594, + "grad_norm": 0.5558348345422731, + "learning_rate": 5.229805914843523e-05, + "loss": 0.8766, + "step": 5079 + }, + { + "epoch": 2.3028105167724386, + "grad_norm": 0.6172327843828324, + "learning_rate": 5.2286012896786016e-05, + "loss": 0.8917, + "step": 5080 + }, + { + "epoch": 2.303263825929284, + "grad_norm": 0.7199681254847712, + "learning_rate": 5.227396541465652e-05, + "loss": 0.8741, + "step": 5081 + }, + { + "epoch": 2.3037171350861287, + "grad_norm": 0.6846495905015629, + "learning_rate": 5.22619167032533e-05, + "loss": 0.8857, + "step": 5082 + }, + { + "epoch": 2.3041704442429736, + "grad_norm": 0.5798560199356722, + "learning_rate": 5.224986676378309e-05, + "loss": 0.8861, + "step": 5083 + }, + { + "epoch": 2.304623753399819, + "grad_norm": 0.46111812129050866, + "learning_rate": 5.2237815597452715e-05, + "loss": 0.8653, + "step": 5084 + }, + { + "epoch": 2.3050770625566637, + "grad_norm": 0.3909274590536595, + "learning_rate": 5.222576320546914e-05, + "loss": 0.8861, + "step": 5085 + }, + { + "epoch": 2.3055303717135085, + "grad_norm": 0.2876745681122052, + "learning_rate": 5.2213709589039453e-05, + "loss": 0.8916, + "step": 5086 + }, + { + "epoch": 2.3059836808703538, + "grad_norm": 0.33634083865989295, + "learning_rate": 5.2201654749370854e-05, + "loss": 0.8804, + "step": 5087 + }, + { + "epoch": 2.3064369900271986, + "grad_norm": 0.3899474401517561, + "learning_rate": 5.218959868767068e-05, + "loss": 0.8913, + "step": 5088 + }, + { + "epoch": 2.3068902991840434, + "grad_norm": 0.4146770781301267, + "learning_rate": 5.217754140514636e-05, + "loss": 0.8573, + "step": 5089 + }, + { + "epoch": 2.3073436083408883, + "grad_norm": 0.474554635414761, + "learning_rate": 5.216548290300548e-05, + "loss": 0.8843, + "step": 5090 + }, + { + "epoch": 2.3077969174977335, + "grad_norm": 0.49725443322927454, + "learning_rate": 5.215342318245574e-05, + "loss": 0.8905, + "step": 5091 + }, + { + "epoch": 2.3082502266545784, + "grad_norm": 0.47623548271031363, + "learning_rate": 5.214136224470495e-05, + "loss": 0.8848, + "step": 5092 + }, + { + "epoch": 2.308703535811423, + "grad_norm": 0.4500851399358742, + "learning_rate": 5.2129300090961055e-05, + "loss": 0.8852, + "step": 5093 + }, + { + "epoch": 2.3091568449682685, + "grad_norm": 0.5339941243214759, + "learning_rate": 5.21172367224321e-05, + "loss": 0.8705, + "step": 5094 + }, + { + "epoch": 2.3096101541251133, + "grad_norm": 0.5258800180250066, + "learning_rate": 5.210517214032627e-05, + "loss": 0.8714, + "step": 5095 + }, + { + "epoch": 2.310063463281958, + "grad_norm": 0.5427709513100741, + "learning_rate": 5.209310634585189e-05, + "loss": 0.8841, + "step": 5096 + }, + { + "epoch": 2.3105167724388034, + "grad_norm": 0.5748053894364157, + "learning_rate": 5.208103934021736e-05, + "loss": 0.8724, + "step": 5097 + }, + { + "epoch": 2.310970081595648, + "grad_norm": 0.5421089702955039, + "learning_rate": 5.2068971124631235e-05, + "loss": 0.8944, + "step": 5098 + }, + { + "epoch": 2.311423390752493, + "grad_norm": 0.45997215452241474, + "learning_rate": 5.205690170030218e-05, + "loss": 0.8936, + "step": 5099 + }, + { + "epoch": 2.3118766999093383, + "grad_norm": 0.40936236295663997, + "learning_rate": 5.204483106843897e-05, + "loss": 0.8723, + "step": 5100 + }, + { + "epoch": 2.312330009066183, + "grad_norm": 0.2949387748270129, + "learning_rate": 5.203275923025055e-05, + "loss": 0.8671, + "step": 5101 + }, + { + "epoch": 2.312783318223028, + "grad_norm": 0.3094812912814581, + "learning_rate": 5.2020686186945935e-05, + "loss": 0.8694, + "step": 5102 + }, + { + "epoch": 2.3132366273798732, + "grad_norm": 0.353427943237671, + "learning_rate": 5.200861193973426e-05, + "loss": 0.8669, + "step": 5103 + }, + { + "epoch": 2.313689936536718, + "grad_norm": 0.3488729459474434, + "learning_rate": 5.19965364898248e-05, + "loss": 0.8737, + "step": 5104 + }, + { + "epoch": 2.314143245693563, + "grad_norm": 0.4445600216545959, + "learning_rate": 5.1984459838426955e-05, + "loss": 0.8708, + "step": 5105 + }, + { + "epoch": 2.314596554850408, + "grad_norm": 0.5475516803230025, + "learning_rate": 5.197238198675024e-05, + "loss": 0.8804, + "step": 5106 + }, + { + "epoch": 2.315049864007253, + "grad_norm": 0.65000068529758, + "learning_rate": 5.196030293600428e-05, + "loss": 0.8956, + "step": 5107 + }, + { + "epoch": 2.315503173164098, + "grad_norm": 0.6362544307959659, + "learning_rate": 5.194822268739883e-05, + "loss": 0.87, + "step": 5108 + }, + { + "epoch": 2.315956482320943, + "grad_norm": 0.6231525014516497, + "learning_rate": 5.193614124214377e-05, + "loss": 0.896, + "step": 5109 + }, + { + "epoch": 2.316409791477788, + "grad_norm": 0.6114404275123781, + "learning_rate": 5.1924058601449085e-05, + "loss": 0.8542, + "step": 5110 + }, + { + "epoch": 2.3168631006346327, + "grad_norm": 0.6303350757393784, + "learning_rate": 5.1911974766524874e-05, + "loss": 0.8947, + "step": 5111 + }, + { + "epoch": 2.317316409791478, + "grad_norm": 0.6551065531583496, + "learning_rate": 5.18998897385814e-05, + "loss": 0.8712, + "step": 5112 + }, + { + "epoch": 2.317769718948323, + "grad_norm": 0.645933511632691, + "learning_rate": 5.188780351882899e-05, + "loss": 0.8861, + "step": 5113 + }, + { + "epoch": 2.3182230281051677, + "grad_norm": 0.5406105309882809, + "learning_rate": 5.187571610847811e-05, + "loss": 0.8948, + "step": 5114 + }, + { + "epoch": 2.3186763372620125, + "grad_norm": 0.5052709117402752, + "learning_rate": 5.186362750873937e-05, + "loss": 0.8677, + "step": 5115 + }, + { + "epoch": 2.3191296464188578, + "grad_norm": 0.5146110398280785, + "learning_rate": 5.185153772082346e-05, + "loss": 0.8694, + "step": 5116 + }, + { + "epoch": 2.3195829555757026, + "grad_norm": 0.5324263665621561, + "learning_rate": 5.1839446745941215e-05, + "loss": 0.892, + "step": 5117 + }, + { + "epoch": 2.3200362647325474, + "grad_norm": 0.6288912165051124, + "learning_rate": 5.182735458530358e-05, + "loss": 0.8761, + "step": 5118 + }, + { + "epoch": 2.3204895738893927, + "grad_norm": 0.7236764553421411, + "learning_rate": 5.1815261240121635e-05, + "loss": 0.8833, + "step": 5119 + }, + { + "epoch": 2.3209428830462375, + "grad_norm": 0.6400096169365467, + "learning_rate": 5.180316671160655e-05, + "loss": 0.8816, + "step": 5120 + }, + { + "epoch": 2.3213961922030824, + "grad_norm": 0.5107271702266878, + "learning_rate": 5.179107100096963e-05, + "loss": 0.8599, + "step": 5121 + }, + { + "epoch": 2.3218495013599276, + "grad_norm": 0.4211335615162259, + "learning_rate": 5.17789741094223e-05, + "loss": 0.8912, + "step": 5122 + }, + { + "epoch": 2.3223028105167725, + "grad_norm": 0.3932709132601649, + "learning_rate": 5.176687603817608e-05, + "loss": 0.8911, + "step": 5123 + }, + { + "epoch": 2.3227561196736173, + "grad_norm": 0.33797087508426954, + "learning_rate": 5.1754776788442664e-05, + "loss": 0.8881, + "step": 5124 + }, + { + "epoch": 2.3232094288304626, + "grad_norm": 0.3478156104666955, + "learning_rate": 5.1742676361433786e-05, + "loss": 0.8736, + "step": 5125 + }, + { + "epoch": 2.3236627379873074, + "grad_norm": 0.3635186219822697, + "learning_rate": 5.173057475836137e-05, + "loss": 0.8895, + "step": 5126 + }, + { + "epoch": 2.324116047144152, + "grad_norm": 0.40953132816091764, + "learning_rate": 5.1718471980437415e-05, + "loss": 0.8846, + "step": 5127 + }, + { + "epoch": 2.324569356300997, + "grad_norm": 0.4385997826858702, + "learning_rate": 5.170636802887405e-05, + "loss": 0.864, + "step": 5128 + }, + { + "epoch": 2.3250226654578423, + "grad_norm": 0.47642934188993197, + "learning_rate": 5.1694262904883515e-05, + "loss": 0.8596, + "step": 5129 + }, + { + "epoch": 2.325475974614687, + "grad_norm": 0.46975218129461566, + "learning_rate": 5.1682156609678185e-05, + "loss": 0.8916, + "step": 5130 + }, + { + "epoch": 2.325929283771532, + "grad_norm": 0.47533295385521745, + "learning_rate": 5.1670049144470555e-05, + "loss": 0.8687, + "step": 5131 + }, + { + "epoch": 2.3263825929283772, + "grad_norm": 0.4865176149076515, + "learning_rate": 5.165794051047319e-05, + "loss": 0.8854, + "step": 5132 + }, + { + "epoch": 2.326835902085222, + "grad_norm": 0.4834853447185496, + "learning_rate": 5.1645830708898816e-05, + "loss": 0.8787, + "step": 5133 + }, + { + "epoch": 2.327289211242067, + "grad_norm": 0.42230987505976336, + "learning_rate": 5.163371974096028e-05, + "loss": 0.8871, + "step": 5134 + }, + { + "epoch": 2.327742520398912, + "grad_norm": 0.3962770133765391, + "learning_rate": 5.162160760787052e-05, + "loss": 0.861, + "step": 5135 + }, + { + "epoch": 2.328195829555757, + "grad_norm": 0.3530624325231341, + "learning_rate": 5.160949431084259e-05, + "loss": 0.8841, + "step": 5136 + }, + { + "epoch": 2.328649138712602, + "grad_norm": 0.28917553990583844, + "learning_rate": 5.1597379851089704e-05, + "loss": 0.8814, + "step": 5137 + }, + { + "epoch": 2.329102447869447, + "grad_norm": 0.3328333872464081, + "learning_rate": 5.1585264229825135e-05, + "loss": 0.8789, + "step": 5138 + }, + { + "epoch": 2.329555757026292, + "grad_norm": 0.49927567831106734, + "learning_rate": 5.15731474482623e-05, + "loss": 0.8759, + "step": 5139 + }, + { + "epoch": 2.3300090661831367, + "grad_norm": 0.6988144584216138, + "learning_rate": 5.156102950761474e-05, + "loss": 0.8844, + "step": 5140 + }, + { + "epoch": 2.330462375339982, + "grad_norm": 0.8311330340433402, + "learning_rate": 5.1548910409096095e-05, + "loss": 0.8783, + "step": 5141 + }, + { + "epoch": 2.330915684496827, + "grad_norm": 0.9418602791887024, + "learning_rate": 5.153679015392014e-05, + "loss": 0.9004, + "step": 5142 + }, + { + "epoch": 2.3313689936536717, + "grad_norm": 0.9626952103987497, + "learning_rate": 5.152466874330073e-05, + "loss": 0.8582, + "step": 5143 + }, + { + "epoch": 2.331822302810517, + "grad_norm": 0.9246694334593211, + "learning_rate": 5.151254617845188e-05, + "loss": 0.872, + "step": 5144 + }, + { + "epoch": 2.3322756119673618, + "grad_norm": 0.7778751756702601, + "learning_rate": 5.1500422460587697e-05, + "loss": 0.8928, + "step": 5145 + }, + { + "epoch": 2.3327289211242066, + "grad_norm": 0.548236573353536, + "learning_rate": 5.14882975909224e-05, + "loss": 0.8662, + "step": 5146 + }, + { + "epoch": 2.333182230281052, + "grad_norm": 0.32170145827036517, + "learning_rate": 5.1476171570670336e-05, + "loss": 0.8834, + "step": 5147 + }, + { + "epoch": 2.3336355394378967, + "grad_norm": 0.3850827734122276, + "learning_rate": 5.146404440104597e-05, + "loss": 0.8591, + "step": 5148 + }, + { + "epoch": 2.3340888485947415, + "grad_norm": 0.5410559610502989, + "learning_rate": 5.1451916083263865e-05, + "loss": 0.8655, + "step": 5149 + }, + { + "epoch": 2.334542157751587, + "grad_norm": 0.6074854269078052, + "learning_rate": 5.143978661853871e-05, + "loss": 0.8777, + "step": 5150 + }, + { + "epoch": 2.3349954669084316, + "grad_norm": 0.6189349716186807, + "learning_rate": 5.142765600808529e-05, + "loss": 0.8807, + "step": 5151 + }, + { + "epoch": 2.3354487760652765, + "grad_norm": 0.5712440914220623, + "learning_rate": 5.141552425311855e-05, + "loss": 0.8885, + "step": 5152 + }, + { + "epoch": 2.3359020852221217, + "grad_norm": 0.5784284393740415, + "learning_rate": 5.14033913548535e-05, + "loss": 0.8669, + "step": 5153 + }, + { + "epoch": 2.3363553943789666, + "grad_norm": 0.45527948857784417, + "learning_rate": 5.13912573145053e-05, + "loss": 0.8853, + "step": 5154 + }, + { + "epoch": 2.3368087035358114, + "grad_norm": 0.39111925396151764, + "learning_rate": 5.13791221332892e-05, + "loss": 0.8728, + "step": 5155 + }, + { + "epoch": 2.337262012692656, + "grad_norm": 0.4440045292547264, + "learning_rate": 5.136698581242057e-05, + "loss": 0.8754, + "step": 5156 + }, + { + "epoch": 2.3377153218495015, + "grad_norm": 0.4638442414683705, + "learning_rate": 5.13548483531149e-05, + "loss": 0.88, + "step": 5157 + }, + { + "epoch": 2.3381686310063463, + "grad_norm": 0.38543710349177207, + "learning_rate": 5.1342709756587796e-05, + "loss": 0.8807, + "step": 5158 + }, + { + "epoch": 2.338621940163191, + "grad_norm": 0.49678034230445406, + "learning_rate": 5.1330570024054974e-05, + "loss": 0.8739, + "step": 5159 + }, + { + "epoch": 2.3390752493200364, + "grad_norm": 0.4058340490379552, + "learning_rate": 5.131842915673227e-05, + "loss": 0.8647, + "step": 5160 + }, + { + "epoch": 2.3395285584768812, + "grad_norm": 0.34178387909876046, + "learning_rate": 5.130628715583562e-05, + "loss": 0.8738, + "step": 5161 + }, + { + "epoch": 2.339981867633726, + "grad_norm": 0.3707906355106236, + "learning_rate": 5.1294144022581086e-05, + "loss": 0.8777, + "step": 5162 + }, + { + "epoch": 2.3404351767905713, + "grad_norm": 0.3807551822305169, + "learning_rate": 5.128199975818483e-05, + "loss": 0.8721, + "step": 5163 + }, + { + "epoch": 2.340888485947416, + "grad_norm": 0.3483651911765005, + "learning_rate": 5.1269854363863144e-05, + "loss": 0.8693, + "step": 5164 + }, + { + "epoch": 2.341341795104261, + "grad_norm": 0.39384368360741284, + "learning_rate": 5.1257707840832423e-05, + "loss": 0.895, + "step": 5165 + }, + { + "epoch": 2.3417951042611063, + "grad_norm": 0.4293949159959352, + "learning_rate": 5.124556019030918e-05, + "loss": 0.8728, + "step": 5166 + }, + { + "epoch": 2.342248413417951, + "grad_norm": 0.38168999328726183, + "learning_rate": 5.1233411413510026e-05, + "loss": 0.8757, + "step": 5167 + }, + { + "epoch": 2.342701722574796, + "grad_norm": 0.29537106550168524, + "learning_rate": 5.1221261511651706e-05, + "loss": 0.8744, + "step": 5168 + }, + { + "epoch": 2.3431550317316407, + "grad_norm": 0.4216072083021824, + "learning_rate": 5.1209110485951074e-05, + "loss": 0.8923, + "step": 5169 + }, + { + "epoch": 2.343608340888486, + "grad_norm": 0.5193534376853847, + "learning_rate": 5.119695833762507e-05, + "loss": 0.8884, + "step": 5170 + }, + { + "epoch": 2.344061650045331, + "grad_norm": 0.4611016827861858, + "learning_rate": 5.1184805067890794e-05, + "loss": 0.873, + "step": 5171 + }, + { + "epoch": 2.3445149592021757, + "grad_norm": 0.5828022923578471, + "learning_rate": 5.117265067796541e-05, + "loss": 0.8844, + "step": 5172 + }, + { + "epoch": 2.344968268359021, + "grad_norm": 0.4337649423876924, + "learning_rate": 5.116049516906623e-05, + "loss": 0.8965, + "step": 5173 + }, + { + "epoch": 2.3454215775158658, + "grad_norm": 0.4968868377527834, + "learning_rate": 5.1148338542410657e-05, + "loss": 0.8839, + "step": 5174 + }, + { + "epoch": 2.3458748866727106, + "grad_norm": 0.4889565559009078, + "learning_rate": 5.1136180799216215e-05, + "loss": 0.8641, + "step": 5175 + }, + { + "epoch": 2.346328195829556, + "grad_norm": 0.4204921613224516, + "learning_rate": 5.112402194070053e-05, + "loss": 0.8945, + "step": 5176 + }, + { + "epoch": 2.3467815049864007, + "grad_norm": 0.34215359990741845, + "learning_rate": 5.111186196808136e-05, + "loss": 0.8677, + "step": 5177 + }, + { + "epoch": 2.3472348141432455, + "grad_norm": 0.3368382671841561, + "learning_rate": 5.1099700882576565e-05, + "loss": 0.8878, + "step": 5178 + }, + { + "epoch": 2.347688123300091, + "grad_norm": 0.2780009359193906, + "learning_rate": 5.10875386854041e-05, + "loss": 0.8943, + "step": 5179 + }, + { + "epoch": 2.3481414324569356, + "grad_norm": 0.3051221880809021, + "learning_rate": 5.1075375377782055e-05, + "loss": 0.8801, + "step": 5180 + }, + { + "epoch": 2.3485947416137805, + "grad_norm": 0.45229058526226484, + "learning_rate": 5.106321096092861e-05, + "loss": 0.8856, + "step": 5181 + }, + { + "epoch": 2.3490480507706257, + "grad_norm": 0.5533581789850144, + "learning_rate": 5.1051045436062074e-05, + "loss": 0.8851, + "step": 5182 + }, + { + "epoch": 2.3495013599274706, + "grad_norm": 0.5117937693224082, + "learning_rate": 5.103887880440086e-05, + "loss": 0.8805, + "step": 5183 + }, + { + "epoch": 2.3499546690843154, + "grad_norm": 0.44818790625902055, + "learning_rate": 5.102671106716348e-05, + "loss": 0.8866, + "step": 5184 + }, + { + "epoch": 2.3504079782411607, + "grad_norm": 0.4310051211762868, + "learning_rate": 5.101454222556859e-05, + "loss": 0.8838, + "step": 5185 + }, + { + "epoch": 2.3508612873980055, + "grad_norm": 0.4471349314639501, + "learning_rate": 5.100237228083492e-05, + "loss": 0.8713, + "step": 5186 + }, + { + "epoch": 2.3513145965548503, + "grad_norm": 0.5237026777332181, + "learning_rate": 5.0990201234181335e-05, + "loss": 0.878, + "step": 5187 + }, + { + "epoch": 2.3517679057116956, + "grad_norm": 0.5200550180688442, + "learning_rate": 5.09780290868268e-05, + "loss": 0.8679, + "step": 5188 + }, + { + "epoch": 2.3522212148685404, + "grad_norm": 0.46583372339634277, + "learning_rate": 5.096585583999039e-05, + "loss": 0.8733, + "step": 5189 + }, + { + "epoch": 2.3526745240253852, + "grad_norm": 0.4055339553221472, + "learning_rate": 5.095368149489128e-05, + "loss": 0.8642, + "step": 5190 + }, + { + "epoch": 2.3531278331822305, + "grad_norm": 0.3585713840195202, + "learning_rate": 5.094150605274878e-05, + "loss": 0.8876, + "step": 5191 + }, + { + "epoch": 2.3535811423390753, + "grad_norm": 0.33050464454301565, + "learning_rate": 5.0929329514782295e-05, + "loss": 0.8908, + "step": 5192 + }, + { + "epoch": 2.35403445149592, + "grad_norm": 0.3197175272778693, + "learning_rate": 5.091715188221133e-05, + "loss": 0.8936, + "step": 5193 + }, + { + "epoch": 2.354487760652765, + "grad_norm": 0.27084833017018656, + "learning_rate": 5.0904973156255536e-05, + "loss": 0.8747, + "step": 5194 + }, + { + "epoch": 2.3549410698096103, + "grad_norm": 0.33591749226373097, + "learning_rate": 5.089279333813461e-05, + "loss": 0.8761, + "step": 5195 + }, + { + "epoch": 2.355394378966455, + "grad_norm": 0.3053155144912048, + "learning_rate": 5.088061242906843e-05, + "loss": 0.8687, + "step": 5196 + }, + { + "epoch": 2.3558476881233, + "grad_norm": 0.34539287136017316, + "learning_rate": 5.086843043027694e-05, + "loss": 0.9042, + "step": 5197 + }, + { + "epoch": 2.356300997280145, + "grad_norm": 0.4315260495461991, + "learning_rate": 5.085624734298019e-05, + "loss": 0.8851, + "step": 5198 + }, + { + "epoch": 2.35675430643699, + "grad_norm": 0.4686442389682185, + "learning_rate": 5.084406316839838e-05, + "loss": 0.8601, + "step": 5199 + }, + { + "epoch": 2.357207615593835, + "grad_norm": 0.4239209745507728, + "learning_rate": 5.083187790775176e-05, + "loss": 0.8835, + "step": 5200 + }, + { + "epoch": 2.35766092475068, + "grad_norm": 0.36863686272694257, + "learning_rate": 5.0819691562260734e-05, + "loss": 0.8918, + "step": 5201 + }, + { + "epoch": 2.358114233907525, + "grad_norm": 0.36043656869160007, + "learning_rate": 5.0807504133145795e-05, + "loss": 0.8781, + "step": 5202 + }, + { + "epoch": 2.3585675430643698, + "grad_norm": 0.402662927846602, + "learning_rate": 5.079531562162756e-05, + "loss": 0.8551, + "step": 5203 + }, + { + "epoch": 2.359020852221215, + "grad_norm": 0.4092363468730264, + "learning_rate": 5.078312602892674e-05, + "loss": 0.8955, + "step": 5204 + }, + { + "epoch": 2.35947416137806, + "grad_norm": 0.37563392518883887, + "learning_rate": 5.077093535626415e-05, + "loss": 0.8733, + "step": 5205 + }, + { + "epoch": 2.3599274705349047, + "grad_norm": 0.37605205987525747, + "learning_rate": 5.075874360486074e-05, + "loss": 0.865, + "step": 5206 + }, + { + "epoch": 2.3603807796917495, + "grad_norm": 0.42779386906916983, + "learning_rate": 5.074655077593754e-05, + "loss": 0.8783, + "step": 5207 + }, + { + "epoch": 2.360834088848595, + "grad_norm": 0.46620694749756686, + "learning_rate": 5.073435687071569e-05, + "loss": 0.8762, + "step": 5208 + }, + { + "epoch": 2.3612873980054396, + "grad_norm": 0.4849549343669799, + "learning_rate": 5.072216189041646e-05, + "loss": 0.8794, + "step": 5209 + }, + { + "epoch": 2.3617407071622845, + "grad_norm": 0.49769038971729934, + "learning_rate": 5.0709965836261194e-05, + "loss": 0.8787, + "step": 5210 + }, + { + "epoch": 2.3621940163191297, + "grad_norm": 0.48215087041434856, + "learning_rate": 5.069776870947138e-05, + "loss": 0.8837, + "step": 5211 + }, + { + "epoch": 2.3626473254759746, + "grad_norm": 0.5942078317622705, + "learning_rate": 5.068557051126858e-05, + "loss": 0.8686, + "step": 5212 + }, + { + "epoch": 2.3631006346328194, + "grad_norm": 0.6220052861444667, + "learning_rate": 5.0673371242874496e-05, + "loss": 0.8872, + "step": 5213 + }, + { + "epoch": 2.3635539437896647, + "grad_norm": 0.6278313720819784, + "learning_rate": 5.066117090551091e-05, + "loss": 0.8743, + "step": 5214 + }, + { + "epoch": 2.3640072529465095, + "grad_norm": 0.5902794500122057, + "learning_rate": 5.064896950039973e-05, + "loss": 0.8822, + "step": 5215 + }, + { + "epoch": 2.3644605621033543, + "grad_norm": 0.585719762936012, + "learning_rate": 5.0636767028762954e-05, + "loss": 0.8818, + "step": 5216 + }, + { + "epoch": 2.3649138712601996, + "grad_norm": 0.4576986358627108, + "learning_rate": 5.06245634918227e-05, + "loss": 0.8705, + "step": 5217 + }, + { + "epoch": 2.3653671804170444, + "grad_norm": 0.3988780038973461, + "learning_rate": 5.0612358890801195e-05, + "loss": 0.8945, + "step": 5218 + }, + { + "epoch": 2.3658204895738892, + "grad_norm": 0.35860770148192933, + "learning_rate": 5.0600153226920745e-05, + "loss": 0.8495, + "step": 5219 + }, + { + "epoch": 2.3662737987307345, + "grad_norm": 0.28636248661423525, + "learning_rate": 5.05879465014038e-05, + "loss": 0.8838, + "step": 5220 + }, + { + "epoch": 2.3667271078875793, + "grad_norm": 0.31950518535018685, + "learning_rate": 5.0575738715472896e-05, + "loss": 0.8758, + "step": 5221 + }, + { + "epoch": 2.367180417044424, + "grad_norm": 0.31813777487165557, + "learning_rate": 5.0563529870350684e-05, + "loss": 0.8812, + "step": 5222 + }, + { + "epoch": 2.3676337262012694, + "grad_norm": 0.37726297520150653, + "learning_rate": 5.0551319967259906e-05, + "loss": 0.8863, + "step": 5223 + }, + { + "epoch": 2.3680870353581143, + "grad_norm": 0.5179410195215615, + "learning_rate": 5.053910900742343e-05, + "loss": 0.8783, + "step": 5224 + }, + { + "epoch": 2.368540344514959, + "grad_norm": 0.6453536283430271, + "learning_rate": 5.05268969920642e-05, + "loss": 0.8682, + "step": 5225 + }, + { + "epoch": 2.3689936536718044, + "grad_norm": 0.7405270076768682, + "learning_rate": 5.05146839224053e-05, + "loss": 0.8826, + "step": 5226 + }, + { + "epoch": 2.369446962828649, + "grad_norm": 0.7615068210509947, + "learning_rate": 5.050246979966991e-05, + "loss": 0.9024, + "step": 5227 + }, + { + "epoch": 2.369900271985494, + "grad_norm": 0.6572450131157982, + "learning_rate": 5.04902546250813e-05, + "loss": 0.865, + "step": 5228 + }, + { + "epoch": 2.3703535811423393, + "grad_norm": 0.549040017268499, + "learning_rate": 5.0478038399862866e-05, + "loss": 0.9026, + "step": 5229 + }, + { + "epoch": 2.370806890299184, + "grad_norm": 0.4202395326512334, + "learning_rate": 5.046582112523808e-05, + "loss": 0.8717, + "step": 5230 + }, + { + "epoch": 2.371260199456029, + "grad_norm": 0.39389661606813775, + "learning_rate": 5.045360280243056e-05, + "loss": 0.8662, + "step": 5231 + }, + { + "epoch": 2.371713508612874, + "grad_norm": 0.47236877116229703, + "learning_rate": 5.044138343266399e-05, + "loss": 0.8905, + "step": 5232 + }, + { + "epoch": 2.372166817769719, + "grad_norm": 0.5389172595883874, + "learning_rate": 5.0429163017162175e-05, + "loss": 0.8921, + "step": 5233 + }, + { + "epoch": 2.372620126926564, + "grad_norm": 0.7351333476778535, + "learning_rate": 5.041694155714904e-05, + "loss": 0.8802, + "step": 5234 + }, + { + "epoch": 2.3730734360834087, + "grad_norm": 0.8304937984465337, + "learning_rate": 5.040471905384861e-05, + "loss": 0.8817, + "step": 5235 + }, + { + "epoch": 2.373526745240254, + "grad_norm": 0.8121675164533314, + "learning_rate": 5.0392495508484966e-05, + "loss": 0.8856, + "step": 5236 + }, + { + "epoch": 2.373980054397099, + "grad_norm": 0.750201924191632, + "learning_rate": 5.038027092228237e-05, + "loss": 0.8871, + "step": 5237 + }, + { + "epoch": 2.3744333635539436, + "grad_norm": 0.7492833077704567, + "learning_rate": 5.036804529646513e-05, + "loss": 0.8758, + "step": 5238 + }, + { + "epoch": 2.374886672710789, + "grad_norm": 0.6656684449444542, + "learning_rate": 5.035581863225767e-05, + "loss": 0.8764, + "step": 5239 + }, + { + "epoch": 2.3753399818676337, + "grad_norm": 0.43498898544110703, + "learning_rate": 5.034359093088455e-05, + "loss": 0.8588, + "step": 5240 + }, + { + "epoch": 2.3757932910244786, + "grad_norm": 0.343128537314031, + "learning_rate": 5.033136219357039e-05, + "loss": 0.8787, + "step": 5241 + }, + { + "epoch": 2.376246600181324, + "grad_norm": 0.4291740883524962, + "learning_rate": 5.031913242153995e-05, + "loss": 0.8882, + "step": 5242 + }, + { + "epoch": 2.3766999093381687, + "grad_norm": 0.4324491738175963, + "learning_rate": 5.030690161601807e-05, + "loss": 0.8839, + "step": 5243 + }, + { + "epoch": 2.3771532184950135, + "grad_norm": 0.48980122881350835, + "learning_rate": 5.02946697782297e-05, + "loss": 0.8925, + "step": 5244 + }, + { + "epoch": 2.3776065276518588, + "grad_norm": 0.5409310846802745, + "learning_rate": 5.02824369093999e-05, + "loss": 0.8954, + "step": 5245 + }, + { + "epoch": 2.3780598368087036, + "grad_norm": 0.579761531181306, + "learning_rate": 5.027020301075382e-05, + "loss": 0.8856, + "step": 5246 + }, + { + "epoch": 2.3785131459655484, + "grad_norm": 0.6588997067023833, + "learning_rate": 5.0257968083516735e-05, + "loss": 0.858, + "step": 5247 + }, + { + "epoch": 2.3789664551223932, + "grad_norm": 0.5986181971023108, + "learning_rate": 5.024573212891398e-05, + "loss": 0.8839, + "step": 5248 + }, + { + "epoch": 2.3794197642792385, + "grad_norm": 0.46441796761461607, + "learning_rate": 5.023349514817105e-05, + "loss": 0.8831, + "step": 5249 + }, + { + "epoch": 2.3798730734360833, + "grad_norm": 0.3809239040926077, + "learning_rate": 5.022125714251352e-05, + "loss": 0.8824, + "step": 5250 + }, + { + "epoch": 2.380326382592928, + "grad_norm": 0.3373230625232598, + "learning_rate": 5.020901811316703e-05, + "loss": 0.8646, + "step": 5251 + }, + { + "epoch": 2.3807796917497734, + "grad_norm": 0.3937276361406932, + "learning_rate": 5.019677806135738e-05, + "loss": 0.8742, + "step": 5252 + }, + { + "epoch": 2.3812330009066183, + "grad_norm": 0.4161859671329133, + "learning_rate": 5.018453698831043e-05, + "loss": 0.8858, + "step": 5253 + }, + { + "epoch": 2.381686310063463, + "grad_norm": 0.40599482548173077, + "learning_rate": 5.017229489525218e-05, + "loss": 0.8753, + "step": 5254 + }, + { + "epoch": 2.3821396192203084, + "grad_norm": 0.4474202559210028, + "learning_rate": 5.0160051783408684e-05, + "loss": 0.8952, + "step": 5255 + }, + { + "epoch": 2.382592928377153, + "grad_norm": 0.5674266096556327, + "learning_rate": 5.014780765400616e-05, + "loss": 0.8689, + "step": 5256 + }, + { + "epoch": 2.383046237533998, + "grad_norm": 0.6500478533095756, + "learning_rate": 5.013556250827087e-05, + "loss": 0.8757, + "step": 5257 + }, + { + "epoch": 2.3834995466908433, + "grad_norm": 0.599929948729522, + "learning_rate": 5.0123316347429204e-05, + "loss": 0.8733, + "step": 5258 + }, + { + "epoch": 2.383952855847688, + "grad_norm": 0.4745172132030613, + "learning_rate": 5.011106917270766e-05, + "loss": 0.8928, + "step": 5259 + }, + { + "epoch": 2.384406165004533, + "grad_norm": 0.3359975578868226, + "learning_rate": 5.009882098533281e-05, + "loss": 0.8847, + "step": 5260 + }, + { + "epoch": 2.3848594741613782, + "grad_norm": 0.31876293983347137, + "learning_rate": 5.0086571786531366e-05, + "loss": 0.8823, + "step": 5261 + }, + { + "epoch": 2.385312783318223, + "grad_norm": 0.30373970289921975, + "learning_rate": 5.007432157753011e-05, + "loss": 0.8918, + "step": 5262 + }, + { + "epoch": 2.385766092475068, + "grad_norm": 0.40213496669218873, + "learning_rate": 5.006207035955595e-05, + "loss": 0.8648, + "step": 5263 + }, + { + "epoch": 2.386219401631913, + "grad_norm": 0.47114201162898983, + "learning_rate": 5.004981813383587e-05, + "loss": 0.8622, + "step": 5264 + }, + { + "epoch": 2.386672710788758, + "grad_norm": 0.4773399681793328, + "learning_rate": 5.003756490159696e-05, + "loss": 0.9019, + "step": 5265 + }, + { + "epoch": 2.387126019945603, + "grad_norm": 0.4515997008883914, + "learning_rate": 5.0025310664066435e-05, + "loss": 0.8725, + "step": 5266 + }, + { + "epoch": 2.387579329102448, + "grad_norm": 0.3722445707278127, + "learning_rate": 5.0013055422471575e-05, + "loss": 0.9049, + "step": 5267 + }, + { + "epoch": 2.388032638259293, + "grad_norm": 0.2990378259026441, + "learning_rate": 5.000079917803979e-05, + "loss": 0.9035, + "step": 5268 + }, + { + "epoch": 2.3884859474161377, + "grad_norm": 0.3355148944712761, + "learning_rate": 4.998854193199857e-05, + "loss": 0.8845, + "step": 5269 + }, + { + "epoch": 2.388939256572983, + "grad_norm": 0.4064008826711339, + "learning_rate": 4.9976283685575516e-05, + "loss": 0.8843, + "step": 5270 + }, + { + "epoch": 2.389392565729828, + "grad_norm": 0.37751422665512047, + "learning_rate": 4.996402443999834e-05, + "loss": 0.8905, + "step": 5271 + }, + { + "epoch": 2.3898458748866727, + "grad_norm": 0.38138479335759873, + "learning_rate": 4.995176419649483e-05, + "loss": 0.903, + "step": 5272 + }, + { + "epoch": 2.3902991840435175, + "grad_norm": 0.5221723672300087, + "learning_rate": 4.993950295629288e-05, + "loss": 0.878, + "step": 5273 + }, + { + "epoch": 2.3907524932003628, + "grad_norm": 0.5252059246503796, + "learning_rate": 4.992724072062052e-05, + "loss": 0.8627, + "step": 5274 + }, + { + "epoch": 2.3912058023572076, + "grad_norm": 0.41054216432765633, + "learning_rate": 4.991497749070582e-05, + "loss": 0.8763, + "step": 5275 + }, + { + "epoch": 2.3916591115140524, + "grad_norm": 0.3978903455568497, + "learning_rate": 4.9902713267776965e-05, + "loss": 0.8776, + "step": 5276 + }, + { + "epoch": 2.3921124206708977, + "grad_norm": 0.39284796261897426, + "learning_rate": 4.989044805306229e-05, + "loss": 0.858, + "step": 5277 + }, + { + "epoch": 2.3925657298277425, + "grad_norm": 0.3483218676061506, + "learning_rate": 4.987818184779016e-05, + "loss": 0.8899, + "step": 5278 + }, + { + "epoch": 2.3930190389845873, + "grad_norm": 0.35165919445789395, + "learning_rate": 4.9865914653189116e-05, + "loss": 0.8849, + "step": 5279 + }, + { + "epoch": 2.3934723481414326, + "grad_norm": 0.4376935752530494, + "learning_rate": 4.98536464704877e-05, + "loss": 0.8794, + "step": 5280 + }, + { + "epoch": 2.3939256572982774, + "grad_norm": 0.418201631430149, + "learning_rate": 4.984137730091464e-05, + "loss": 0.8744, + "step": 5281 + }, + { + "epoch": 2.3943789664551223, + "grad_norm": 0.23540224254580458, + "learning_rate": 4.9829107145698706e-05, + "loss": 0.8685, + "step": 5282 + }, + { + "epoch": 2.3948322756119675, + "grad_norm": 0.3991971719998409, + "learning_rate": 4.9816836006068814e-05, + "loss": 0.8701, + "step": 5283 + }, + { + "epoch": 2.3952855847688124, + "grad_norm": 0.5188715496648005, + "learning_rate": 4.9804563883253946e-05, + "loss": 0.895, + "step": 5284 + }, + { + "epoch": 2.395738893925657, + "grad_norm": 0.4701030890855219, + "learning_rate": 4.9792290778483195e-05, + "loss": 0.8802, + "step": 5285 + }, + { + "epoch": 2.396192203082502, + "grad_norm": 0.43787442100343343, + "learning_rate": 4.978001669298573e-05, + "loss": 0.8757, + "step": 5286 + }, + { + "epoch": 2.3966455122393473, + "grad_norm": 0.45217150162321756, + "learning_rate": 4.9767741627990864e-05, + "loss": 0.8603, + "step": 5287 + }, + { + "epoch": 2.397098821396192, + "grad_norm": 0.46462237969747927, + "learning_rate": 4.975546558472795e-05, + "loss": 0.8857, + "step": 5288 + }, + { + "epoch": 2.397552130553037, + "grad_norm": 0.4682296254190874, + "learning_rate": 4.97431885644265e-05, + "loss": 0.889, + "step": 5289 + }, + { + "epoch": 2.3980054397098822, + "grad_norm": 0.43044893115556787, + "learning_rate": 4.9730910568316074e-05, + "loss": 0.8859, + "step": 5290 + }, + { + "epoch": 2.398458748866727, + "grad_norm": 0.44493871377918137, + "learning_rate": 4.9718631597626366e-05, + "loss": 0.8736, + "step": 5291 + }, + { + "epoch": 2.398912058023572, + "grad_norm": 0.45817808997378673, + "learning_rate": 4.970635165358714e-05, + "loss": 0.9001, + "step": 5292 + }, + { + "epoch": 2.399365367180417, + "grad_norm": 0.34489322238493697, + "learning_rate": 4.969407073742826e-05, + "loss": 0.8771, + "step": 5293 + }, + { + "epoch": 2.399818676337262, + "grad_norm": 0.3023256652167846, + "learning_rate": 4.968178885037972e-05, + "loss": 0.8867, + "step": 5294 + }, + { + "epoch": 2.400271985494107, + "grad_norm": 0.3253410559685095, + "learning_rate": 4.966950599367156e-05, + "loss": 0.8602, + "step": 5295 + }, + { + "epoch": 2.400725294650952, + "grad_norm": 0.36854918173878815, + "learning_rate": 4.965722216853396e-05, + "loss": 0.8857, + "step": 5296 + }, + { + "epoch": 2.401178603807797, + "grad_norm": 0.4013378516681496, + "learning_rate": 4.964493737619717e-05, + "loss": 0.8632, + "step": 5297 + }, + { + "epoch": 2.4016319129646417, + "grad_norm": 0.36633414043186185, + "learning_rate": 4.9632651617891564e-05, + "loss": 0.8735, + "step": 5298 + }, + { + "epoch": 2.402085222121487, + "grad_norm": 0.3814883268949225, + "learning_rate": 4.962036489484758e-05, + "loss": 0.8862, + "step": 5299 + }, + { + "epoch": 2.402538531278332, + "grad_norm": 0.4818615981477088, + "learning_rate": 4.9608077208295784e-05, + "loss": 0.8679, + "step": 5300 + }, + { + "epoch": 2.4029918404351767, + "grad_norm": 0.4767901520874524, + "learning_rate": 4.9595788559466825e-05, + "loss": 0.8835, + "step": 5301 + }, + { + "epoch": 2.403445149592022, + "grad_norm": 0.39935492503589937, + "learning_rate": 4.958349894959143e-05, + "loss": 0.9042, + "step": 5302 + }, + { + "epoch": 2.4038984587488668, + "grad_norm": 0.32326988315021454, + "learning_rate": 4.9571208379900456e-05, + "loss": 0.8722, + "step": 5303 + }, + { + "epoch": 2.4043517679057116, + "grad_norm": 0.23852917856534878, + "learning_rate": 4.955891685162483e-05, + "loss": 0.8726, + "step": 5304 + }, + { + "epoch": 2.404805077062557, + "grad_norm": 0.2916444459230298, + "learning_rate": 4.9546624365995594e-05, + "loss": 0.8761, + "step": 5305 + }, + { + "epoch": 2.4052583862194017, + "grad_norm": 0.34670697920610843, + "learning_rate": 4.953433092424386e-05, + "loss": 0.8583, + "step": 5306 + }, + { + "epoch": 2.4057116953762465, + "grad_norm": 0.42533635847561707, + "learning_rate": 4.952203652760088e-05, + "loss": 0.8598, + "step": 5307 + }, + { + "epoch": 2.406165004533092, + "grad_norm": 0.47718051095318076, + "learning_rate": 4.9509741177297945e-05, + "loss": 0.8785, + "step": 5308 + }, + { + "epoch": 2.4066183136899366, + "grad_norm": 0.5286572497346848, + "learning_rate": 4.949744487456649e-05, + "loss": 0.8922, + "step": 5309 + }, + { + "epoch": 2.4070716228467814, + "grad_norm": 0.559625511273886, + "learning_rate": 4.9485147620638e-05, + "loss": 0.8713, + "step": 5310 + }, + { + "epoch": 2.4075249320036267, + "grad_norm": 1.6649756358300813, + "learning_rate": 4.947284941674411e-05, + "loss": 0.9121, + "step": 5311 + }, + { + "epoch": 2.4079782411604715, + "grad_norm": 0.4087308094746959, + "learning_rate": 4.9460550264116517e-05, + "loss": 0.8632, + "step": 5312 + }, + { + "epoch": 2.4084315503173164, + "grad_norm": 0.9911140040831391, + "learning_rate": 4.944825016398701e-05, + "loss": 0.8726, + "step": 5313 + }, + { + "epoch": 2.408884859474161, + "grad_norm": 1.3141164112103585, + "learning_rate": 4.943594911758748e-05, + "loss": 0.8734, + "step": 5314 + }, + { + "epoch": 2.4093381686310065, + "grad_norm": 0.47921865905917294, + "learning_rate": 4.942364712614991e-05, + "loss": 0.8851, + "step": 5315 + }, + { + "epoch": 2.4097914777878513, + "grad_norm": 0.9008367728581655, + "learning_rate": 4.941134419090638e-05, + "loss": 0.8833, + "step": 5316 + }, + { + "epoch": 2.410244786944696, + "grad_norm": 1.5022530685709574, + "learning_rate": 4.939904031308908e-05, + "loss": 0.8905, + "step": 5317 + }, + { + "epoch": 2.4106980961015414, + "grad_norm": 0.45219173804991947, + "learning_rate": 4.938673549393026e-05, + "loss": 0.8647, + "step": 5318 + }, + { + "epoch": 2.4111514052583862, + "grad_norm": 1.5190519457857659, + "learning_rate": 4.9374429734662296e-05, + "loss": 0.8789, + "step": 5319 + }, + { + "epoch": 2.411604714415231, + "grad_norm": 0.642820454199188, + "learning_rate": 4.936212303651766e-05, + "loss": 0.8785, + "step": 5320 + }, + { + "epoch": 2.4120580235720763, + "grad_norm": 1.1985528901056135, + "learning_rate": 4.934981540072887e-05, + "loss": 0.8581, + "step": 5321 + }, + { + "epoch": 2.412511332728921, + "grad_norm": 1.1159874551073923, + "learning_rate": 4.933750682852859e-05, + "loss": 0.9068, + "step": 5322 + }, + { + "epoch": 2.412964641885766, + "grad_norm": 0.8901439040069259, + "learning_rate": 4.9325197321149566e-05, + "loss": 0.8874, + "step": 5323 + }, + { + "epoch": 2.4134179510426113, + "grad_norm": 1.1768617584413759, + "learning_rate": 4.931288687982462e-05, + "loss": 0.8861, + "step": 5324 + }, + { + "epoch": 2.413871260199456, + "grad_norm": 0.7111282791003398, + "learning_rate": 4.930057550578668e-05, + "loss": 0.8869, + "step": 5325 + }, + { + "epoch": 2.414324569356301, + "grad_norm": 0.8333406004627978, + "learning_rate": 4.9288263200268776e-05, + "loss": 0.8676, + "step": 5326 + }, + { + "epoch": 2.4147778785131457, + "grad_norm": 0.9604553888190404, + "learning_rate": 4.9275949964504e-05, + "loss": 0.8629, + "step": 5327 + }, + { + "epoch": 2.415231187669991, + "grad_norm": 0.9755110167331937, + "learning_rate": 4.9263635799725584e-05, + "loss": 0.8815, + "step": 5328 + }, + { + "epoch": 2.415684496826836, + "grad_norm": 0.7880914004844906, + "learning_rate": 4.925132070716682e-05, + "loss": 0.8991, + "step": 5329 + }, + { + "epoch": 2.4161378059836807, + "grad_norm": 0.5859988383485621, + "learning_rate": 4.923900468806109e-05, + "loss": 0.8754, + "step": 5330 + }, + { + "epoch": 2.416591115140526, + "grad_norm": 0.772890441123839, + "learning_rate": 4.922668774364189e-05, + "loss": 0.86, + "step": 5331 + }, + { + "epoch": 2.4170444242973708, + "grad_norm": 0.7029177749858493, + "learning_rate": 4.9214369875142796e-05, + "loss": 0.8619, + "step": 5332 + }, + { + "epoch": 2.4174977334542156, + "grad_norm": 0.7938140352681888, + "learning_rate": 4.920205108379749e-05, + "loss": 0.8808, + "step": 5333 + }, + { + "epoch": 2.417951042611061, + "grad_norm": 1.0111763653622272, + "learning_rate": 4.918973137083972e-05, + "loss": 0.866, + "step": 5334 + }, + { + "epoch": 2.4184043517679057, + "grad_norm": 0.7158813145121151, + "learning_rate": 4.9177410737503336e-05, + "loss": 0.8848, + "step": 5335 + }, + { + "epoch": 2.4188576609247505, + "grad_norm": 0.5169104953053236, + "learning_rate": 4.91650891850223e-05, + "loss": 0.9032, + "step": 5336 + }, + { + "epoch": 2.419310970081596, + "grad_norm": 0.642535143786341, + "learning_rate": 4.915276671463064e-05, + "loss": 0.8678, + "step": 5337 + }, + { + "epoch": 2.4197642792384406, + "grad_norm": 0.6185615625612365, + "learning_rate": 4.91404433275625e-05, + "loss": 0.8888, + "step": 5338 + }, + { + "epoch": 2.4202175883952854, + "grad_norm": 0.5914977776951622, + "learning_rate": 4.912811902505209e-05, + "loss": 0.8724, + "step": 5339 + }, + { + "epoch": 2.4206708975521307, + "grad_norm": 0.5500711531683257, + "learning_rate": 4.9115793808333736e-05, + "loss": 0.8756, + "step": 5340 + }, + { + "epoch": 2.4211242067089755, + "grad_norm": 0.43145703142421565, + "learning_rate": 4.910346767864185e-05, + "loss": 0.8712, + "step": 5341 + }, + { + "epoch": 2.4215775158658204, + "grad_norm": 0.4143373357256692, + "learning_rate": 4.909114063721092e-05, + "loss": 0.872, + "step": 5342 + }, + { + "epoch": 2.4220308250226656, + "grad_norm": 0.46466776143750127, + "learning_rate": 4.9078812685275534e-05, + "loss": 0.8866, + "step": 5343 + }, + { + "epoch": 2.4224841341795105, + "grad_norm": 0.47229848964360904, + "learning_rate": 4.906648382407037e-05, + "loss": 0.8692, + "step": 5344 + }, + { + "epoch": 2.4229374433363553, + "grad_norm": 0.5678833161912438, + "learning_rate": 4.905415405483021e-05, + "loss": 0.8942, + "step": 5345 + }, + { + "epoch": 2.4233907524932006, + "grad_norm": 0.5818330989239985, + "learning_rate": 4.904182337878991e-05, + "loss": 0.8963, + "step": 5346 + }, + { + "epoch": 2.4238440616500454, + "grad_norm": 0.3293747918885042, + "learning_rate": 4.902949179718442e-05, + "loss": 0.8734, + "step": 5347 + }, + { + "epoch": 2.4242973708068902, + "grad_norm": 0.3659982104726782, + "learning_rate": 4.90171593112488e-05, + "loss": 0.8593, + "step": 5348 + }, + { + "epoch": 2.4247506799637355, + "grad_norm": 0.48225680450863145, + "learning_rate": 4.900482592221817e-05, + "loss": 0.889, + "step": 5349 + }, + { + "epoch": 2.4252039891205803, + "grad_norm": 0.3894044916082666, + "learning_rate": 4.899249163132775e-05, + "loss": 0.8826, + "step": 5350 + }, + { + "epoch": 2.425657298277425, + "grad_norm": 0.38753294577507386, + "learning_rate": 4.8980156439812875e-05, + "loss": 0.8596, + "step": 5351 + }, + { + "epoch": 2.42611060743427, + "grad_norm": 0.5611777184904975, + "learning_rate": 4.896782034890894e-05, + "loss": 0.8746, + "step": 5352 + }, + { + "epoch": 2.4265639165911153, + "grad_norm": 0.48845870220857457, + "learning_rate": 4.895548335985144e-05, + "loss": 0.8805, + "step": 5353 + }, + { + "epoch": 2.42701722574796, + "grad_norm": 0.37164298002219937, + "learning_rate": 4.894314547387597e-05, + "loss": 0.8578, + "step": 5354 + }, + { + "epoch": 2.427470534904805, + "grad_norm": 0.42240882357049603, + "learning_rate": 4.893080669221818e-05, + "loss": 0.8677, + "step": 5355 + }, + { + "epoch": 2.42792384406165, + "grad_norm": 0.44143384399888735, + "learning_rate": 4.891846701611385e-05, + "loss": 0.8668, + "step": 5356 + }, + { + "epoch": 2.428377153218495, + "grad_norm": 0.3451278203750534, + "learning_rate": 4.8906126446798855e-05, + "loss": 0.8994, + "step": 5357 + }, + { + "epoch": 2.42883046237534, + "grad_norm": 0.341554364707886, + "learning_rate": 4.889378498550912e-05, + "loss": 0.8709, + "step": 5358 + }, + { + "epoch": 2.429283771532185, + "grad_norm": 0.3274947357226111, + "learning_rate": 4.888144263348067e-05, + "loss": 0.8751, + "step": 5359 + }, + { + "epoch": 2.42973708068903, + "grad_norm": 0.33855700938694594, + "learning_rate": 4.886909939194964e-05, + "loss": 0.8876, + "step": 5360 + }, + { + "epoch": 2.4301903898458748, + "grad_norm": 0.29724210776160914, + "learning_rate": 4.885675526215226e-05, + "loss": 0.8739, + "step": 5361 + }, + { + "epoch": 2.43064369900272, + "grad_norm": 0.29074525376179555, + "learning_rate": 4.884441024532479e-05, + "loss": 0.9135, + "step": 5362 + }, + { + "epoch": 2.431097008159565, + "grad_norm": 0.3466653007912508, + "learning_rate": 4.883206434270363e-05, + "loss": 0.867, + "step": 5363 + }, + { + "epoch": 2.4315503173164097, + "grad_norm": 0.34054967214749576, + "learning_rate": 4.88197175555253e-05, + "loss": 0.8667, + "step": 5364 + }, + { + "epoch": 2.4320036264732545, + "grad_norm": 0.3116610197961883, + "learning_rate": 4.880736988502632e-05, + "loss": 0.8761, + "step": 5365 + }, + { + "epoch": 2.4324569356301, + "grad_norm": 0.3407841897692054, + "learning_rate": 4.879502133244336e-05, + "loss": 0.8735, + "step": 5366 + }, + { + "epoch": 2.4329102447869446, + "grad_norm": 0.39237164153880744, + "learning_rate": 4.878267189901317e-05, + "loss": 0.8726, + "step": 5367 + }, + { + "epoch": 2.4333635539437894, + "grad_norm": 0.3804511172518264, + "learning_rate": 4.8770321585972575e-05, + "loss": 0.8808, + "step": 5368 + }, + { + "epoch": 2.4338168631006347, + "grad_norm": 0.3470490241838002, + "learning_rate": 4.875797039455849e-05, + "loss": 0.8648, + "step": 5369 + }, + { + "epoch": 2.4342701722574795, + "grad_norm": 0.3427476056245708, + "learning_rate": 4.874561832600794e-05, + "loss": 0.8684, + "step": 5370 + }, + { + "epoch": 2.4347234814143244, + "grad_norm": 0.28405800346058446, + "learning_rate": 4.873326538155802e-05, + "loss": 0.8787, + "step": 5371 + }, + { + "epoch": 2.4351767905711696, + "grad_norm": 0.2595780592468819, + "learning_rate": 4.8720911562445896e-05, + "loss": 0.8675, + "step": 5372 + }, + { + "epoch": 2.4356300997280145, + "grad_norm": 0.3359139785905444, + "learning_rate": 4.870855686990885e-05, + "loss": 0.853, + "step": 5373 + }, + { + "epoch": 2.4360834088848593, + "grad_norm": 0.4306238310666022, + "learning_rate": 4.8696201305184235e-05, + "loss": 0.873, + "step": 5374 + }, + { + "epoch": 2.4365367180417046, + "grad_norm": 0.4862118549873189, + "learning_rate": 4.86838448695095e-05, + "loss": 0.878, + "step": 5375 + }, + { + "epoch": 2.4369900271985494, + "grad_norm": 0.48034229139730245, + "learning_rate": 4.867148756412219e-05, + "loss": 0.8803, + "step": 5376 + }, + { + "epoch": 2.4374433363553942, + "grad_norm": 0.3873583583167069, + "learning_rate": 4.8659129390259916e-05, + "loss": 0.8854, + "step": 5377 + }, + { + "epoch": 2.4378966455122395, + "grad_norm": 0.5287487006733117, + "learning_rate": 4.864677034916037e-05, + "loss": 0.8772, + "step": 5378 + }, + { + "epoch": 2.4383499546690843, + "grad_norm": 0.33218601975714407, + "learning_rate": 4.8634410442061376e-05, + "loss": 0.8659, + "step": 5379 + }, + { + "epoch": 2.438803263825929, + "grad_norm": 0.3013210483397206, + "learning_rate": 4.862204967020079e-05, + "loss": 0.8947, + "step": 5380 + }, + { + "epoch": 2.4392565729827744, + "grad_norm": 0.3035605189394623, + "learning_rate": 4.86096880348166e-05, + "loss": 0.8759, + "step": 5381 + }, + { + "epoch": 2.4397098821396193, + "grad_norm": 0.3365434412978244, + "learning_rate": 4.859732553714684e-05, + "loss": 0.8615, + "step": 5382 + }, + { + "epoch": 2.440163191296464, + "grad_norm": 0.4440664398186987, + "learning_rate": 4.858496217842967e-05, + "loss": 0.8811, + "step": 5383 + }, + { + "epoch": 2.4406165004533094, + "grad_norm": 0.46360078914224767, + "learning_rate": 4.85725979599033e-05, + "loss": 0.881, + "step": 5384 + }, + { + "epoch": 2.441069809610154, + "grad_norm": 0.410080854575634, + "learning_rate": 4.8560232882806046e-05, + "loss": 0.8718, + "step": 5385 + }, + { + "epoch": 2.441523118766999, + "grad_norm": 0.39224265698071087, + "learning_rate": 4.854786694837632e-05, + "loss": 0.8895, + "step": 5386 + }, + { + "epoch": 2.4419764279238443, + "grad_norm": 0.3950127304075945, + "learning_rate": 4.853550015785259e-05, + "loss": 0.88, + "step": 5387 + }, + { + "epoch": 2.442429737080689, + "grad_norm": 0.42424956479796566, + "learning_rate": 4.8523132512473436e-05, + "loss": 0.8742, + "step": 5388 + }, + { + "epoch": 2.442883046237534, + "grad_norm": 0.4184611110679396, + "learning_rate": 4.851076401347751e-05, + "loss": 0.8618, + "step": 5389 + }, + { + "epoch": 2.443336355394379, + "grad_norm": 0.3547638385885123, + "learning_rate": 4.8498394662103564e-05, + "loss": 0.8997, + "step": 5390 + }, + { + "epoch": 2.443789664551224, + "grad_norm": 0.3320813079519983, + "learning_rate": 4.8486024459590416e-05, + "loss": 0.8794, + "step": 5391 + }, + { + "epoch": 2.444242973708069, + "grad_norm": 0.4252818055953489, + "learning_rate": 4.847365340717698e-05, + "loss": 0.879, + "step": 5392 + }, + { + "epoch": 2.4446962828649137, + "grad_norm": 0.5365666754563821, + "learning_rate": 4.846128150610224e-05, + "loss": 0.8942, + "step": 5393 + }, + { + "epoch": 2.445149592021759, + "grad_norm": 0.6231584347487227, + "learning_rate": 4.84489087576053e-05, + "loss": 0.8594, + "step": 5394 + }, + { + "epoch": 2.445602901178604, + "grad_norm": 0.545890577061619, + "learning_rate": 4.843653516292531e-05, + "loss": 0.8707, + "step": 5395 + }, + { + "epoch": 2.4460562103354486, + "grad_norm": 0.5209378103497359, + "learning_rate": 4.842416072330152e-05, + "loss": 0.8676, + "step": 5396 + }, + { + "epoch": 2.446509519492294, + "grad_norm": 0.47848765856981473, + "learning_rate": 4.841178543997329e-05, + "loss": 0.8827, + "step": 5397 + }, + { + "epoch": 2.4469628286491387, + "grad_norm": 0.3405159452648545, + "learning_rate": 4.839940931418002e-05, + "loss": 0.8602, + "step": 5398 + }, + { + "epoch": 2.4474161378059835, + "grad_norm": 0.3401262770106382, + "learning_rate": 4.838703234716122e-05, + "loss": 0.8857, + "step": 5399 + }, + { + "epoch": 2.447869446962829, + "grad_norm": 0.34992444030240655, + "learning_rate": 4.837465454015649e-05, + "loss": 0.852, + "step": 5400 + }, + { + "epoch": 2.4483227561196736, + "grad_norm": 0.3676772283001786, + "learning_rate": 4.8362275894405495e-05, + "loss": 0.8811, + "step": 5401 + }, + { + "epoch": 2.4487760652765185, + "grad_norm": 0.3456759166161624, + "learning_rate": 4.834989641114799e-05, + "loss": 0.8448, + "step": 5402 + }, + { + "epoch": 2.4492293744333633, + "grad_norm": 0.41459541169515207, + "learning_rate": 4.833751609162382e-05, + "loss": 0.8832, + "step": 5403 + }, + { + "epoch": 2.4496826835902086, + "grad_norm": 0.4688391009358986, + "learning_rate": 4.8325134937072904e-05, + "loss": 0.9057, + "step": 5404 + }, + { + "epoch": 2.4501359927470534, + "grad_norm": 0.5061400858029779, + "learning_rate": 4.831275294873527e-05, + "loss": 0.8746, + "step": 5405 + }, + { + "epoch": 2.4505893019038982, + "grad_norm": 0.505192950254817, + "learning_rate": 4.8300370127850984e-05, + "loss": 0.8698, + "step": 5406 + }, + { + "epoch": 2.4510426110607435, + "grad_norm": 0.4935309278024096, + "learning_rate": 4.828798647566024e-05, + "loss": 0.8602, + "step": 5407 + }, + { + "epoch": 2.4514959202175883, + "grad_norm": 0.37554617537535684, + "learning_rate": 4.827560199340329e-05, + "loss": 0.8821, + "step": 5408 + }, + { + "epoch": 2.451949229374433, + "grad_norm": 0.28190709710888806, + "learning_rate": 4.8263216682320485e-05, + "loss": 0.8752, + "step": 5409 + }, + { + "epoch": 2.4524025385312784, + "grad_norm": 0.2631669580783247, + "learning_rate": 4.825083054365224e-05, + "loss": 0.8818, + "step": 5410 + }, + { + "epoch": 2.4528558476881233, + "grad_norm": 0.3300651620037356, + "learning_rate": 4.823844357863907e-05, + "loss": 0.8855, + "step": 5411 + }, + { + "epoch": 2.453309156844968, + "grad_norm": 0.38895715828725225, + "learning_rate": 4.8226055788521564e-05, + "loss": 0.8782, + "step": 5412 + }, + { + "epoch": 2.4537624660018134, + "grad_norm": 0.3903585905537491, + "learning_rate": 4.8213667174540394e-05, + "loss": 0.8699, + "step": 5413 + }, + { + "epoch": 2.454215775158658, + "grad_norm": 0.4247089499700825, + "learning_rate": 4.820127773793631e-05, + "loss": 0.8992, + "step": 5414 + }, + { + "epoch": 2.454669084315503, + "grad_norm": 0.40510595909696406, + "learning_rate": 4.818888747995016e-05, + "loss": 0.867, + "step": 5415 + }, + { + "epoch": 2.4551223934723483, + "grad_norm": 0.42302791519979527, + "learning_rate": 4.817649640182286e-05, + "loss": 0.8823, + "step": 5416 + }, + { + "epoch": 2.455575702629193, + "grad_norm": 0.44567836538347566, + "learning_rate": 4.816410450479542e-05, + "loss": 0.8695, + "step": 5417 + }, + { + "epoch": 2.456029011786038, + "grad_norm": 0.3938033603033576, + "learning_rate": 4.815171179010893e-05, + "loss": 0.9063, + "step": 5418 + }, + { + "epoch": 2.456482320942883, + "grad_norm": 0.25421816336664455, + "learning_rate": 4.8139318259004526e-05, + "loss": 0.851, + "step": 5419 + }, + { + "epoch": 2.456935630099728, + "grad_norm": 0.2863806000042518, + "learning_rate": 4.81269239127235e-05, + "loss": 0.8722, + "step": 5420 + }, + { + "epoch": 2.457388939256573, + "grad_norm": 0.3649254161988491, + "learning_rate": 4.811452875250714e-05, + "loss": 0.8733, + "step": 5421 + }, + { + "epoch": 2.457842248413418, + "grad_norm": 0.3583679898444726, + "learning_rate": 4.810213277959689e-05, + "loss": 0.8901, + "step": 5422 + }, + { + "epoch": 2.458295557570263, + "grad_norm": 0.37969146290134514, + "learning_rate": 4.8089735995234227e-05, + "loss": 0.86, + "step": 5423 + }, + { + "epoch": 2.458748866727108, + "grad_norm": 0.310030674680303, + "learning_rate": 4.807733840066072e-05, + "loss": 0.8846, + "step": 5424 + }, + { + "epoch": 2.459202175883953, + "grad_norm": 1.5011075140711663, + "learning_rate": 4.806493999711804e-05, + "loss": 0.8832, + "step": 5425 + }, + { + "epoch": 2.459655485040798, + "grad_norm": 0.312930200426933, + "learning_rate": 4.805254078584791e-05, + "loss": 0.8851, + "step": 5426 + }, + { + "epoch": 2.4601087941976427, + "grad_norm": 0.44295252042800637, + "learning_rate": 4.8040140768092164e-05, + "loss": 0.883, + "step": 5427 + }, + { + "epoch": 2.460562103354488, + "grad_norm": 0.5438805813237402, + "learning_rate": 4.8027739945092685e-05, + "loss": 0.8675, + "step": 5428 + }, + { + "epoch": 2.461015412511333, + "grad_norm": 0.4953883978318142, + "learning_rate": 4.801533831809147e-05, + "loss": 0.8634, + "step": 5429 + }, + { + "epoch": 2.4614687216681777, + "grad_norm": 0.41036957985936284, + "learning_rate": 4.800293588833056e-05, + "loss": 0.8852, + "step": 5430 + }, + { + "epoch": 2.4619220308250225, + "grad_norm": 0.32706401460397166, + "learning_rate": 4.799053265705209e-05, + "loss": 0.8809, + "step": 5431 + }, + { + "epoch": 2.4623753399818678, + "grad_norm": 0.3676703315097909, + "learning_rate": 4.7978128625498294e-05, + "loss": 0.8776, + "step": 5432 + }, + { + "epoch": 2.4628286491387126, + "grad_norm": 0.5281786060397857, + "learning_rate": 4.796572379491148e-05, + "loss": 0.859, + "step": 5433 + }, + { + "epoch": 2.4632819582955574, + "grad_norm": 0.5748808348084197, + "learning_rate": 4.7953318166534006e-05, + "loss": 0.879, + "step": 5434 + }, + { + "epoch": 2.4637352674524027, + "grad_norm": 0.5439822147300265, + "learning_rate": 4.7940911741608344e-05, + "loss": 0.8844, + "step": 5435 + }, + { + "epoch": 2.4641885766092475, + "grad_norm": 0.4732155872194248, + "learning_rate": 4.792850452137704e-05, + "loss": 0.8725, + "step": 5436 + }, + { + "epoch": 2.4646418857660923, + "grad_norm": 0.44541560851412165, + "learning_rate": 4.79160965070827e-05, + "loss": 0.8604, + "step": 5437 + }, + { + "epoch": 2.4650951949229376, + "grad_norm": 0.42207221718055365, + "learning_rate": 4.7903687699968035e-05, + "loss": 0.8641, + "step": 5438 + }, + { + "epoch": 2.4655485040797824, + "grad_norm": 0.44645801462719004, + "learning_rate": 4.789127810127582e-05, + "loss": 0.8882, + "step": 5439 + }, + { + "epoch": 2.4660018132366273, + "grad_norm": 0.4646420383895577, + "learning_rate": 4.78788677122489e-05, + "loss": 0.8876, + "step": 5440 + }, + { + "epoch": 2.4664551223934725, + "grad_norm": 0.4679476779051183, + "learning_rate": 4.7866456534130235e-05, + "loss": 0.8899, + "step": 5441 + }, + { + "epoch": 2.4669084315503174, + "grad_norm": 0.2950357735465817, + "learning_rate": 4.785404456816282e-05, + "loss": 0.867, + "step": 5442 + }, + { + "epoch": 2.467361740707162, + "grad_norm": 0.3245051114115228, + "learning_rate": 4.784163181558976e-05, + "loss": 0.8839, + "step": 5443 + }, + { + "epoch": 2.467815049864007, + "grad_norm": 0.43474999744294446, + "learning_rate": 4.782921827765423e-05, + "loss": 0.8586, + "step": 5444 + }, + { + "epoch": 2.4682683590208523, + "grad_norm": 0.4369600093457574, + "learning_rate": 4.781680395559948e-05, + "loss": 0.8739, + "step": 5445 + }, + { + "epoch": 2.468721668177697, + "grad_norm": 0.3480210349125851, + "learning_rate": 4.7804388850668846e-05, + "loss": 0.8923, + "step": 5446 + }, + { + "epoch": 2.469174977334542, + "grad_norm": 0.2755103063838401, + "learning_rate": 4.779197296410573e-05, + "loss": 0.8637, + "step": 5447 + }, + { + "epoch": 2.469628286491387, + "grad_norm": 0.3281488805512811, + "learning_rate": 4.777955629715362e-05, + "loss": 0.876, + "step": 5448 + }, + { + "epoch": 2.470081595648232, + "grad_norm": 0.4167901063426073, + "learning_rate": 4.776713885105608e-05, + "loss": 0.8618, + "step": 5449 + }, + { + "epoch": 2.470534904805077, + "grad_norm": 0.5129828867980764, + "learning_rate": 4.7754720627056756e-05, + "loss": 0.8967, + "step": 5450 + }, + { + "epoch": 2.470988213961922, + "grad_norm": 0.4922354772385084, + "learning_rate": 4.774230162639938e-05, + "loss": 0.8706, + "step": 5451 + }, + { + "epoch": 2.471441523118767, + "grad_norm": 0.40424830764843783, + "learning_rate": 4.772988185032772e-05, + "loss": 0.9128, + "step": 5452 + }, + { + "epoch": 2.471894832275612, + "grad_norm": 0.3680625147931365, + "learning_rate": 4.77174613000857e-05, + "loss": 0.8558, + "step": 5453 + }, + { + "epoch": 2.472348141432457, + "grad_norm": 0.42874331181632, + "learning_rate": 4.770503997691723e-05, + "loss": 0.8751, + "step": 5454 + }, + { + "epoch": 2.472801450589302, + "grad_norm": 0.41775099228023677, + "learning_rate": 4.769261788206638e-05, + "loss": 0.8854, + "step": 5455 + }, + { + "epoch": 2.4732547597461467, + "grad_norm": 0.37562332751480626, + "learning_rate": 4.768019501677723e-05, + "loss": 0.8804, + "step": 5456 + }, + { + "epoch": 2.473708068902992, + "grad_norm": 0.35071809313048974, + "learning_rate": 4.7667771382293974e-05, + "loss": 0.8942, + "step": 5457 + }, + { + "epoch": 2.474161378059837, + "grad_norm": 0.30103676812697455, + "learning_rate": 4.765534697986088e-05, + "loss": 0.8646, + "step": 5458 + }, + { + "epoch": 2.4746146872166817, + "grad_norm": 0.293564168303737, + "learning_rate": 4.76429218107223e-05, + "loss": 0.8726, + "step": 5459 + }, + { + "epoch": 2.475067996373527, + "grad_norm": 0.3678766358004095, + "learning_rate": 4.763049587612263e-05, + "loss": 0.8717, + "step": 5460 + }, + { + "epoch": 2.4755213055303718, + "grad_norm": 0.44998499702672645, + "learning_rate": 4.761806917730637e-05, + "loss": 0.8771, + "step": 5461 + }, + { + "epoch": 2.4759746146872166, + "grad_norm": 0.5032551074736745, + "learning_rate": 4.76056417155181e-05, + "loss": 0.8602, + "step": 5462 + }, + { + "epoch": 2.476427923844062, + "grad_norm": 0.43766708010434824, + "learning_rate": 4.759321349200245e-05, + "loss": 0.87, + "step": 5463 + }, + { + "epoch": 2.4768812330009067, + "grad_norm": 0.3160371076103249, + "learning_rate": 4.7580784508004154e-05, + "loss": 0.8755, + "step": 5464 + }, + { + "epoch": 2.4773345421577515, + "grad_norm": 0.24990572304324826, + "learning_rate": 4.7568354764768014e-05, + "loss": 0.8598, + "step": 5465 + }, + { + "epoch": 2.477787851314597, + "grad_norm": 0.25947772495412774, + "learning_rate": 4.75559242635389e-05, + "loss": 0.8752, + "step": 5466 + }, + { + "epoch": 2.4782411604714416, + "grad_norm": 0.25761731341511684, + "learning_rate": 4.754349300556177e-05, + "loss": 0.8628, + "step": 5467 + }, + { + "epoch": 2.4786944696282864, + "grad_norm": 0.2625332158912322, + "learning_rate": 4.753106099208165e-05, + "loss": 0.868, + "step": 5468 + }, + { + "epoch": 2.4791477787851317, + "grad_norm": 0.3220258317203931, + "learning_rate": 4.751862822434362e-05, + "loss": 0.8802, + "step": 5469 + }, + { + "epoch": 2.4796010879419765, + "grad_norm": 0.3148641812760161, + "learning_rate": 4.7506194703592884e-05, + "loss": 0.8933, + "step": 5470 + }, + { + "epoch": 2.4800543970988214, + "grad_norm": 0.3024835784488438, + "learning_rate": 4.749376043107469e-05, + "loss": 0.8887, + "step": 5471 + }, + { + "epoch": 2.480507706255666, + "grad_norm": 0.2555799424149101, + "learning_rate": 4.748132540803436e-05, + "loss": 0.8691, + "step": 5472 + }, + { + "epoch": 2.4809610154125115, + "grad_norm": 0.22552656964622808, + "learning_rate": 4.746888963571731e-05, + "loss": 0.8647, + "step": 5473 + }, + { + "epoch": 2.4814143245693563, + "grad_norm": 0.2998076257992529, + "learning_rate": 4.745645311536901e-05, + "loss": 0.8665, + "step": 5474 + }, + { + "epoch": 2.481867633726201, + "grad_norm": 0.3469190766474229, + "learning_rate": 4.744401584823502e-05, + "loss": 0.9133, + "step": 5475 + }, + { + "epoch": 2.4823209428830464, + "grad_norm": 0.35497172980146496, + "learning_rate": 4.743157783556096e-05, + "loss": 0.8715, + "step": 5476 + }, + { + "epoch": 2.482774252039891, + "grad_norm": 0.3790858407956024, + "learning_rate": 4.741913907859255e-05, + "loss": 0.8861, + "step": 5477 + }, + { + "epoch": 2.483227561196736, + "grad_norm": 0.40301441974026253, + "learning_rate": 4.740669957857555e-05, + "loss": 0.8924, + "step": 5478 + }, + { + "epoch": 2.4836808703535813, + "grad_norm": 0.39336991003161526, + "learning_rate": 4.739425933675581e-05, + "loss": 0.8917, + "step": 5479 + }, + { + "epoch": 2.484134179510426, + "grad_norm": 0.38153371211692816, + "learning_rate": 4.738181835437927e-05, + "loss": 0.8899, + "step": 5480 + }, + { + "epoch": 2.484587488667271, + "grad_norm": 0.4126393039473719, + "learning_rate": 4.736937663269193e-05, + "loss": 0.8971, + "step": 5481 + }, + { + "epoch": 2.485040797824116, + "grad_norm": 0.45607865795598984, + "learning_rate": 4.735693417293986e-05, + "loss": 0.8778, + "step": 5482 + }, + { + "epoch": 2.485494106980961, + "grad_norm": 0.47118468013311854, + "learning_rate": 4.734449097636922e-05, + "loss": 0.8835, + "step": 5483 + }, + { + "epoch": 2.485947416137806, + "grad_norm": 0.45983145616839943, + "learning_rate": 4.733204704422621e-05, + "loss": 0.8619, + "step": 5484 + }, + { + "epoch": 2.4864007252946507, + "grad_norm": 0.3917821388370337, + "learning_rate": 4.731960237775715e-05, + "loss": 0.8849, + "step": 5485 + }, + { + "epoch": 2.486854034451496, + "grad_norm": 0.3489611217908264, + "learning_rate": 4.73071569782084e-05, + "loss": 0.8851, + "step": 5486 + }, + { + "epoch": 2.487307343608341, + "grad_norm": 0.3717532745514684, + "learning_rate": 4.72947108468264e-05, + "loss": 0.8737, + "step": 5487 + }, + { + "epoch": 2.4877606527651857, + "grad_norm": 0.34136444150808626, + "learning_rate": 4.728226398485767e-05, + "loss": 0.8751, + "step": 5488 + }, + { + "epoch": 2.488213961922031, + "grad_norm": 0.39439871052056846, + "learning_rate": 4.726981639354879e-05, + "loss": 0.8973, + "step": 5489 + }, + { + "epoch": 2.4886672710788758, + "grad_norm": 0.3911914411095558, + "learning_rate": 4.725736807414645e-05, + "loss": 0.8807, + "step": 5490 + }, + { + "epoch": 2.4891205802357206, + "grad_norm": 0.3717439019880251, + "learning_rate": 4.724491902789736e-05, + "loss": 0.876, + "step": 5491 + }, + { + "epoch": 2.489573889392566, + "grad_norm": 0.30808931967290365, + "learning_rate": 4.723246925604834e-05, + "loss": 0.8831, + "step": 5492 + }, + { + "epoch": 2.4900271985494107, + "grad_norm": 0.29071206896601326, + "learning_rate": 4.722001875984626e-05, + "loss": 0.8769, + "step": 5493 + }, + { + "epoch": 2.4904805077062555, + "grad_norm": 0.2679571443367385, + "learning_rate": 4.720756754053809e-05, + "loss": 0.8663, + "step": 5494 + }, + { + "epoch": 2.490933816863101, + "grad_norm": 0.30132822633506945, + "learning_rate": 4.7195115599370846e-05, + "loss": 0.8805, + "step": 5495 + }, + { + "epoch": 2.4913871260199456, + "grad_norm": 0.3110757895295651, + "learning_rate": 4.718266293759163e-05, + "loss": 0.8818, + "step": 5496 + }, + { + "epoch": 2.4918404351767904, + "grad_norm": 0.4133104171474226, + "learning_rate": 4.717020955644762e-05, + "loss": 0.866, + "step": 5497 + }, + { + "epoch": 2.4922937443336357, + "grad_norm": 0.5135775580308751, + "learning_rate": 4.715775545718603e-05, + "loss": 0.8738, + "step": 5498 + }, + { + "epoch": 2.4927470534904805, + "grad_norm": 0.5318109087931773, + "learning_rate": 4.7145300641054197e-05, + "loss": 0.8877, + "step": 5499 + }, + { + "epoch": 2.4932003626473254, + "grad_norm": 0.5533407222555866, + "learning_rate": 4.713284510929952e-05, + "loss": 0.8814, + "step": 5500 + }, + { + "epoch": 2.4936536718041706, + "grad_norm": 0.5875661873525808, + "learning_rate": 4.712038886316944e-05, + "loss": 0.8766, + "step": 5501 + }, + { + "epoch": 2.4941069809610155, + "grad_norm": 0.6639789916128458, + "learning_rate": 4.7107931903911475e-05, + "loss": 0.8826, + "step": 5502 + }, + { + "epoch": 2.4945602901178603, + "grad_norm": 0.5821591436548675, + "learning_rate": 4.709547423277325e-05, + "loss": 0.8675, + "step": 5503 + }, + { + "epoch": 2.4950135992747056, + "grad_norm": 0.4505835361873875, + "learning_rate": 4.708301585100243e-05, + "loss": 0.8646, + "step": 5504 + }, + { + "epoch": 2.4954669084315504, + "grad_norm": 0.29680770578957855, + "learning_rate": 4.7070556759846764e-05, + "loss": 0.8658, + "step": 5505 + }, + { + "epoch": 2.495920217588395, + "grad_norm": 0.2998148001002336, + "learning_rate": 4.705809696055404e-05, + "loss": 0.9021, + "step": 5506 + }, + { + "epoch": 2.4963735267452405, + "grad_norm": 0.41677096812859243, + "learning_rate": 4.704563645437217e-05, + "loss": 0.8825, + "step": 5507 + }, + { + "epoch": 2.4968268359020853, + "grad_norm": 0.5364753342296026, + "learning_rate": 4.7033175242549105e-05, + "loss": 0.8808, + "step": 5508 + }, + { + "epoch": 2.49728014505893, + "grad_norm": 0.628234727118295, + "learning_rate": 4.702071332633286e-05, + "loss": 0.8505, + "step": 5509 + }, + { + "epoch": 2.497733454215775, + "grad_norm": 0.6002009912184146, + "learning_rate": 4.700825070697154e-05, + "loss": 0.8816, + "step": 5510 + }, + { + "epoch": 2.4981867633726202, + "grad_norm": 0.5461104230143402, + "learning_rate": 4.6995787385713325e-05, + "loss": 0.8822, + "step": 5511 + }, + { + "epoch": 2.498640072529465, + "grad_norm": 0.5037346946584944, + "learning_rate": 4.6983323363806435e-05, + "loss": 0.8681, + "step": 5512 + }, + { + "epoch": 2.49909338168631, + "grad_norm": 0.44658140088784437, + "learning_rate": 4.6970858642499196e-05, + "loss": 0.8556, + "step": 5513 + }, + { + "epoch": 2.499546690843155, + "grad_norm": 0.3862959815684738, + "learning_rate": 4.6958393223039965e-05, + "loss": 0.8736, + "step": 5514 + }, + { + "epoch": 2.5, + "grad_norm": 0.38119942791454575, + "learning_rate": 4.694592710667723e-05, + "loss": 0.8811, + "step": 5515 + }, + { + "epoch": 2.500453309156845, + "grad_norm": 0.36826897631871647, + "learning_rate": 4.693346029465945e-05, + "loss": 0.8882, + "step": 5516 + }, + { + "epoch": 2.50090661831369, + "grad_norm": 0.3405490447380599, + "learning_rate": 4.692099278823525e-05, + "loss": 0.8742, + "step": 5517 + }, + { + "epoch": 2.501359927470535, + "grad_norm": 0.39405322381910607, + "learning_rate": 4.69085245886533e-05, + "loss": 0.8954, + "step": 5518 + }, + { + "epoch": 2.5018132366273798, + "grad_norm": 0.40699448101099345, + "learning_rate": 4.6896055697162295e-05, + "loss": 0.8741, + "step": 5519 + }, + { + "epoch": 2.5022665457842246, + "grad_norm": 0.3886812539138604, + "learning_rate": 4.688358611501104e-05, + "loss": 0.8969, + "step": 5520 + }, + { + "epoch": 2.50271985494107, + "grad_norm": 0.3225494889114954, + "learning_rate": 4.687111584344841e-05, + "loss": 0.8846, + "step": 5521 + }, + { + "epoch": 2.5031731640979147, + "grad_norm": 0.34464237842229317, + "learning_rate": 4.685864488372334e-05, + "loss": 0.8627, + "step": 5522 + }, + { + "epoch": 2.5036264732547595, + "grad_norm": 0.3536008331243863, + "learning_rate": 4.684617323708482e-05, + "loss": 0.872, + "step": 5523 + }, + { + "epoch": 2.504079782411605, + "grad_norm": 0.3630754404561008, + "learning_rate": 4.683370090478193e-05, + "loss": 0.8741, + "step": 5524 + }, + { + "epoch": 2.5045330915684496, + "grad_norm": 0.4737337554348843, + "learning_rate": 4.682122788806383e-05, + "loss": 0.8921, + "step": 5525 + }, + { + "epoch": 2.5049864007252944, + "grad_norm": 0.4261692111319818, + "learning_rate": 4.6808754188179685e-05, + "loss": 0.8865, + "step": 5526 + }, + { + "epoch": 2.5054397098821397, + "grad_norm": 0.5070187530666659, + "learning_rate": 4.679627980637881e-05, + "loss": 0.8937, + "step": 5527 + }, + { + "epoch": 2.5058930190389845, + "grad_norm": 0.4031330180568247, + "learning_rate": 4.678380474391053e-05, + "loss": 0.8759, + "step": 5528 + }, + { + "epoch": 2.5063463281958294, + "grad_norm": 0.3945110724343244, + "learning_rate": 4.677132900202427e-05, + "loss": 0.8974, + "step": 5529 + }, + { + "epoch": 2.5067996373526746, + "grad_norm": 0.3551785048687741, + "learning_rate": 4.675885258196952e-05, + "loss": 0.9008, + "step": 5530 + }, + { + "epoch": 2.5072529465095195, + "grad_norm": 0.39029555094330204, + "learning_rate": 4.674637548499581e-05, + "loss": 0.8747, + "step": 5531 + }, + { + "epoch": 2.5077062556663643, + "grad_norm": 0.39420885741663986, + "learning_rate": 4.673389771235277e-05, + "loss": 0.8768, + "step": 5532 + }, + { + "epoch": 2.5081595648232096, + "grad_norm": 0.33025379409806477, + "learning_rate": 4.6721419265290084e-05, + "loss": 0.8845, + "step": 5533 + }, + { + "epoch": 2.5086128739800544, + "grad_norm": 0.30127496197978565, + "learning_rate": 4.6708940145057515e-05, + "loss": 0.8898, + "step": 5534 + }, + { + "epoch": 2.509066183136899, + "grad_norm": 0.5433875325828915, + "learning_rate": 4.669646035290486e-05, + "loss": 0.8973, + "step": 5535 + }, + { + "epoch": 2.5095194922937445, + "grad_norm": 0.41300550016795223, + "learning_rate": 4.668397989008202e-05, + "loss": 0.8613, + "step": 5536 + }, + { + "epoch": 2.5099728014505893, + "grad_norm": 0.30808650016656136, + "learning_rate": 4.667149875783895e-05, + "loss": 0.8813, + "step": 5537 + }, + { + "epoch": 2.510426110607434, + "grad_norm": 0.32639035339028305, + "learning_rate": 4.665901695742568e-05, + "loss": 0.8996, + "step": 5538 + }, + { + "epoch": 2.5108794197642794, + "grad_norm": 0.30673523523699725, + "learning_rate": 4.664653449009228e-05, + "loss": 0.8595, + "step": 5539 + }, + { + "epoch": 2.5113327289211242, + "grad_norm": 0.36308834244712135, + "learning_rate": 4.6634051357088925e-05, + "loss": 0.8873, + "step": 5540 + }, + { + "epoch": 2.511786038077969, + "grad_norm": 0.35675671574436957, + "learning_rate": 4.662156755966582e-05, + "loss": 0.8619, + "step": 5541 + }, + { + "epoch": 2.5122393472348143, + "grad_norm": 0.3436511527610942, + "learning_rate": 4.660908309907328e-05, + "loss": 0.8845, + "step": 5542 + }, + { + "epoch": 2.512692656391659, + "grad_norm": 0.3244206907182932, + "learning_rate": 4.6596597976561624e-05, + "loss": 0.8739, + "step": 5543 + }, + { + "epoch": 2.513145965548504, + "grad_norm": 0.329695856707433, + "learning_rate": 4.6584112193381314e-05, + "loss": 0.8727, + "step": 5544 + }, + { + "epoch": 2.5135992747053493, + "grad_norm": 0.34081478288744177, + "learning_rate": 4.657162575078281e-05, + "loss": 0.8898, + "step": 5545 + }, + { + "epoch": 2.514052583862194, + "grad_norm": 0.34989283637979945, + "learning_rate": 4.655913865001667e-05, + "loss": 0.8767, + "step": 5546 + }, + { + "epoch": 2.514505893019039, + "grad_norm": 0.34234840844094755, + "learning_rate": 4.6546650892333526e-05, + "loss": 0.8836, + "step": 5547 + }, + { + "epoch": 2.514959202175884, + "grad_norm": 0.34897618193555013, + "learning_rate": 4.653416247898404e-05, + "loss": 0.8887, + "step": 5548 + }, + { + "epoch": 2.515412511332729, + "grad_norm": 0.296993339775738, + "learning_rate": 4.6521673411219e-05, + "loss": 0.8834, + "step": 5549 + }, + { + "epoch": 2.515865820489574, + "grad_norm": 0.2955291006064988, + "learning_rate": 4.650918369028918e-05, + "loss": 0.8945, + "step": 5550 + }, + { + "epoch": 2.516319129646419, + "grad_norm": 0.2959504008900902, + "learning_rate": 4.649669331744548e-05, + "loss": 0.8531, + "step": 5551 + }, + { + "epoch": 2.516772438803264, + "grad_norm": 0.28612470879017154, + "learning_rate": 4.648420229393886e-05, + "loss": 0.8698, + "step": 5552 + }, + { + "epoch": 2.517225747960109, + "grad_norm": 0.28997928422845576, + "learning_rate": 4.647171062102034e-05, + "loss": 0.8821, + "step": 5553 + }, + { + "epoch": 2.5176790571169536, + "grad_norm": 0.28968108489190825, + "learning_rate": 4.6459218299940967e-05, + "loss": 0.8813, + "step": 5554 + }, + { + "epoch": 2.518132366273799, + "grad_norm": 0.2676665565152899, + "learning_rate": 4.6446725331951885e-05, + "loss": 0.8771, + "step": 5555 + }, + { + "epoch": 2.5185856754306437, + "grad_norm": 0.24592000614192072, + "learning_rate": 4.643423171830433e-05, + "loss": 0.8723, + "step": 5556 + }, + { + "epoch": 2.5190389845874885, + "grad_norm": 0.26428602248608424, + "learning_rate": 4.642173746024956e-05, + "loss": 0.8762, + "step": 5557 + }, + { + "epoch": 2.5194922937443334, + "grad_norm": 0.284782028266064, + "learning_rate": 4.64092425590389e-05, + "loss": 0.865, + "step": 5558 + }, + { + "epoch": 2.5199456029011786, + "grad_norm": 0.3428433157612863, + "learning_rate": 4.639674701592376e-05, + "loss": 0.867, + "step": 5559 + }, + { + "epoch": 2.5203989120580235, + "grad_norm": 0.4431863878290127, + "learning_rate": 4.6384250832155626e-05, + "loss": 0.8937, + "step": 5560 + }, + { + "epoch": 2.5208522212148683, + "grad_norm": 0.4036896979817021, + "learning_rate": 4.637175400898599e-05, + "loss": 0.855, + "step": 5561 + }, + { + "epoch": 2.5213055303717136, + "grad_norm": 0.3582166803819985, + "learning_rate": 4.635925654766647e-05, + "loss": 0.8693, + "step": 5562 + }, + { + "epoch": 2.5217588395285584, + "grad_norm": 0.35901143231009336, + "learning_rate": 4.6346758449448726e-05, + "loss": 0.8807, + "step": 5563 + }, + { + "epoch": 2.522212148685403, + "grad_norm": 0.382698198700608, + "learning_rate": 4.6334259715584454e-05, + "loss": 0.8623, + "step": 5564 + }, + { + "epoch": 2.5226654578422485, + "grad_norm": 0.3285743419884047, + "learning_rate": 4.6321760347325465e-05, + "loss": 0.8808, + "step": 5565 + }, + { + "epoch": 2.5231187669990933, + "grad_norm": 0.3131369475548153, + "learning_rate": 4.630926034592359e-05, + "loss": 0.8964, + "step": 5566 + }, + { + "epoch": 2.523572076155938, + "grad_norm": 0.373686441087009, + "learning_rate": 4.6296759712630755e-05, + "loss": 0.8502, + "step": 5567 + }, + { + "epoch": 2.5240253853127834, + "grad_norm": 0.3791472264793105, + "learning_rate": 4.6284258448698924e-05, + "loss": 0.8559, + "step": 5568 + }, + { + "epoch": 2.5244786944696282, + "grad_norm": 0.41782124682304594, + "learning_rate": 4.627175655538015e-05, + "loss": 0.8758, + "step": 5569 + }, + { + "epoch": 2.524932003626473, + "grad_norm": 0.4210177110629557, + "learning_rate": 4.6259254033926516e-05, + "loss": 0.8685, + "step": 5570 + }, + { + "epoch": 2.5253853127833183, + "grad_norm": 0.4232857505938946, + "learning_rate": 4.624675088559019e-05, + "loss": 0.9063, + "step": 5571 + }, + { + "epoch": 2.525838621940163, + "grad_norm": 0.48432957743782346, + "learning_rate": 4.623424711162342e-05, + "loss": 0.8755, + "step": 5572 + }, + { + "epoch": 2.526291931097008, + "grad_norm": 0.47967696248391206, + "learning_rate": 4.622174271327847e-05, + "loss": 0.8728, + "step": 5573 + }, + { + "epoch": 2.5267452402538533, + "grad_norm": 0.432739144460773, + "learning_rate": 4.620923769180771e-05, + "loss": 0.8828, + "step": 5574 + }, + { + "epoch": 2.527198549410698, + "grad_norm": 0.3608612249399145, + "learning_rate": 4.6196732048463544e-05, + "loss": 0.8705, + "step": 5575 + }, + { + "epoch": 2.527651858567543, + "grad_norm": 0.38469975183438887, + "learning_rate": 4.618422578449845e-05, + "loss": 0.862, + "step": 5576 + }, + { + "epoch": 2.528105167724388, + "grad_norm": 0.41769118796432264, + "learning_rate": 4.617171890116495e-05, + "loss": 0.8672, + "step": 5577 + }, + { + "epoch": 2.528558476881233, + "grad_norm": 0.4622651361591734, + "learning_rate": 4.615921139971569e-05, + "loss": 0.8859, + "step": 5578 + }, + { + "epoch": 2.529011786038078, + "grad_norm": 0.47186903918049644, + "learning_rate": 4.614670328140329e-05, + "loss": 0.8727, + "step": 5579 + }, + { + "epoch": 2.529465095194923, + "grad_norm": 0.4169168798427771, + "learning_rate": 4.61341945474805e-05, + "loss": 0.8631, + "step": 5580 + }, + { + "epoch": 2.529918404351768, + "grad_norm": 0.4754082461698161, + "learning_rate": 4.6121685199200094e-05, + "loss": 0.8815, + "step": 5581 + }, + { + "epoch": 2.530371713508613, + "grad_norm": 0.6112798373888094, + "learning_rate": 4.610917523781493e-05, + "loss": 0.8786, + "step": 5582 + }, + { + "epoch": 2.530825022665458, + "grad_norm": 0.6819981117926267, + "learning_rate": 4.609666466457791e-05, + "loss": 0.8795, + "step": 5583 + }, + { + "epoch": 2.531278331822303, + "grad_norm": 0.6127135309259948, + "learning_rate": 4.6084153480742e-05, + "loss": 0.8641, + "step": 5584 + }, + { + "epoch": 2.5317316409791477, + "grad_norm": 0.5289093731246506, + "learning_rate": 4.607164168756024e-05, + "loss": 0.885, + "step": 5585 + }, + { + "epoch": 2.532184950135993, + "grad_norm": 0.5376356447856495, + "learning_rate": 4.605912928628572e-05, + "loss": 0.8729, + "step": 5586 + }, + { + "epoch": 2.532638259292838, + "grad_norm": 0.5581348661960093, + "learning_rate": 4.6046616278171594e-05, + "loss": 0.8895, + "step": 5587 + }, + { + "epoch": 2.5330915684496826, + "grad_norm": 0.5920086191151377, + "learning_rate": 4.603410266447109e-05, + "loss": 0.8749, + "step": 5588 + }, + { + "epoch": 2.533544877606528, + "grad_norm": 0.5826138046888624, + "learning_rate": 4.602158844643746e-05, + "loss": 0.8744, + "step": 5589 + }, + { + "epoch": 2.5339981867633727, + "grad_norm": 0.45993416443986374, + "learning_rate": 4.600907362532405e-05, + "loss": 0.8813, + "step": 5590 + }, + { + "epoch": 2.5344514959202176, + "grad_norm": 0.3810068820748897, + "learning_rate": 4.5996558202384256e-05, + "loss": 0.9037, + "step": 5591 + }, + { + "epoch": 2.5349048050770624, + "grad_norm": 0.3671513776779765, + "learning_rate": 4.598404217887153e-05, + "loss": 0.8569, + "step": 5592 + }, + { + "epoch": 2.5353581142339077, + "grad_norm": 0.37105173979275385, + "learning_rate": 4.597152555603939e-05, + "loss": 0.8828, + "step": 5593 + }, + { + "epoch": 2.5358114233907525, + "grad_norm": 0.46734933048562327, + "learning_rate": 4.595900833514144e-05, + "loss": 0.8655, + "step": 5594 + }, + { + "epoch": 2.5362647325475973, + "grad_norm": 0.40118874275083755, + "learning_rate": 4.594649051743126e-05, + "loss": 0.8682, + "step": 5595 + }, + { + "epoch": 2.5367180417044426, + "grad_norm": 0.3717522587017892, + "learning_rate": 4.59339721041626e-05, + "loss": 0.8844, + "step": 5596 + }, + { + "epoch": 2.5371713508612874, + "grad_norm": 0.47801243039621416, + "learning_rate": 4.592145309658918e-05, + "loss": 0.8754, + "step": 5597 + }, + { + "epoch": 2.5376246600181322, + "grad_norm": 0.41322232942674636, + "learning_rate": 4.590893349596484e-05, + "loss": 0.8619, + "step": 5598 + }, + { + "epoch": 2.538077969174977, + "grad_norm": 0.35177219801734044, + "learning_rate": 4.589641330354344e-05, + "loss": 0.8666, + "step": 5599 + }, + { + "epoch": 2.5385312783318223, + "grad_norm": 0.381099929915266, + "learning_rate": 4.5883892520578915e-05, + "loss": 0.868, + "step": 5600 + }, + { + "epoch": 2.538984587488667, + "grad_norm": 0.35251671318928646, + "learning_rate": 4.587137114832528e-05, + "loss": 0.89, + "step": 5601 + }, + { + "epoch": 2.539437896645512, + "grad_norm": 0.3398903589334157, + "learning_rate": 4.585884918803655e-05, + "loss": 0.8644, + "step": 5602 + }, + { + "epoch": 2.5398912058023573, + "grad_norm": 0.3072295729122076, + "learning_rate": 4.5846326640966866e-05, + "loss": 0.8834, + "step": 5603 + }, + { + "epoch": 2.540344514959202, + "grad_norm": 0.3011833399784324, + "learning_rate": 4.583380350837038e-05, + "loss": 0.8887, + "step": 5604 + }, + { + "epoch": 2.540797824116047, + "grad_norm": 0.3141466562286501, + "learning_rate": 4.582127979150133e-05, + "loss": 0.8502, + "step": 5605 + }, + { + "epoch": 2.541251133272892, + "grad_norm": 0.2972282853811104, + "learning_rate": 4.580875549161399e-05, + "loss": 0.8846, + "step": 5606 + }, + { + "epoch": 2.541704442429737, + "grad_norm": 0.364804168259479, + "learning_rate": 4.579623060996273e-05, + "loss": 0.8911, + "step": 5607 + }, + { + "epoch": 2.542157751586582, + "grad_norm": 0.4113200626744191, + "learning_rate": 4.578370514780194e-05, + "loss": 0.8729, + "step": 5608 + }, + { + "epoch": 2.542611060743427, + "grad_norm": 0.3805955860557126, + "learning_rate": 4.577117910638607e-05, + "loss": 0.8782, + "step": 5609 + }, + { + "epoch": 2.543064369900272, + "grad_norm": 0.39457328302743366, + "learning_rate": 4.575865248696965e-05, + "loss": 0.8656, + "step": 5610 + }, + { + "epoch": 2.543517679057117, + "grad_norm": 0.4767289801118861, + "learning_rate": 4.5746125290807276e-05, + "loss": 0.8753, + "step": 5611 + }, + { + "epoch": 2.543970988213962, + "grad_norm": 0.5276293177115423, + "learning_rate": 4.573359751915355e-05, + "loss": 0.856, + "step": 5612 + }, + { + "epoch": 2.544424297370807, + "grad_norm": 0.47140701296511245, + "learning_rate": 4.572106917326319e-05, + "loss": 0.8808, + "step": 5613 + }, + { + "epoch": 2.5448776065276517, + "grad_norm": 0.48960302561675095, + "learning_rate": 4.570854025439094e-05, + "loss": 0.8721, + "step": 5614 + }, + { + "epoch": 2.545330915684497, + "grad_norm": 0.5162647392474662, + "learning_rate": 4.5696010763791596e-05, + "loss": 0.8904, + "step": 5615 + }, + { + "epoch": 2.545784224841342, + "grad_norm": 0.48539862142564777, + "learning_rate": 4.568348070272005e-05, + "loss": 0.8742, + "step": 5616 + }, + { + "epoch": 2.5462375339981866, + "grad_norm": 0.40860945710613555, + "learning_rate": 4.5670950072431204e-05, + "loss": 0.8734, + "step": 5617 + }, + { + "epoch": 2.546690843155032, + "grad_norm": 0.41603439589758817, + "learning_rate": 4.5658418874180034e-05, + "loss": 0.8733, + "step": 5618 + }, + { + "epoch": 2.5471441523118767, + "grad_norm": 0.4848041001218365, + "learning_rate": 4.5645887109221586e-05, + "loss": 0.8807, + "step": 5619 + }, + { + "epoch": 2.5475974614687216, + "grad_norm": 0.5653492268288695, + "learning_rate": 4.5633354778810945e-05, + "loss": 0.909, + "step": 5620 + }, + { + "epoch": 2.548050770625567, + "grad_norm": 0.5163107369991693, + "learning_rate": 4.562082188420329e-05, + "loss": 0.8868, + "step": 5621 + }, + { + "epoch": 2.5485040797824117, + "grad_norm": 0.4540242841622267, + "learning_rate": 4.560828842665378e-05, + "loss": 0.8635, + "step": 5622 + }, + { + "epoch": 2.5489573889392565, + "grad_norm": 0.40369205314080436, + "learning_rate": 4.5595754407417714e-05, + "loss": 0.8711, + "step": 5623 + }, + { + "epoch": 2.5494106980961018, + "grad_norm": 0.44968396158293994, + "learning_rate": 4.558321982775039e-05, + "loss": 0.8646, + "step": 5624 + }, + { + "epoch": 2.5498640072529466, + "grad_norm": 0.4974738944636876, + "learning_rate": 4.557068468890719e-05, + "loss": 0.8808, + "step": 5625 + }, + { + "epoch": 2.5503173164097914, + "grad_norm": 0.43225671578602665, + "learning_rate": 4.5558148992143547e-05, + "loss": 0.8661, + "step": 5626 + }, + { + "epoch": 2.5507706255666367, + "grad_norm": 0.3225112335928232, + "learning_rate": 4.5545612738714956e-05, + "loss": 0.8783, + "step": 5627 + }, + { + "epoch": 2.5512239347234815, + "grad_norm": 0.37057779623501735, + "learning_rate": 4.553307592987694e-05, + "loss": 0.8805, + "step": 5628 + }, + { + "epoch": 2.5516772438803264, + "grad_norm": 0.46250146260412855, + "learning_rate": 4.5520538566885124e-05, + "loss": 0.8689, + "step": 5629 + }, + { + "epoch": 2.5521305530371716, + "grad_norm": 0.5120490177079945, + "learning_rate": 4.5508000650995146e-05, + "loss": 0.8784, + "step": 5630 + }, + { + "epoch": 2.5525838621940165, + "grad_norm": 0.5520212654032604, + "learning_rate": 4.5495462183462696e-05, + "loss": 0.858, + "step": 5631 + }, + { + "epoch": 2.5530371713508613, + "grad_norm": 0.5785633523185959, + "learning_rate": 4.548292316554357e-05, + "loss": 0.878, + "step": 5632 + }, + { + "epoch": 2.553490480507706, + "grad_norm": 0.4627684624490426, + "learning_rate": 4.5470383598493576e-05, + "loss": 0.8395, + "step": 5633 + }, + { + "epoch": 2.5539437896645514, + "grad_norm": 0.4075831183523659, + "learning_rate": 4.5457843483568583e-05, + "loss": 0.8659, + "step": 5634 + }, + { + "epoch": 2.554397098821396, + "grad_norm": 0.357728844887925, + "learning_rate": 4.5445302822024525e-05, + "loss": 0.8761, + "step": 5635 + }, + { + "epoch": 2.554850407978241, + "grad_norm": 0.30264772376768884, + "learning_rate": 4.543276161511739e-05, + "loss": 0.8862, + "step": 5636 + }, + { + "epoch": 2.555303717135086, + "grad_norm": 0.2812894123072208, + "learning_rate": 4.542021986410321e-05, + "loss": 0.8652, + "step": 5637 + }, + { + "epoch": 2.555757026291931, + "grad_norm": 0.4269714918375115, + "learning_rate": 4.540767757023808e-05, + "loss": 0.8848, + "step": 5638 + }, + { + "epoch": 2.556210335448776, + "grad_norm": 0.4141229983561375, + "learning_rate": 4.539513473477814e-05, + "loss": 0.8867, + "step": 5639 + }, + { + "epoch": 2.556663644605621, + "grad_norm": 0.42824640985435747, + "learning_rate": 4.538259135897962e-05, + "loss": 0.893, + "step": 5640 + }, + { + "epoch": 2.557116953762466, + "grad_norm": 0.40578639215359136, + "learning_rate": 4.537004744409874e-05, + "loss": 0.873, + "step": 5641 + }, + { + "epoch": 2.557570262919311, + "grad_norm": 0.4541550316067125, + "learning_rate": 4.535750299139183e-05, + "loss": 0.8731, + "step": 5642 + }, + { + "epoch": 2.5580235720761557, + "grad_norm": 0.483508808114005, + "learning_rate": 4.534495800211526e-05, + "loss": 0.8782, + "step": 5643 + }, + { + "epoch": 2.558476881233001, + "grad_norm": 0.4468550324494443, + "learning_rate": 4.533241247752542e-05, + "loss": 0.8839, + "step": 5644 + }, + { + "epoch": 2.558930190389846, + "grad_norm": 0.3755064520468057, + "learning_rate": 4.531986641887882e-05, + "loss": 0.9068, + "step": 5645 + }, + { + "epoch": 2.5593834995466906, + "grad_norm": 0.3476275036756545, + "learning_rate": 4.530731982743195e-05, + "loss": 0.8773, + "step": 5646 + }, + { + "epoch": 2.559836808703536, + "grad_norm": 0.34314917695409686, + "learning_rate": 4.52947727044414e-05, + "loss": 0.8912, + "step": 5647 + }, + { + "epoch": 2.5602901178603807, + "grad_norm": 0.31070834590444446, + "learning_rate": 4.5282225051163805e-05, + "loss": 0.8646, + "step": 5648 + }, + { + "epoch": 2.5607434270172256, + "grad_norm": 0.2929677273066068, + "learning_rate": 4.526967686885585e-05, + "loss": 0.8816, + "step": 5649 + }, + { + "epoch": 2.561196736174071, + "grad_norm": 0.2696127611922488, + "learning_rate": 4.525712815877427e-05, + "loss": 0.8675, + "step": 5650 + }, + { + "epoch": 2.5616500453309157, + "grad_norm": 0.30711522191219026, + "learning_rate": 4.5244578922175846e-05, + "loss": 0.8729, + "step": 5651 + }, + { + "epoch": 2.5621033544877605, + "grad_norm": 0.34052851852907684, + "learning_rate": 4.523202916031743e-05, + "loss": 0.8713, + "step": 5652 + }, + { + "epoch": 2.5625566636446058, + "grad_norm": 0.337249068867083, + "learning_rate": 4.521947887445592e-05, + "loss": 0.8844, + "step": 5653 + }, + { + "epoch": 2.5630099728014506, + "grad_norm": 0.33644447523440796, + "learning_rate": 4.5206928065848266e-05, + "loss": 0.8948, + "step": 5654 + }, + { + "epoch": 2.5634632819582954, + "grad_norm": 0.40899946330440345, + "learning_rate": 4.5194376735751456e-05, + "loss": 0.8699, + "step": 5655 + }, + { + "epoch": 2.5639165911151407, + "grad_norm": 0.44467500840305957, + "learning_rate": 4.5181824885422555e-05, + "loss": 0.8906, + "step": 5656 + }, + { + "epoch": 2.5643699002719855, + "grad_norm": 0.45367311666123666, + "learning_rate": 4.516927251611866e-05, + "loss": 0.8892, + "step": 5657 + }, + { + "epoch": 2.5648232094288304, + "grad_norm": 0.3927187337310338, + "learning_rate": 4.515671962909694e-05, + "loss": 0.8555, + "step": 5658 + }, + { + "epoch": 2.5652765185856756, + "grad_norm": 0.3640283912053851, + "learning_rate": 4.514416622561458e-05, + "loss": 0.8812, + "step": 5659 + }, + { + "epoch": 2.5657298277425205, + "grad_norm": 0.4836503718470184, + "learning_rate": 4.5131612306928866e-05, + "loss": 0.8474, + "step": 5660 + }, + { + "epoch": 2.5661831368993653, + "grad_norm": 0.7336233288049114, + "learning_rate": 4.511905787429709e-05, + "loss": 0.8856, + "step": 5661 + }, + { + "epoch": 2.5666364460562106, + "grad_norm": 0.9010918093876618, + "learning_rate": 4.510650292897662e-05, + "loss": 0.9127, + "step": 5662 + }, + { + "epoch": 2.5670897552130554, + "grad_norm": 0.770063032526249, + "learning_rate": 4.509394747222488e-05, + "loss": 0.8849, + "step": 5663 + }, + { + "epoch": 2.5675430643699, + "grad_norm": 0.8794326844389323, + "learning_rate": 4.508139150529933e-05, + "loss": 0.8833, + "step": 5664 + }, + { + "epoch": 2.5679963735267455, + "grad_norm": 0.8023770200580465, + "learning_rate": 4.5068835029457475e-05, + "loss": 0.8731, + "step": 5665 + }, + { + "epoch": 2.5684496826835903, + "grad_norm": 0.7334277932828152, + "learning_rate": 4.50562780459569e-05, + "loss": 0.8749, + "step": 5666 + }, + { + "epoch": 2.568902991840435, + "grad_norm": 0.6471371707056286, + "learning_rate": 4.5043720556055234e-05, + "loss": 0.8879, + "step": 5667 + }, + { + "epoch": 2.5693563009972804, + "grad_norm": 0.5773864789366067, + "learning_rate": 4.5031162561010114e-05, + "loss": 0.8742, + "step": 5668 + }, + { + "epoch": 2.5698096101541252, + "grad_norm": 0.48063133253172535, + "learning_rate": 4.5018604062079296e-05, + "loss": 0.9044, + "step": 5669 + }, + { + "epoch": 2.57026291931097, + "grad_norm": 0.3008489371816648, + "learning_rate": 4.5006045060520516e-05, + "loss": 0.8769, + "step": 5670 + }, + { + "epoch": 2.570716228467815, + "grad_norm": 0.3696237080773799, + "learning_rate": 4.499348555759161e-05, + "loss": 0.8763, + "step": 5671 + }, + { + "epoch": 2.57116953762466, + "grad_norm": 0.5180835321680161, + "learning_rate": 4.4980925554550455e-05, + "loss": 0.8778, + "step": 5672 + }, + { + "epoch": 2.571622846781505, + "grad_norm": 0.5110341100924983, + "learning_rate": 4.4968365052654975e-05, + "loss": 0.8701, + "step": 5673 + }, + { + "epoch": 2.57207615593835, + "grad_norm": 0.504458570988877, + "learning_rate": 4.495580405316312e-05, + "loss": 0.8904, + "step": 5674 + }, + { + "epoch": 2.572529465095195, + "grad_norm": 0.49763418682044586, + "learning_rate": 4.4943242557332936e-05, + "loss": 0.9019, + "step": 5675 + }, + { + "epoch": 2.57298277425204, + "grad_norm": 0.46809186019585464, + "learning_rate": 4.493068056642247e-05, + "loss": 0.8694, + "step": 5676 + }, + { + "epoch": 2.5734360834088847, + "grad_norm": 0.36876933227772707, + "learning_rate": 4.491811808168986e-05, + "loss": 0.8792, + "step": 5677 + }, + { + "epoch": 2.5738893925657296, + "grad_norm": 0.31554351649234047, + "learning_rate": 4.490555510439327e-05, + "loss": 0.8794, + "step": 5678 + }, + { + "epoch": 2.574342701722575, + "grad_norm": 0.2516346799971417, + "learning_rate": 4.489299163579092e-05, + "loss": 0.8914, + "step": 5679 + }, + { + "epoch": 2.5747960108794197, + "grad_norm": 0.3021373522114211, + "learning_rate": 4.488042767714107e-05, + "loss": 0.8881, + "step": 5680 + }, + { + "epoch": 2.5752493200362645, + "grad_norm": 0.3561738725728908, + "learning_rate": 4.486786322970205e-05, + "loss": 0.8791, + "step": 5681 + }, + { + "epoch": 2.5757026291931098, + "grad_norm": 0.40481507725258103, + "learning_rate": 4.4855298294732214e-05, + "loss": 0.8639, + "step": 5682 + }, + { + "epoch": 2.5761559383499546, + "grad_norm": 0.3978864515071203, + "learning_rate": 4.484273287348999e-05, + "loss": 0.8777, + "step": 5683 + }, + { + "epoch": 2.5766092475067994, + "grad_norm": 0.4046163241972828, + "learning_rate": 4.4830166967233845e-05, + "loss": 0.8778, + "step": 5684 + }, + { + "epoch": 2.5770625566636447, + "grad_norm": 0.41194815890983666, + "learning_rate": 4.4817600577222267e-05, + "loss": 0.8969, + "step": 5685 + }, + { + "epoch": 2.5775158658204895, + "grad_norm": 0.3662285400144278, + "learning_rate": 4.4805033704713854e-05, + "loss": 0.8543, + "step": 5686 + }, + { + "epoch": 2.5779691749773344, + "grad_norm": 0.38940231410012943, + "learning_rate": 4.4792466350967176e-05, + "loss": 0.8774, + "step": 5687 + }, + { + "epoch": 2.5784224841341796, + "grad_norm": 0.3778322286939644, + "learning_rate": 4.477989851724092e-05, + "loss": 0.8645, + "step": 5688 + }, + { + "epoch": 2.5788757932910245, + "grad_norm": 0.3984418388550153, + "learning_rate": 4.476733020479377e-05, + "loss": 0.8853, + "step": 5689 + }, + { + "epoch": 2.5793291024478693, + "grad_norm": 0.4558898268556811, + "learning_rate": 4.475476141488449e-05, + "loss": 0.8897, + "step": 5690 + }, + { + "epoch": 2.5797824116047146, + "grad_norm": 0.5124880709368048, + "learning_rate": 4.474219214877188e-05, + "loss": 0.8793, + "step": 5691 + }, + { + "epoch": 2.5802357207615594, + "grad_norm": 0.5037651816298417, + "learning_rate": 4.4729622407714796e-05, + "loss": 0.8994, + "step": 5692 + }, + { + "epoch": 2.580689029918404, + "grad_norm": 0.45409391699949103, + "learning_rate": 4.471705219297212e-05, + "loss": 0.8639, + "step": 5693 + }, + { + "epoch": 2.5811423390752495, + "grad_norm": 0.538336537054793, + "learning_rate": 4.4704481505802804e-05, + "loss": 0.8855, + "step": 5694 + }, + { + "epoch": 2.5815956482320943, + "grad_norm": 0.46643142130707904, + "learning_rate": 4.469191034746584e-05, + "loss": 0.8633, + "step": 5695 + }, + { + "epoch": 2.582048957388939, + "grad_norm": 0.3687408265800859, + "learning_rate": 4.467933871922027e-05, + "loss": 0.8835, + "step": 5696 + }, + { + "epoch": 2.5825022665457844, + "grad_norm": 0.33312167285425454, + "learning_rate": 4.466676662232518e-05, + "loss": 0.8926, + "step": 5697 + }, + { + "epoch": 2.5829555757026292, + "grad_norm": 0.38507409307182017, + "learning_rate": 4.465419405803968e-05, + "loss": 0.88, + "step": 5698 + }, + { + "epoch": 2.583408884859474, + "grad_norm": 0.31455941661455844, + "learning_rate": 4.464162102762298e-05, + "loss": 0.8788, + "step": 5699 + }, + { + "epoch": 2.5838621940163193, + "grad_norm": 0.2870655576933641, + "learning_rate": 4.4629047532334286e-05, + "loss": 0.8886, + "step": 5700 + }, + { + "epoch": 2.584315503173164, + "grad_norm": 0.38957549754911147, + "learning_rate": 4.461647357343289e-05, + "loss": 0.8814, + "step": 5701 + }, + { + "epoch": 2.584768812330009, + "grad_norm": 0.4472498700449439, + "learning_rate": 4.460389915217809e-05, + "loss": 0.8785, + "step": 5702 + }, + { + "epoch": 2.5852221214868543, + "grad_norm": 0.43267581734007454, + "learning_rate": 4.459132426982925e-05, + "loss": 0.9012, + "step": 5703 + }, + { + "epoch": 2.585675430643699, + "grad_norm": 0.35345552873867153, + "learning_rate": 4.4578748927645796e-05, + "loss": 0.8727, + "step": 5704 + }, + { + "epoch": 2.586128739800544, + "grad_norm": 0.33847157789730825, + "learning_rate": 4.456617312688718e-05, + "loss": 0.8849, + "step": 5705 + }, + { + "epoch": 2.586582048957389, + "grad_norm": 0.32726355554938663, + "learning_rate": 4.4553596868812904e-05, + "loss": 0.8971, + "step": 5706 + }, + { + "epoch": 2.587035358114234, + "grad_norm": 0.3188935570137418, + "learning_rate": 4.4541020154682535e-05, + "loss": 0.8634, + "step": 5707 + }, + { + "epoch": 2.587488667271079, + "grad_norm": 0.32320672831995667, + "learning_rate": 4.452844298575563e-05, + "loss": 0.8573, + "step": 5708 + }, + { + "epoch": 2.587941976427924, + "grad_norm": 0.3764928638710812, + "learning_rate": 4.4515865363291865e-05, + "loss": 0.8832, + "step": 5709 + }, + { + "epoch": 2.588395285584769, + "grad_norm": 0.36821670207276413, + "learning_rate": 4.4503287288550904e-05, + "loss": 0.8694, + "step": 5710 + }, + { + "epoch": 2.5888485947416138, + "grad_norm": 0.2917796649779044, + "learning_rate": 4.449070876279249e-05, + "loss": 0.8525, + "step": 5711 + }, + { + "epoch": 2.5893019038984586, + "grad_norm": 0.37878419367007904, + "learning_rate": 4.4478129787276395e-05, + "loss": 0.8866, + "step": 5712 + }, + { + "epoch": 2.589755213055304, + "grad_norm": 0.4591090524141822, + "learning_rate": 4.446555036326244e-05, + "loss": 0.8802, + "step": 5713 + }, + { + "epoch": 2.5902085222121487, + "grad_norm": 0.41488536371220175, + "learning_rate": 4.44529704920105e-05, + "loss": 0.8839, + "step": 5714 + }, + { + "epoch": 2.5906618313689935, + "grad_norm": 0.37512418539099823, + "learning_rate": 4.444039017478047e-05, + "loss": 0.8931, + "step": 5715 + }, + { + "epoch": 2.5911151405258384, + "grad_norm": 0.30735977714735246, + "learning_rate": 4.442780941283231e-05, + "loss": 0.8724, + "step": 5716 + }, + { + "epoch": 2.5915684496826836, + "grad_norm": 0.3102749851917463, + "learning_rate": 4.441522820742604e-05, + "loss": 0.8964, + "step": 5717 + }, + { + "epoch": 2.5920217588395285, + "grad_norm": 0.29494984594005885, + "learning_rate": 4.440264655982167e-05, + "loss": 0.8874, + "step": 5718 + }, + { + "epoch": 2.5924750679963733, + "grad_norm": 0.26271712901420585, + "learning_rate": 4.439006447127932e-05, + "loss": 0.8741, + "step": 5719 + }, + { + "epoch": 2.5929283771532186, + "grad_norm": 0.28180180956807316, + "learning_rate": 4.43774819430591e-05, + "loss": 0.8709, + "step": 5720 + }, + { + "epoch": 2.5933816863100634, + "grad_norm": 0.31388145979064624, + "learning_rate": 4.436489897642121e-05, + "loss": 0.8752, + "step": 5721 + }, + { + "epoch": 2.593834995466908, + "grad_norm": 0.30528708570407836, + "learning_rate": 4.435231557262585e-05, + "loss": 0.9016, + "step": 5722 + }, + { + "epoch": 2.5942883046237535, + "grad_norm": 0.28724868025605355, + "learning_rate": 4.43397317329333e-05, + "loss": 0.919, + "step": 5723 + }, + { + "epoch": 2.5947416137805983, + "grad_norm": 0.334406618582032, + "learning_rate": 4.432714745860386e-05, + "loss": 0.8649, + "step": 5724 + }, + { + "epoch": 2.595194922937443, + "grad_norm": 0.3212619367980426, + "learning_rate": 4.4314562750897886e-05, + "loss": 0.8947, + "step": 5725 + }, + { + "epoch": 2.5956482320942884, + "grad_norm": 0.27257397649182113, + "learning_rate": 4.430197761107578e-05, + "loss": 0.8742, + "step": 5726 + }, + { + "epoch": 2.5961015412511332, + "grad_norm": 0.2770993557847155, + "learning_rate": 4.4289392040397964e-05, + "loss": 0.8698, + "step": 5727 + }, + { + "epoch": 2.596554850407978, + "grad_norm": 0.2905269956290974, + "learning_rate": 4.4276806040124926e-05, + "loss": 0.8898, + "step": 5728 + }, + { + "epoch": 2.5970081595648233, + "grad_norm": 0.3669825202668953, + "learning_rate": 4.42642196115172e-05, + "loss": 0.8777, + "step": 5729 + }, + { + "epoch": 2.597461468721668, + "grad_norm": 0.38848500141120024, + "learning_rate": 4.425163275583535e-05, + "loss": 0.8626, + "step": 5730 + }, + { + "epoch": 2.597914777878513, + "grad_norm": 0.4068459386458721, + "learning_rate": 4.4239045474339986e-05, + "loss": 0.87, + "step": 5731 + }, + { + "epoch": 2.5983680870353583, + "grad_norm": 0.5323635823057011, + "learning_rate": 4.422645776829175e-05, + "loss": 0.8858, + "step": 5732 + }, + { + "epoch": 2.598821396192203, + "grad_norm": 0.6153081262938523, + "learning_rate": 4.421386963895135e-05, + "loss": 0.8776, + "step": 5733 + }, + { + "epoch": 2.599274705349048, + "grad_norm": 0.5952133421207828, + "learning_rate": 4.4201281087579515e-05, + "loss": 0.8816, + "step": 5734 + }, + { + "epoch": 2.599728014505893, + "grad_norm": 0.4418600172885792, + "learning_rate": 4.4188692115437036e-05, + "loss": 0.8749, + "step": 5735 + }, + { + "epoch": 2.600181323662738, + "grad_norm": 0.43257293809905917, + "learning_rate": 4.4176102723784745e-05, + "loss": 0.8893, + "step": 5736 + }, + { + "epoch": 2.600634632819583, + "grad_norm": 0.36171052635555245, + "learning_rate": 4.416351291388348e-05, + "loss": 0.8877, + "step": 5737 + }, + { + "epoch": 2.601087941976428, + "grad_norm": 0.28391258487368526, + "learning_rate": 4.4150922686994154e-05, + "loss": 0.874, + "step": 5738 + }, + { + "epoch": 2.601541251133273, + "grad_norm": 0.28673330381657725, + "learning_rate": 4.4138332044377735e-05, + "loss": 0.862, + "step": 5739 + }, + { + "epoch": 2.6019945602901178, + "grad_norm": 0.35798124729619574, + "learning_rate": 4.412574098729519e-05, + "loss": 0.8715, + "step": 5740 + }, + { + "epoch": 2.602447869446963, + "grad_norm": 0.3938417628184884, + "learning_rate": 4.411314951700757e-05, + "loss": 0.88, + "step": 5741 + }, + { + "epoch": 2.602901178603808, + "grad_norm": 0.43526997022609004, + "learning_rate": 4.410055763477592e-05, + "loss": 0.8432, + "step": 5742 + }, + { + "epoch": 2.6033544877606527, + "grad_norm": 0.4282135460555934, + "learning_rate": 4.408796534186139e-05, + "loss": 0.887, + "step": 5743 + }, + { + "epoch": 2.603807796917498, + "grad_norm": 0.39738628391823794, + "learning_rate": 4.407537263952511e-05, + "loss": 0.8687, + "step": 5744 + }, + { + "epoch": 2.604261106074343, + "grad_norm": 0.35405538378759904, + "learning_rate": 4.4062779529028295e-05, + "loss": 0.8758, + "step": 5745 + }, + { + "epoch": 2.6047144152311876, + "grad_norm": 0.29182031317671425, + "learning_rate": 4.405018601163216e-05, + "loss": 0.9, + "step": 5746 + }, + { + "epoch": 2.605167724388033, + "grad_norm": 0.2979642135646095, + "learning_rate": 4.4037592088597984e-05, + "loss": 0.8805, + "step": 5747 + }, + { + "epoch": 2.6056210335448777, + "grad_norm": 0.31888868520563146, + "learning_rate": 4.40249977611871e-05, + "loss": 0.9005, + "step": 5748 + }, + { + "epoch": 2.6060743427017226, + "grad_norm": 0.3764903222339265, + "learning_rate": 4.401240303066086e-05, + "loss": 0.8766, + "step": 5749 + }, + { + "epoch": 2.6065276518585674, + "grad_norm": 0.3960736626450706, + "learning_rate": 4.399980789828066e-05, + "loss": 0.8803, + "step": 5750 + }, + { + "epoch": 2.6069809610154127, + "grad_norm": 0.4226663923030406, + "learning_rate": 4.398721236530795e-05, + "loss": 0.8602, + "step": 5751 + }, + { + "epoch": 2.6074342701722575, + "grad_norm": 0.4798458895826599, + "learning_rate": 4.397461643300419e-05, + "loss": 0.8825, + "step": 5752 + }, + { + "epoch": 2.6078875793291023, + "grad_norm": 0.48902747945191605, + "learning_rate": 4.3962020102630935e-05, + "loss": 0.8669, + "step": 5753 + }, + { + "epoch": 2.608340888485947, + "grad_norm": 0.496817597490097, + "learning_rate": 4.3949423375449705e-05, + "loss": 0.8892, + "step": 5754 + }, + { + "epoch": 2.6087941976427924, + "grad_norm": 0.42924772479477696, + "learning_rate": 4.393682625272214e-05, + "loss": 0.8785, + "step": 5755 + }, + { + "epoch": 2.6092475067996372, + "grad_norm": 0.3372504705196389, + "learning_rate": 4.392422873570984e-05, + "loss": 0.8524, + "step": 5756 + }, + { + "epoch": 2.609700815956482, + "grad_norm": 0.30239142715779005, + "learning_rate": 4.3911630825674487e-05, + "loss": 0.8712, + "step": 5757 + }, + { + "epoch": 2.6101541251133273, + "grad_norm": 0.33542830465976714, + "learning_rate": 4.389903252387783e-05, + "loss": 0.8601, + "step": 5758 + }, + { + "epoch": 2.610607434270172, + "grad_norm": 0.3718406346042426, + "learning_rate": 4.388643383158161e-05, + "loss": 0.9045, + "step": 5759 + }, + { + "epoch": 2.611060743427017, + "grad_norm": 0.3762213750421688, + "learning_rate": 4.38738347500476e-05, + "loss": 0.8766, + "step": 5760 + }, + { + "epoch": 2.6115140525838623, + "grad_norm": 0.3912593223255763, + "learning_rate": 4.3861235280537666e-05, + "loss": 0.877, + "step": 5761 + }, + { + "epoch": 2.611967361740707, + "grad_norm": 0.3331138494581023, + "learning_rate": 4.384863542431367e-05, + "loss": 0.8739, + "step": 5762 + }, + { + "epoch": 2.612420670897552, + "grad_norm": 0.3110998285460133, + "learning_rate": 4.383603518263752e-05, + "loss": 0.8799, + "step": 5763 + }, + { + "epoch": 2.612873980054397, + "grad_norm": 0.25888158248503396, + "learning_rate": 4.382343455677118e-05, + "loss": 0.8805, + "step": 5764 + }, + { + "epoch": 2.613327289211242, + "grad_norm": 0.30273122226976784, + "learning_rate": 4.3810833547976636e-05, + "loss": 0.8785, + "step": 5765 + }, + { + "epoch": 2.613780598368087, + "grad_norm": 0.3138459749583387, + "learning_rate": 4.3798232157515905e-05, + "loss": 0.8732, + "step": 5766 + }, + { + "epoch": 2.614233907524932, + "grad_norm": 0.346331516778546, + "learning_rate": 4.378563038665106e-05, + "loss": 0.8774, + "step": 5767 + }, + { + "epoch": 2.614687216681777, + "grad_norm": 0.29181769816428743, + "learning_rate": 4.3773028236644206e-05, + "loss": 0.8717, + "step": 5768 + }, + { + "epoch": 2.6151405258386218, + "grad_norm": 0.2764648938893759, + "learning_rate": 4.3760425708757485e-05, + "loss": 0.8813, + "step": 5769 + }, + { + "epoch": 2.615593834995467, + "grad_norm": 0.3262987673122555, + "learning_rate": 4.374782280425308e-05, + "loss": 0.8558, + "step": 5770 + }, + { + "epoch": 2.616047144152312, + "grad_norm": 0.37043506128123976, + "learning_rate": 4.373521952439321e-05, + "loss": 0.8783, + "step": 5771 + }, + { + "epoch": 2.6165004533091567, + "grad_norm": 0.3240804085931681, + "learning_rate": 4.372261587044012e-05, + "loss": 0.8651, + "step": 5772 + }, + { + "epoch": 2.616953762466002, + "grad_norm": 0.24228842606808515, + "learning_rate": 4.3710011843656115e-05, + "loss": 0.8865, + "step": 5773 + }, + { + "epoch": 2.617407071622847, + "grad_norm": 0.3504785523642531, + "learning_rate": 4.369740744530353e-05, + "loss": 0.8455, + "step": 5774 + }, + { + "epoch": 2.6178603807796916, + "grad_norm": 0.41333872726366994, + "learning_rate": 4.3684802676644705e-05, + "loss": 0.889, + "step": 5775 + }, + { + "epoch": 2.618313689936537, + "grad_norm": 0.3853045508958026, + "learning_rate": 4.3672197538942064e-05, + "loss": 0.8999, + "step": 5776 + }, + { + "epoch": 2.6187669990933817, + "grad_norm": 0.33784276181636025, + "learning_rate": 4.3659592033458055e-05, + "loss": 0.8885, + "step": 5777 + }, + { + "epoch": 2.6192203082502266, + "grad_norm": 0.407903777800908, + "learning_rate": 4.3646986161455145e-05, + "loss": 0.8792, + "step": 5778 + }, + { + "epoch": 2.619673617407072, + "grad_norm": 0.2994795091131912, + "learning_rate": 4.3634379924195846e-05, + "loss": 0.8895, + "step": 5779 + }, + { + "epoch": 2.6201269265639167, + "grad_norm": 0.25911724616859616, + "learning_rate": 4.3621773322942725e-05, + "loss": 0.8901, + "step": 5780 + }, + { + "epoch": 2.6205802357207615, + "grad_norm": 0.24772031148080917, + "learning_rate": 4.3609166358958356e-05, + "loss": 0.8631, + "step": 5781 + }, + { + "epoch": 2.6210335448776068, + "grad_norm": 0.27102325218243517, + "learning_rate": 4.359655903350537e-05, + "loss": 0.8695, + "step": 5782 + }, + { + "epoch": 2.6214868540344516, + "grad_norm": 0.4073579407986265, + "learning_rate": 4.3583951347846434e-05, + "loss": 0.8674, + "step": 5783 + }, + { + "epoch": 2.6219401631912964, + "grad_norm": 0.46448883033579563, + "learning_rate": 4.357134330324424e-05, + "loss": 0.8697, + "step": 5784 + }, + { + "epoch": 2.6223934723481417, + "grad_norm": 0.398192848795283, + "learning_rate": 4.355873490096151e-05, + "loss": 0.852, + "step": 5785 + }, + { + "epoch": 2.6228467815049865, + "grad_norm": 0.35656782741186027, + "learning_rate": 4.3546126142261024e-05, + "loss": 0.8777, + "step": 5786 + }, + { + "epoch": 2.6233000906618313, + "grad_norm": 0.3096534602833606, + "learning_rate": 4.3533517028405574e-05, + "loss": 0.8818, + "step": 5787 + }, + { + "epoch": 2.6237533998186766, + "grad_norm": 0.2823762509696381, + "learning_rate": 4.352090756065802e-05, + "loss": 0.8824, + "step": 5788 + }, + { + "epoch": 2.6242067089755214, + "grad_norm": 0.28390716482579975, + "learning_rate": 4.350829774028122e-05, + "loss": 0.8856, + "step": 5789 + }, + { + "epoch": 2.6246600181323663, + "grad_norm": 0.33504735790868795, + "learning_rate": 4.349568756853809e-05, + "loss": 0.8814, + "step": 5790 + }, + { + "epoch": 2.625113327289211, + "grad_norm": 0.3173248374694659, + "learning_rate": 4.348307704669158e-05, + "loss": 0.8826, + "step": 5791 + }, + { + "epoch": 2.6255666364460564, + "grad_norm": 0.25683793637902996, + "learning_rate": 4.347046617600466e-05, + "loss": 0.8768, + "step": 5792 + }, + { + "epoch": 2.626019945602901, + "grad_norm": 0.2883396588216147, + "learning_rate": 4.345785495774037e-05, + "loss": 0.9054, + "step": 5793 + }, + { + "epoch": 2.626473254759746, + "grad_norm": 0.2967380206914427, + "learning_rate": 4.3445243393161726e-05, + "loss": 0.8776, + "step": 5794 + }, + { + "epoch": 2.626926563916591, + "grad_norm": 0.3195175924225291, + "learning_rate": 4.343263148353184e-05, + "loss": 0.8714, + "step": 5795 + }, + { + "epoch": 2.627379873073436, + "grad_norm": 0.2568592066592151, + "learning_rate": 4.342001923011382e-05, + "loss": 0.8872, + "step": 5796 + }, + { + "epoch": 2.627833182230281, + "grad_norm": 0.37513272465755443, + "learning_rate": 4.340740663417082e-05, + "loss": 0.8916, + "step": 5797 + }, + { + "epoch": 2.628286491387126, + "grad_norm": 0.46966032949976744, + "learning_rate": 4.339479369696603e-05, + "loss": 0.884, + "step": 5798 + }, + { + "epoch": 2.628739800543971, + "grad_norm": 0.533877454098109, + "learning_rate": 4.3382180419762686e-05, + "loss": 0.8733, + "step": 5799 + }, + { + "epoch": 2.629193109700816, + "grad_norm": 0.575954919668876, + "learning_rate": 4.336956680382402e-05, + "loss": 0.8541, + "step": 5800 + }, + { + "epoch": 2.6296464188576607, + "grad_norm": 0.5676695046754546, + "learning_rate": 4.335695285041334e-05, + "loss": 0.8503, + "step": 5801 + }, + { + "epoch": 2.630099728014506, + "grad_norm": 0.5052112294078548, + "learning_rate": 4.334433856079397e-05, + "loss": 0.8904, + "step": 5802 + }, + { + "epoch": 2.630553037171351, + "grad_norm": 0.3904114255702677, + "learning_rate": 4.333172393622927e-05, + "loss": 0.8829, + "step": 5803 + }, + { + "epoch": 2.6310063463281956, + "grad_norm": 0.24833833698031302, + "learning_rate": 4.331910897798261e-05, + "loss": 0.8613, + "step": 5804 + }, + { + "epoch": 2.631459655485041, + "grad_norm": 0.2509472236836431, + "learning_rate": 4.330649368731744e-05, + "loss": 0.8906, + "step": 5805 + }, + { + "epoch": 2.6319129646418857, + "grad_norm": 0.28336860891557325, + "learning_rate": 4.3293878065497204e-05, + "loss": 0.8807, + "step": 5806 + }, + { + "epoch": 2.6323662737987306, + "grad_norm": 0.2940543741282835, + "learning_rate": 4.328126211378541e-05, + "loss": 0.8982, + "step": 5807 + }, + { + "epoch": 2.632819582955576, + "grad_norm": 0.30973995361779977, + "learning_rate": 4.326864583344556e-05, + "loss": 0.9136, + "step": 5808 + }, + { + "epoch": 2.6332728921124207, + "grad_norm": 0.3730836125039869, + "learning_rate": 4.3256029225741226e-05, + "loss": 0.8757, + "step": 5809 + }, + { + "epoch": 2.6337262012692655, + "grad_norm": 0.5724804258326537, + "learning_rate": 4.3243412291936e-05, + "loss": 0.8931, + "step": 5810 + }, + { + "epoch": 2.6341795104261108, + "grad_norm": 0.6626960231900365, + "learning_rate": 4.323079503329349e-05, + "loss": 0.8765, + "step": 5811 + }, + { + "epoch": 2.6346328195829556, + "grad_norm": 0.6742700042781729, + "learning_rate": 4.321817745107739e-05, + "loss": 0.8764, + "step": 5812 + }, + { + "epoch": 2.6350861287398004, + "grad_norm": 0.6473070579099056, + "learning_rate": 4.3205559546551336e-05, + "loss": 0.8954, + "step": 5813 + }, + { + "epoch": 2.6355394378966457, + "grad_norm": 0.5648002729682227, + "learning_rate": 4.319294132097908e-05, + "loss": 0.8819, + "step": 5814 + }, + { + "epoch": 2.6359927470534905, + "grad_norm": 0.4221391860665615, + "learning_rate": 4.318032277562436e-05, + "loss": 0.8645, + "step": 5815 + }, + { + "epoch": 2.6364460562103353, + "grad_norm": 0.4906998769809952, + "learning_rate": 4.316770391175098e-05, + "loss": 0.8644, + "step": 5816 + }, + { + "epoch": 2.6368993653671806, + "grad_norm": 0.4792628542468413, + "learning_rate": 4.315508473062273e-05, + "loss": 0.865, + "step": 5817 + }, + { + "epoch": 2.6373526745240254, + "grad_norm": 0.2917770591894847, + "learning_rate": 4.314246523350347e-05, + "loss": 0.8911, + "step": 5818 + }, + { + "epoch": 2.6378059836808703, + "grad_norm": 0.4186093173669434, + "learning_rate": 4.312984542165709e-05, + "loss": 0.8679, + "step": 5819 + }, + { + "epoch": 2.6382592928377155, + "grad_norm": 0.4908113792354741, + "learning_rate": 4.31172252963475e-05, + "loss": 0.8751, + "step": 5820 + }, + { + "epoch": 2.6387126019945604, + "grad_norm": 0.47287779239461586, + "learning_rate": 4.310460485883861e-05, + "loss": 0.8811, + "step": 5821 + }, + { + "epoch": 2.639165911151405, + "grad_norm": 0.49025102062293935, + "learning_rate": 4.309198411039445e-05, + "loss": 0.8871, + "step": 5822 + }, + { + "epoch": 2.6396192203082505, + "grad_norm": 0.5392233828214541, + "learning_rate": 4.307936305227898e-05, + "loss": 0.8717, + "step": 5823 + }, + { + "epoch": 2.6400725294650953, + "grad_norm": 0.5147894169829097, + "learning_rate": 4.306674168575624e-05, + "loss": 0.8676, + "step": 5824 + }, + { + "epoch": 2.64052583862194, + "grad_norm": 0.4589206285368694, + "learning_rate": 4.305412001209032e-05, + "loss": 0.8799, + "step": 5825 + }, + { + "epoch": 2.6409791477787854, + "grad_norm": 0.3678671390527449, + "learning_rate": 4.304149803254531e-05, + "loss": 0.9039, + "step": 5826 + }, + { + "epoch": 2.6414324569356302, + "grad_norm": 0.3578955645728604, + "learning_rate": 4.302887574838533e-05, + "loss": 0.8642, + "step": 5827 + }, + { + "epoch": 2.641885766092475, + "grad_norm": 0.2972067330399155, + "learning_rate": 4.301625316087453e-05, + "loss": 0.8768, + "step": 5828 + }, + { + "epoch": 2.64233907524932, + "grad_norm": 0.2762311223534963, + "learning_rate": 4.300363027127712e-05, + "loss": 0.8812, + "step": 5829 + }, + { + "epoch": 2.642792384406165, + "grad_norm": 0.3240156154705935, + "learning_rate": 4.29910070808573e-05, + "loss": 0.9028, + "step": 5830 + }, + { + "epoch": 2.64324569356301, + "grad_norm": 0.2854057619356003, + "learning_rate": 4.2978383590879344e-05, + "loss": 0.886, + "step": 5831 + }, + { + "epoch": 2.643699002719855, + "grad_norm": 0.3097857186700373, + "learning_rate": 4.296575980260752e-05, + "loss": 0.889, + "step": 5832 + }, + { + "epoch": 2.6441523118766996, + "grad_norm": 0.31030925565946493, + "learning_rate": 4.295313571730613e-05, + "loss": 0.9016, + "step": 5833 + }, + { + "epoch": 2.644605621033545, + "grad_norm": 0.32696981343736176, + "learning_rate": 4.294051133623952e-05, + "loss": 0.8755, + "step": 5834 + }, + { + "epoch": 2.6450589301903897, + "grad_norm": 0.3638061627642944, + "learning_rate": 4.292788666067205e-05, + "loss": 0.8828, + "step": 5835 + }, + { + "epoch": 2.6455122393472346, + "grad_norm": 0.33660761357923985, + "learning_rate": 4.2915261691868125e-05, + "loss": 0.8679, + "step": 5836 + }, + { + "epoch": 2.64596554850408, + "grad_norm": 0.29122783392771034, + "learning_rate": 4.2902636431092184e-05, + "loss": 0.8673, + "step": 5837 + }, + { + "epoch": 2.6464188576609247, + "grad_norm": 0.3223695944492579, + "learning_rate": 4.2890010879608674e-05, + "loss": 0.8864, + "step": 5838 + }, + { + "epoch": 2.6468721668177695, + "grad_norm": 0.32203878766488553, + "learning_rate": 4.287738503868207e-05, + "loss": 0.899, + "step": 5839 + }, + { + "epoch": 2.6473254759746148, + "grad_norm": 0.3229040503272708, + "learning_rate": 4.286475890957691e-05, + "loss": 0.8774, + "step": 5840 + }, + { + "epoch": 2.6477787851314596, + "grad_norm": 0.39220606244012624, + "learning_rate": 4.2852132493557726e-05, + "loss": 0.9141, + "step": 5841 + }, + { + "epoch": 2.6482320942883044, + "grad_norm": 0.4317348318583508, + "learning_rate": 4.283950579188908e-05, + "loss": 0.881, + "step": 5842 + }, + { + "epoch": 2.6486854034451497, + "grad_norm": 0.5045309772359299, + "learning_rate": 4.28268788058356e-05, + "loss": 0.9088, + "step": 5843 + }, + { + "epoch": 2.6491387126019945, + "grad_norm": 0.45862834633589494, + "learning_rate": 4.281425153666188e-05, + "loss": 0.8779, + "step": 5844 + }, + { + "epoch": 2.6495920217588393, + "grad_norm": 0.460202085405437, + "learning_rate": 4.2801623985632606e-05, + "loss": 0.8935, + "step": 5845 + }, + { + "epoch": 2.6500453309156846, + "grad_norm": 0.4816735816274237, + "learning_rate": 4.278899615401246e-05, + "loss": 0.8914, + "step": 5846 + }, + { + "epoch": 2.6504986400725294, + "grad_norm": 0.5370030872549255, + "learning_rate": 4.277636804306615e-05, + "loss": 0.8755, + "step": 5847 + }, + { + "epoch": 2.6509519492293743, + "grad_norm": 0.5231839421615454, + "learning_rate": 4.276373965405842e-05, + "loss": 0.8609, + "step": 5848 + }, + { + "epoch": 2.6514052583862195, + "grad_norm": 0.432143436365859, + "learning_rate": 4.275111098825403e-05, + "loss": 0.8827, + "step": 5849 + }, + { + "epoch": 2.6518585675430644, + "grad_norm": 0.34828069710668724, + "learning_rate": 4.273848204691781e-05, + "loss": 0.8695, + "step": 5850 + }, + { + "epoch": 2.652311876699909, + "grad_norm": 0.3099611075170249, + "learning_rate": 4.2725852831314556e-05, + "loss": 0.8742, + "step": 5851 + }, + { + "epoch": 2.6527651858567545, + "grad_norm": 0.24953142850699706, + "learning_rate": 4.271322334270912e-05, + "loss": 0.8969, + "step": 5852 + }, + { + "epoch": 2.6532184950135993, + "grad_norm": 0.2713176958787826, + "learning_rate": 4.270059358236641e-05, + "loss": 0.8807, + "step": 5853 + }, + { + "epoch": 2.653671804170444, + "grad_norm": 0.30820993441507577, + "learning_rate": 4.2687963551551305e-05, + "loss": 0.88, + "step": 5854 + }, + { + "epoch": 2.6541251133272894, + "grad_norm": 0.30084053166345714, + "learning_rate": 4.267533325152875e-05, + "loss": 0.881, + "step": 5855 + }, + { + "epoch": 2.6545784224841342, + "grad_norm": 0.3985092286164874, + "learning_rate": 4.266270268356372e-05, + "loss": 0.8821, + "step": 5856 + }, + { + "epoch": 2.655031731640979, + "grad_norm": 0.4399328374528351, + "learning_rate": 4.265007184892117e-05, + "loss": 0.867, + "step": 5857 + }, + { + "epoch": 2.6554850407978243, + "grad_norm": 0.47195598164238695, + "learning_rate": 4.263744074886615e-05, + "loss": 0.8941, + "step": 5858 + }, + { + "epoch": 2.655938349954669, + "grad_norm": 0.44588914378453715, + "learning_rate": 4.262480938466368e-05, + "loss": 0.8638, + "step": 5859 + }, + { + "epoch": 2.656391659111514, + "grad_norm": 0.39348822395865923, + "learning_rate": 4.261217775757884e-05, + "loss": 0.8856, + "step": 5860 + }, + { + "epoch": 2.6568449682683593, + "grad_norm": 0.4390186223899267, + "learning_rate": 4.259954586887673e-05, + "loss": 0.8641, + "step": 5861 + }, + { + "epoch": 2.657298277425204, + "grad_norm": 0.48441472010317743, + "learning_rate": 4.258691371982244e-05, + "loss": 0.8889, + "step": 5862 + }, + { + "epoch": 2.657751586582049, + "grad_norm": 0.3694786839760859, + "learning_rate": 4.257428131168116e-05, + "loss": 0.8795, + "step": 5863 + }, + { + "epoch": 2.658204895738894, + "grad_norm": 0.31080582528072104, + "learning_rate": 4.2561648645718024e-05, + "loss": 0.8937, + "step": 5864 + }, + { + "epoch": 2.658658204895739, + "grad_norm": 0.35847847409543554, + "learning_rate": 4.254901572319825e-05, + "loss": 0.8771, + "step": 5865 + }, + { + "epoch": 2.659111514052584, + "grad_norm": 0.3584365745364404, + "learning_rate": 4.2536382545387065e-05, + "loss": 0.8614, + "step": 5866 + }, + { + "epoch": 2.659564823209429, + "grad_norm": 0.31289904643898675, + "learning_rate": 4.252374911354971e-05, + "loss": 0.8842, + "step": 5867 + }, + { + "epoch": 2.660018132366274, + "grad_norm": 0.3348199855557196, + "learning_rate": 4.2511115428951465e-05, + "loss": 0.8689, + "step": 5868 + }, + { + "epoch": 2.6604714415231188, + "grad_norm": 0.4257807747875663, + "learning_rate": 4.249848149285765e-05, + "loss": 0.8664, + "step": 5869 + }, + { + "epoch": 2.6609247506799636, + "grad_norm": 0.5494774627436787, + "learning_rate": 4.248584730653354e-05, + "loss": 0.8872, + "step": 5870 + }, + { + "epoch": 2.661378059836809, + "grad_norm": 0.49897662152774386, + "learning_rate": 4.2473212871244535e-05, + "loss": 0.8685, + "step": 5871 + }, + { + "epoch": 2.6618313689936537, + "grad_norm": 0.3995748871341462, + "learning_rate": 4.246057818825599e-05, + "loss": 0.872, + "step": 5872 + }, + { + "epoch": 2.6622846781504985, + "grad_norm": 0.37497146991864033, + "learning_rate": 4.24479432588333e-05, + "loss": 0.8728, + "step": 5873 + }, + { + "epoch": 2.6627379873073433, + "grad_norm": 0.3627815118404913, + "learning_rate": 4.24353080842419e-05, + "loss": 0.8703, + "step": 5874 + }, + { + "epoch": 2.6631912964641886, + "grad_norm": 0.2903122854041225, + "learning_rate": 4.242267266574724e-05, + "loss": 0.8742, + "step": 5875 + }, + { + "epoch": 2.6636446056210334, + "grad_norm": 0.28118488735585306, + "learning_rate": 4.2410037004614795e-05, + "loss": 0.8682, + "step": 5876 + }, + { + "epoch": 2.6640979147778783, + "grad_norm": 0.3278685788785086, + "learning_rate": 4.239740110211006e-05, + "loss": 0.8848, + "step": 5877 + }, + { + "epoch": 2.6645512239347235, + "grad_norm": 0.40951786156735076, + "learning_rate": 4.238476495949855e-05, + "loss": 0.8687, + "step": 5878 + }, + { + "epoch": 2.6650045330915684, + "grad_norm": 0.4480763902041912, + "learning_rate": 4.237212857804583e-05, + "loss": 0.8893, + "step": 5879 + }, + { + "epoch": 2.665457842248413, + "grad_norm": 0.40655713162019047, + "learning_rate": 4.235949195901747e-05, + "loss": 0.8809, + "step": 5880 + }, + { + "epoch": 2.6659111514052585, + "grad_norm": 0.35080164050978074, + "learning_rate": 4.234685510367904e-05, + "loss": 0.9031, + "step": 5881 + }, + { + "epoch": 2.6663644605621033, + "grad_norm": 0.2853865448787389, + "learning_rate": 4.233421801329618e-05, + "loss": 0.8632, + "step": 5882 + }, + { + "epoch": 2.666817769718948, + "grad_norm": 0.2772087140132203, + "learning_rate": 4.2321580689134524e-05, + "loss": 0.8826, + "step": 5883 + }, + { + "epoch": 2.6672710788757934, + "grad_norm": 0.2505661280512699, + "learning_rate": 4.2308943132459746e-05, + "loss": 0.8852, + "step": 5884 + }, + { + "epoch": 2.6677243880326382, + "grad_norm": 0.2737373709428012, + "learning_rate": 4.229630534453752e-05, + "loss": 0.8915, + "step": 5885 + }, + { + "epoch": 2.668177697189483, + "grad_norm": 0.41131915031885097, + "learning_rate": 4.228366732663356e-05, + "loss": 0.8815, + "step": 5886 + }, + { + "epoch": 2.6686310063463283, + "grad_norm": 0.3864017882923296, + "learning_rate": 4.227102908001362e-05, + "loss": 0.8627, + "step": 5887 + }, + { + "epoch": 2.669084315503173, + "grad_norm": 0.35560654019003946, + "learning_rate": 4.225839060594343e-05, + "loss": 0.8713, + "step": 5888 + }, + { + "epoch": 2.669537624660018, + "grad_norm": 0.3313132615323578, + "learning_rate": 4.22457519056888e-05, + "loss": 0.8644, + "step": 5889 + }, + { + "epoch": 2.6699909338168633, + "grad_norm": 0.2971964740293958, + "learning_rate": 4.223311298051551e-05, + "loss": 0.8772, + "step": 5890 + }, + { + "epoch": 2.670444242973708, + "grad_norm": 0.27110315038183075, + "learning_rate": 4.222047383168938e-05, + "loss": 0.896, + "step": 5891 + }, + { + "epoch": 2.670897552130553, + "grad_norm": 0.2782855330923926, + "learning_rate": 4.2207834460476274e-05, + "loss": 0.8777, + "step": 5892 + }, + { + "epoch": 2.671350861287398, + "grad_norm": 0.2705988207958185, + "learning_rate": 4.219519486814206e-05, + "loss": 0.8707, + "step": 5893 + }, + { + "epoch": 2.671804170444243, + "grad_norm": 0.27772960933678814, + "learning_rate": 4.218255505595263e-05, + "loss": 0.8871, + "step": 5894 + }, + { + "epoch": 2.672257479601088, + "grad_norm": 0.2883916436668078, + "learning_rate": 4.21699150251739e-05, + "loss": 0.8873, + "step": 5895 + }, + { + "epoch": 2.672710788757933, + "grad_norm": 0.2534342328299184, + "learning_rate": 4.215727477707179e-05, + "loss": 0.8874, + "step": 5896 + }, + { + "epoch": 2.673164097914778, + "grad_norm": 0.2872441968891712, + "learning_rate": 4.214463431291229e-05, + "loss": 0.8759, + "step": 5897 + }, + { + "epoch": 2.6736174070716228, + "grad_norm": 0.28140700323449885, + "learning_rate": 4.213199363396135e-05, + "loss": 0.8767, + "step": 5898 + }, + { + "epoch": 2.674070716228468, + "grad_norm": 0.3159937470273067, + "learning_rate": 4.2119352741484996e-05, + "loss": 0.8641, + "step": 5899 + }, + { + "epoch": 2.674524025385313, + "grad_norm": 0.2652752852045863, + "learning_rate": 4.210671163674922e-05, + "loss": 0.8856, + "step": 5900 + }, + { + "epoch": 2.6749773345421577, + "grad_norm": 0.2557341910114316, + "learning_rate": 4.2094070321020085e-05, + "loss": 0.8694, + "step": 5901 + }, + { + "epoch": 2.675430643699003, + "grad_norm": 0.30183663218291523, + "learning_rate": 4.208142879556366e-05, + "loss": 0.8642, + "step": 5902 + }, + { + "epoch": 2.675883952855848, + "grad_norm": 0.32393197314966465, + "learning_rate": 4.2068787061646005e-05, + "loss": 0.8655, + "step": 5903 + }, + { + "epoch": 2.6763372620126926, + "grad_norm": 0.40389116258897156, + "learning_rate": 4.205614512053326e-05, + "loss": 0.8794, + "step": 5904 + }, + { + "epoch": 2.676790571169538, + "grad_norm": 0.47618337815108375, + "learning_rate": 4.2043502973491534e-05, + "loss": 0.8582, + "step": 5905 + }, + { + "epoch": 2.6772438803263827, + "grad_norm": 0.447245020333917, + "learning_rate": 4.203086062178698e-05, + "loss": 0.8713, + "step": 5906 + }, + { + "epoch": 2.6776971894832275, + "grad_norm": 0.31269328069104824, + "learning_rate": 4.2018218066685765e-05, + "loss": 0.8722, + "step": 5907 + }, + { + "epoch": 2.6781504986400724, + "grad_norm": 0.27478430066642073, + "learning_rate": 4.200557530945409e-05, + "loss": 0.8899, + "step": 5908 + }, + { + "epoch": 2.6786038077969176, + "grad_norm": 0.3675617812163144, + "learning_rate": 4.1992932351358154e-05, + "loss": 0.8737, + "step": 5909 + }, + { + "epoch": 2.6790571169537625, + "grad_norm": 0.41433068559269365, + "learning_rate": 4.198028919366417e-05, + "loss": 0.8612, + "step": 5910 + }, + { + "epoch": 2.6795104261106073, + "grad_norm": 0.5532171218032518, + "learning_rate": 4.1967645837638414e-05, + "loss": 0.8961, + "step": 5911 + }, + { + "epoch": 2.679963735267452, + "grad_norm": 0.7172143926693726, + "learning_rate": 4.195500228454714e-05, + "loss": 0.8862, + "step": 5912 + }, + { + "epoch": 2.6804170444242974, + "grad_norm": 0.7512440389399889, + "learning_rate": 4.194235853565665e-05, + "loss": 0.8804, + "step": 5913 + }, + { + "epoch": 2.6808703535811422, + "grad_norm": 0.6048281808234872, + "learning_rate": 4.192971459223324e-05, + "loss": 0.8912, + "step": 5914 + }, + { + "epoch": 2.681323662737987, + "grad_norm": 0.5224829983366013, + "learning_rate": 4.191707045554323e-05, + "loss": 0.8685, + "step": 5915 + }, + { + "epoch": 2.6817769718948323, + "grad_norm": 0.38414674219716544, + "learning_rate": 4.190442612685299e-05, + "loss": 0.8738, + "step": 5916 + }, + { + "epoch": 2.682230281051677, + "grad_norm": 0.3055878846847181, + "learning_rate": 4.189178160742887e-05, + "loss": 0.8752, + "step": 5917 + }, + { + "epoch": 2.682683590208522, + "grad_norm": 0.26948131135023795, + "learning_rate": 4.187913689853728e-05, + "loss": 0.8804, + "step": 5918 + }, + { + "epoch": 2.6831368993653673, + "grad_norm": 0.2887312826728196, + "learning_rate": 4.1866492001444585e-05, + "loss": 0.8878, + "step": 5919 + }, + { + "epoch": 2.683590208522212, + "grad_norm": 0.33721317577252385, + "learning_rate": 4.1853846917417246e-05, + "loss": 0.8955, + "step": 5920 + }, + { + "epoch": 2.684043517679057, + "grad_norm": 0.29052083687732266, + "learning_rate": 4.184120164772168e-05, + "loss": 0.8722, + "step": 5921 + }, + { + "epoch": 2.684496826835902, + "grad_norm": 0.23176089412570006, + "learning_rate": 4.182855619362436e-05, + "loss": 0.8834, + "step": 5922 + }, + { + "epoch": 2.684950135992747, + "grad_norm": 0.24568353692676947, + "learning_rate": 4.181591055639177e-05, + "loss": 0.8735, + "step": 5923 + }, + { + "epoch": 2.685403445149592, + "grad_norm": 0.30723156833746207, + "learning_rate": 4.180326473729039e-05, + "loss": 0.877, + "step": 5924 + }, + { + "epoch": 2.685856754306437, + "grad_norm": 0.3349188424622035, + "learning_rate": 4.1790618737586765e-05, + "loss": 0.8863, + "step": 5925 + }, + { + "epoch": 2.686310063463282, + "grad_norm": 0.4129836390736259, + "learning_rate": 4.177797255854739e-05, + "loss": 0.9148, + "step": 5926 + }, + { + "epoch": 2.6867633726201268, + "grad_norm": 0.47872457947805086, + "learning_rate": 4.176532620143885e-05, + "loss": 0.8787, + "step": 5927 + }, + { + "epoch": 2.687216681776972, + "grad_norm": 0.5131729817093316, + "learning_rate": 4.1752679667527706e-05, + "loss": 0.8888, + "step": 5928 + }, + { + "epoch": 2.687669990933817, + "grad_norm": 0.5269444960836895, + "learning_rate": 4.174003295808053e-05, + "loss": 0.867, + "step": 5929 + }, + { + "epoch": 2.6881233000906617, + "grad_norm": 0.5673517450366239, + "learning_rate": 4.172738607436395e-05, + "loss": 0.8806, + "step": 5930 + }, + { + "epoch": 2.688576609247507, + "grad_norm": 0.5553724811748905, + "learning_rate": 4.171473901764457e-05, + "loss": 0.8812, + "step": 5931 + }, + { + "epoch": 2.689029918404352, + "grad_norm": 0.4685387232190202, + "learning_rate": 4.170209178918904e-05, + "loss": 0.8681, + "step": 5932 + }, + { + "epoch": 2.6894832275611966, + "grad_norm": 0.3479199295727952, + "learning_rate": 4.168944439026401e-05, + "loss": 0.8716, + "step": 5933 + }, + { + "epoch": 2.689936536718042, + "grad_norm": 0.34764051547335456, + "learning_rate": 4.1676796822136165e-05, + "loss": 0.8893, + "step": 5934 + }, + { + "epoch": 2.6903898458748867, + "grad_norm": 0.36165963866523737, + "learning_rate": 4.166414908607219e-05, + "loss": 0.881, + "step": 5935 + }, + { + "epoch": 2.6908431550317315, + "grad_norm": 0.3444503276447592, + "learning_rate": 4.16515011833388e-05, + "loss": 0.8806, + "step": 5936 + }, + { + "epoch": 2.691296464188577, + "grad_norm": 0.42011275331818937, + "learning_rate": 4.163885311520271e-05, + "loss": 0.8847, + "step": 5937 + }, + { + "epoch": 2.6917497733454216, + "grad_norm": 0.5179075973132685, + "learning_rate": 4.162620488293065e-05, + "loss": 0.8682, + "step": 5938 + }, + { + "epoch": 2.6922030825022665, + "grad_norm": 0.6096728560682791, + "learning_rate": 4.1613556487789405e-05, + "loss": 0.8736, + "step": 5939 + }, + { + "epoch": 2.6926563916591117, + "grad_norm": 0.638169523519956, + "learning_rate": 4.160090793104573e-05, + "loss": 0.8771, + "step": 5940 + }, + { + "epoch": 2.6931097008159566, + "grad_norm": 0.5399307165518655, + "learning_rate": 4.158825921396643e-05, + "loss": 0.8681, + "step": 5941 + }, + { + "epoch": 2.6935630099728014, + "grad_norm": 0.45818116211970467, + "learning_rate": 4.15756103378183e-05, + "loss": 0.8782, + "step": 5942 + }, + { + "epoch": 2.6940163191296467, + "grad_norm": 0.3286360404585114, + "learning_rate": 4.156296130386816e-05, + "loss": 0.8819, + "step": 5943 + }, + { + "epoch": 2.6944696282864915, + "grad_norm": 0.2623091319312336, + "learning_rate": 4.155031211338284e-05, + "loss": 0.8881, + "step": 5944 + }, + { + "epoch": 2.6949229374433363, + "grad_norm": 0.3173689820462578, + "learning_rate": 4.1537662767629216e-05, + "loss": 0.8764, + "step": 5945 + }, + { + "epoch": 2.695376246600181, + "grad_norm": 0.2691697585005706, + "learning_rate": 4.152501326787415e-05, + "loss": 0.8803, + "step": 5946 + }, + { + "epoch": 2.6958295557570264, + "grad_norm": 0.26309593713291995, + "learning_rate": 4.1512363615384525e-05, + "loss": 0.8793, + "step": 5947 + }, + { + "epoch": 2.6962828649138713, + "grad_norm": 0.33416935768759093, + "learning_rate": 4.149971381142724e-05, + "loss": 0.8716, + "step": 5948 + }, + { + "epoch": 2.696736174070716, + "grad_norm": 0.2962857214515543, + "learning_rate": 4.14870638572692e-05, + "loss": 0.8717, + "step": 5949 + }, + { + "epoch": 2.6971894832275614, + "grad_norm": 0.24444598035864407, + "learning_rate": 4.1474413754177346e-05, + "loss": 0.8854, + "step": 5950 + }, + { + "epoch": 2.697642792384406, + "grad_norm": 0.2726894495346894, + "learning_rate": 4.146176350341862e-05, + "loss": 0.8823, + "step": 5951 + }, + { + "epoch": 2.698096101541251, + "grad_norm": 0.32471685385591775, + "learning_rate": 4.144911310625998e-05, + "loss": 0.8692, + "step": 5952 + }, + { + "epoch": 2.698549410698096, + "grad_norm": 0.476841940910996, + "learning_rate": 4.143646256396841e-05, + "loss": 0.894, + "step": 5953 + }, + { + "epoch": 2.699002719854941, + "grad_norm": 0.6152700218366967, + "learning_rate": 4.142381187781091e-05, + "loss": 0.8818, + "step": 5954 + }, + { + "epoch": 2.699456029011786, + "grad_norm": 0.6533981828701466, + "learning_rate": 4.1411161049054444e-05, + "loss": 0.8861, + "step": 5955 + }, + { + "epoch": 2.6999093381686308, + "grad_norm": 0.6829375722085027, + "learning_rate": 4.139851007896606e-05, + "loss": 0.8772, + "step": 5956 + }, + { + "epoch": 2.700362647325476, + "grad_norm": 0.6868927596274963, + "learning_rate": 4.138585896881278e-05, + "loss": 0.8839, + "step": 5957 + }, + { + "epoch": 2.700815956482321, + "grad_norm": 0.6509294104531982, + "learning_rate": 4.137320771986165e-05, + "loss": 0.8915, + "step": 5958 + }, + { + "epoch": 2.7012692656391657, + "grad_norm": 0.4934299027781941, + "learning_rate": 4.1360556333379725e-05, + "loss": 0.8759, + "step": 5959 + }, + { + "epoch": 2.701722574796011, + "grad_norm": 0.32363131453532934, + "learning_rate": 4.134790481063409e-05, + "loss": 0.8818, + "step": 5960 + }, + { + "epoch": 2.702175883952856, + "grad_norm": 0.2594679099221728, + "learning_rate": 4.133525315289182e-05, + "loss": 0.8756, + "step": 5961 + }, + { + "epoch": 2.7026291931097006, + "grad_norm": 0.28707011101267516, + "learning_rate": 4.132260136142003e-05, + "loss": 0.8809, + "step": 5962 + }, + { + "epoch": 2.703082502266546, + "grad_norm": 0.35410866882957076, + "learning_rate": 4.130994943748583e-05, + "loss": 0.87, + "step": 5963 + }, + { + "epoch": 2.7035358114233907, + "grad_norm": 0.4025410458385838, + "learning_rate": 4.129729738235633e-05, + "loss": 0.8543, + "step": 5964 + }, + { + "epoch": 2.7039891205802356, + "grad_norm": 0.38736078702543836, + "learning_rate": 4.128464519729869e-05, + "loss": 0.8892, + "step": 5965 + }, + { + "epoch": 2.704442429737081, + "grad_norm": 0.31652773194065925, + "learning_rate": 4.127199288358007e-05, + "loss": 0.8865, + "step": 5966 + }, + { + "epoch": 2.7048957388939256, + "grad_norm": 0.26715926681557994, + "learning_rate": 4.125934044246762e-05, + "loss": 0.89, + "step": 5967 + }, + { + "epoch": 2.7053490480507705, + "grad_norm": 0.2805074199643201, + "learning_rate": 4.124668787522852e-05, + "loss": 0.8991, + "step": 5968 + }, + { + "epoch": 2.7058023572076157, + "grad_norm": 0.31435322620047124, + "learning_rate": 4.123403518312997e-05, + "loss": 0.8791, + "step": 5969 + }, + { + "epoch": 2.7062556663644606, + "grad_norm": 0.3597502667369465, + "learning_rate": 4.122138236743917e-05, + "loss": 0.8838, + "step": 5970 + }, + { + "epoch": 2.7067089755213054, + "grad_norm": 0.45379596259184224, + "learning_rate": 4.1208729429423326e-05, + "loss": 0.8971, + "step": 5971 + }, + { + "epoch": 2.7071622846781507, + "grad_norm": 0.48961459625523035, + "learning_rate": 4.1196076370349684e-05, + "loss": 0.905, + "step": 5972 + }, + { + "epoch": 2.7076155938349955, + "grad_norm": 0.4663307338804258, + "learning_rate": 4.118342319148547e-05, + "loss": 0.8869, + "step": 5973 + }, + { + "epoch": 2.7080689029918403, + "grad_norm": 0.4891803796315549, + "learning_rate": 4.117076989409795e-05, + "loss": 0.8784, + "step": 5974 + }, + { + "epoch": 2.7085222121486856, + "grad_norm": 0.3809550715580573, + "learning_rate": 4.1158116479454386e-05, + "loss": 0.8624, + "step": 5975 + }, + { + "epoch": 2.7089755213055304, + "grad_norm": 0.25808429784087245, + "learning_rate": 4.1145462948822066e-05, + "loss": 0.8761, + "step": 5976 + }, + { + "epoch": 2.7094288304623753, + "grad_norm": 0.3820993258854631, + "learning_rate": 4.1132809303468254e-05, + "loss": 0.8585, + "step": 5977 + }, + { + "epoch": 2.7098821396192205, + "grad_norm": 0.3992701627599507, + "learning_rate": 4.112015554466025e-05, + "loss": 0.8873, + "step": 5978 + }, + { + "epoch": 2.7103354487760654, + "grad_norm": 0.345360308262866, + "learning_rate": 4.110750167366539e-05, + "loss": 0.8852, + "step": 5979 + }, + { + "epoch": 2.71078875793291, + "grad_norm": 0.38122106773967557, + "learning_rate": 4.109484769175097e-05, + "loss": 0.8947, + "step": 5980 + }, + { + "epoch": 2.7112420670897555, + "grad_norm": 0.3548071392985407, + "learning_rate": 4.108219360018435e-05, + "loss": 0.8627, + "step": 5981 + }, + { + "epoch": 2.7116953762466003, + "grad_norm": 0.33432729980403825, + "learning_rate": 4.106953940023286e-05, + "loss": 0.8915, + "step": 5982 + }, + { + "epoch": 2.712148685403445, + "grad_norm": 0.39027894162399307, + "learning_rate": 4.105688509316385e-05, + "loss": 0.8854, + "step": 5983 + }, + { + "epoch": 2.7126019945602904, + "grad_norm": 0.2745598593168288, + "learning_rate": 4.104423068024469e-05, + "loss": 0.8748, + "step": 5984 + }, + { + "epoch": 2.713055303717135, + "grad_norm": 0.31098669289486836, + "learning_rate": 4.103157616274277e-05, + "loss": 0.8879, + "step": 5985 + }, + { + "epoch": 2.71350861287398, + "grad_norm": 0.4192648460998749, + "learning_rate": 4.101892154192546e-05, + "loss": 0.8861, + "step": 5986 + }, + { + "epoch": 2.713961922030825, + "grad_norm": 0.4747606692110344, + "learning_rate": 4.1006266819060146e-05, + "loss": 0.8672, + "step": 5987 + }, + { + "epoch": 2.71441523118767, + "grad_norm": 0.43552871832219203, + "learning_rate": 4.099361199541427e-05, + "loss": 0.875, + "step": 5988 + }, + { + "epoch": 2.714868540344515, + "grad_norm": 0.34312862352767354, + "learning_rate": 4.0980957072255226e-05, + "loss": 0.8827, + "step": 5989 + }, + { + "epoch": 2.71532184950136, + "grad_norm": 0.27340115828533307, + "learning_rate": 4.096830205085045e-05, + "loss": 0.8735, + "step": 5990 + }, + { + "epoch": 2.7157751586582046, + "grad_norm": 0.24372094309815806, + "learning_rate": 4.0955646932467384e-05, + "loss": 0.8752, + "step": 5991 + }, + { + "epoch": 2.71622846781505, + "grad_norm": 0.3132058902858666, + "learning_rate": 4.094299171837346e-05, + "loss": 0.8831, + "step": 5992 + }, + { + "epoch": 2.7166817769718947, + "grad_norm": 0.32190924848444563, + "learning_rate": 4.093033640983614e-05, + "loss": 0.8599, + "step": 5993 + }, + { + "epoch": 2.7171350861287396, + "grad_norm": 0.21864989817190175, + "learning_rate": 4.091768100812291e-05, + "loss": 0.8799, + "step": 5994 + }, + { + "epoch": 2.717588395285585, + "grad_norm": 0.24132713405824183, + "learning_rate": 4.090502551450122e-05, + "loss": 0.8933, + "step": 5995 + }, + { + "epoch": 2.7180417044424297, + "grad_norm": 0.22634909298938133, + "learning_rate": 4.089236993023857e-05, + "loss": 0.8826, + "step": 5996 + }, + { + "epoch": 2.7184950135992745, + "grad_norm": 0.2517882770631009, + "learning_rate": 4.087971425660245e-05, + "loss": 0.8742, + "step": 5997 + }, + { + "epoch": 2.7189483227561198, + "grad_norm": 0.2892341763220258, + "learning_rate": 4.086705849486036e-05, + "loss": 0.8638, + "step": 5998 + }, + { + "epoch": 2.7194016319129646, + "grad_norm": 0.2568272999185166, + "learning_rate": 4.085440264627981e-05, + "loss": 0.8852, + "step": 5999 + }, + { + "epoch": 2.7198549410698094, + "grad_norm": 0.25704669699039984, + "learning_rate": 4.0841746712128325e-05, + "loss": 0.8758, + "step": 6000 + }, + { + "epoch": 2.7203082502266547, + "grad_norm": 0.23431801145560985, + "learning_rate": 4.082909069367342e-05, + "loss": 0.8671, + "step": 6001 + }, + { + "epoch": 2.7207615593834995, + "grad_norm": 0.2566072181415202, + "learning_rate": 4.081643459218266e-05, + "loss": 0.8838, + "step": 6002 + }, + { + "epoch": 2.7212148685403443, + "grad_norm": 0.28922136454837877, + "learning_rate": 4.080377840892357e-05, + "loss": 0.8847, + "step": 6003 + }, + { + "epoch": 2.7216681776971896, + "grad_norm": 0.25096970948278324, + "learning_rate": 4.079112214516372e-05, + "loss": 0.8735, + "step": 6004 + }, + { + "epoch": 2.7221214868540344, + "grad_norm": 0.27433399423181626, + "learning_rate": 4.077846580217066e-05, + "loss": 0.8802, + "step": 6005 + }, + { + "epoch": 2.7225747960108793, + "grad_norm": 0.2764745961337998, + "learning_rate": 4.076580938121196e-05, + "loss": 0.8786, + "step": 6006 + }, + { + "epoch": 2.7230281051677245, + "grad_norm": 0.28063775117628026, + "learning_rate": 4.07531528835552e-05, + "loss": 0.8829, + "step": 6007 + }, + { + "epoch": 2.7234814143245694, + "grad_norm": 0.4024887780888554, + "learning_rate": 4.074049631046796e-05, + "loss": 0.8608, + "step": 6008 + }, + { + "epoch": 2.723934723481414, + "grad_norm": 0.3277654257683327, + "learning_rate": 4.072783966321784e-05, + "loss": 0.8555, + "step": 6009 + }, + { + "epoch": 2.7243880326382595, + "grad_norm": 0.25631089012735225, + "learning_rate": 4.071518294307245e-05, + "loss": 0.8707, + "step": 6010 + }, + { + "epoch": 2.7248413417951043, + "grad_norm": 0.305820441867732, + "learning_rate": 4.070252615129937e-05, + "loss": 0.8852, + "step": 6011 + }, + { + "epoch": 2.725294650951949, + "grad_norm": 0.27306488039542304, + "learning_rate": 4.068986928916624e-05, + "loss": 0.8959, + "step": 6012 + }, + { + "epoch": 2.7257479601087944, + "grad_norm": 0.3127151038755841, + "learning_rate": 4.067721235794067e-05, + "loss": 0.89, + "step": 6013 + }, + { + "epoch": 2.726201269265639, + "grad_norm": 0.4458181374608773, + "learning_rate": 4.06645553588903e-05, + "loss": 0.8758, + "step": 6014 + }, + { + "epoch": 2.726654578422484, + "grad_norm": 0.36521408448795223, + "learning_rate": 4.065189829328275e-05, + "loss": 0.8898, + "step": 6015 + }, + { + "epoch": 2.7271078875793293, + "grad_norm": 0.2565140149743659, + "learning_rate": 4.063924116238567e-05, + "loss": 0.8988, + "step": 6016 + }, + { + "epoch": 2.727561196736174, + "grad_norm": 0.2726653368215798, + "learning_rate": 4.0626583967466716e-05, + "loss": 0.8797, + "step": 6017 + }, + { + "epoch": 2.728014505893019, + "grad_norm": 0.2508763196205694, + "learning_rate": 4.0613926709793524e-05, + "loss": 0.9033, + "step": 6018 + }, + { + "epoch": 2.7284678150498642, + "grad_norm": 0.24413803509720206, + "learning_rate": 4.060126939063377e-05, + "loss": 0.9064, + "step": 6019 + }, + { + "epoch": 2.728921124206709, + "grad_norm": 0.2708408388107043, + "learning_rate": 4.058861201125512e-05, + "loss": 0.8621, + "step": 6020 + }, + { + "epoch": 2.729374433363554, + "grad_norm": 0.24831515134784013, + "learning_rate": 4.057595457292525e-05, + "loss": 0.8851, + "step": 6021 + }, + { + "epoch": 2.729827742520399, + "grad_norm": 0.2341869493615707, + "learning_rate": 4.056329707691184e-05, + "loss": 0.8736, + "step": 6022 + }, + { + "epoch": 2.730281051677244, + "grad_norm": 0.29332459711843206, + "learning_rate": 4.055063952448257e-05, + "loss": 0.8734, + "step": 6023 + }, + { + "epoch": 2.730734360834089, + "grad_norm": 0.2802576514084309, + "learning_rate": 4.053798191690514e-05, + "loss": 0.8754, + "step": 6024 + }, + { + "epoch": 2.7311876699909337, + "grad_norm": 0.2813585848215943, + "learning_rate": 4.052532425544723e-05, + "loss": 0.8742, + "step": 6025 + }, + { + "epoch": 2.731640979147779, + "grad_norm": 0.3996362210257967, + "learning_rate": 4.0512666541376564e-05, + "loss": 0.8695, + "step": 6026 + }, + { + "epoch": 2.7320942883046238, + "grad_norm": 0.2979513800643169, + "learning_rate": 4.050000877596082e-05, + "loss": 0.845, + "step": 6027 + }, + { + "epoch": 2.7325475974614686, + "grad_norm": 0.2583773268288633, + "learning_rate": 4.048735096046774e-05, + "loss": 0.898, + "step": 6028 + }, + { + "epoch": 2.733000906618314, + "grad_norm": 0.3281898511411466, + "learning_rate": 4.047469309616502e-05, + "loss": 0.8786, + "step": 6029 + }, + { + "epoch": 2.7334542157751587, + "grad_norm": 0.2738948139296202, + "learning_rate": 4.046203518432039e-05, + "loss": 0.8697, + "step": 6030 + }, + { + "epoch": 2.7339075249320035, + "grad_norm": 0.36819865471976304, + "learning_rate": 4.044937722620159e-05, + "loss": 0.8586, + "step": 6031 + }, + { + "epoch": 2.7343608340888483, + "grad_norm": 0.3974628634450035, + "learning_rate": 4.043671922307633e-05, + "loss": 0.8604, + "step": 6032 + }, + { + "epoch": 2.7348141432456936, + "grad_norm": 0.34691672445338917, + "learning_rate": 4.0424061176212375e-05, + "loss": 0.8686, + "step": 6033 + }, + { + "epoch": 2.7352674524025384, + "grad_norm": 0.3390832423332367, + "learning_rate": 4.041140308687743e-05, + "loss": 0.8901, + "step": 6034 + }, + { + "epoch": 2.7357207615593833, + "grad_norm": 0.39510017874340586, + "learning_rate": 4.039874495633925e-05, + "loss": 0.8821, + "step": 6035 + }, + { + "epoch": 2.7361740707162285, + "grad_norm": 0.44464415841892563, + "learning_rate": 4.0386086785865606e-05, + "loss": 0.879, + "step": 6036 + }, + { + "epoch": 2.7366273798730734, + "grad_norm": 0.47535428689016745, + "learning_rate": 4.037342857672422e-05, + "loss": 0.8847, + "step": 6037 + }, + { + "epoch": 2.737080689029918, + "grad_norm": 0.5004700745636044, + "learning_rate": 4.0360770330182875e-05, + "loss": 0.8657, + "step": 6038 + }, + { + "epoch": 2.7375339981867635, + "grad_norm": 0.484323973787452, + "learning_rate": 4.034811204750933e-05, + "loss": 0.8826, + "step": 6039 + }, + { + "epoch": 2.7379873073436083, + "grad_norm": 0.37977512205228586, + "learning_rate": 4.033545372997133e-05, + "loss": 0.8881, + "step": 6040 + }, + { + "epoch": 2.738440616500453, + "grad_norm": 0.30057796371148504, + "learning_rate": 4.032279537883665e-05, + "loss": 0.8701, + "step": 6041 + }, + { + "epoch": 2.7388939256572984, + "grad_norm": 0.3009474780901464, + "learning_rate": 4.031013699537307e-05, + "loss": 0.8728, + "step": 6042 + }, + { + "epoch": 2.739347234814143, + "grad_norm": 0.2933731649829465, + "learning_rate": 4.029747858084837e-05, + "loss": 0.8802, + "step": 6043 + }, + { + "epoch": 2.739800543970988, + "grad_norm": 0.32274355039059455, + "learning_rate": 4.02848201365303e-05, + "loss": 0.8799, + "step": 6044 + }, + { + "epoch": 2.7402538531278333, + "grad_norm": 0.424830162242949, + "learning_rate": 4.027216166368665e-05, + "loss": 0.8939, + "step": 6045 + }, + { + "epoch": 2.740707162284678, + "grad_norm": 0.5180981879833223, + "learning_rate": 4.025950316358522e-05, + "loss": 0.885, + "step": 6046 + }, + { + "epoch": 2.741160471441523, + "grad_norm": 0.5080829789765267, + "learning_rate": 4.0246844637493784e-05, + "loss": 0.8766, + "step": 6047 + }, + { + "epoch": 2.7416137805983682, + "grad_norm": 0.46048977613510356, + "learning_rate": 4.0234186086680124e-05, + "loss": 0.8691, + "step": 6048 + }, + { + "epoch": 2.742067089755213, + "grad_norm": 0.42617393048165036, + "learning_rate": 4.022152751241205e-05, + "loss": 0.8795, + "step": 6049 + }, + { + "epoch": 2.742520398912058, + "grad_norm": 0.36086397079256266, + "learning_rate": 4.020886891595733e-05, + "loss": 0.8739, + "step": 6050 + }, + { + "epoch": 2.742973708068903, + "grad_norm": 0.3199442651353173, + "learning_rate": 4.019621029858379e-05, + "loss": 0.8885, + "step": 6051 + }, + { + "epoch": 2.743427017225748, + "grad_norm": 0.3862264237758552, + "learning_rate": 4.018355166155922e-05, + "loss": 0.8832, + "step": 6052 + }, + { + "epoch": 2.743880326382593, + "grad_norm": 0.3178923741147127, + "learning_rate": 4.0170893006151397e-05, + "loss": 0.8665, + "step": 6053 + }, + { + "epoch": 2.744333635539438, + "grad_norm": 0.3691506307230035, + "learning_rate": 4.015823433362815e-05, + "loss": 0.8869, + "step": 6054 + }, + { + "epoch": 2.744786944696283, + "grad_norm": 0.4683025807723327, + "learning_rate": 4.0145575645257254e-05, + "loss": 0.8645, + "step": 6055 + }, + { + "epoch": 2.7452402538531278, + "grad_norm": 0.4249543368301541, + "learning_rate": 4.0132916942306536e-05, + "loss": 0.8819, + "step": 6056 + }, + { + "epoch": 2.745693563009973, + "grad_norm": 0.34910811868057107, + "learning_rate": 4.0120258226043794e-05, + "loss": 0.8846, + "step": 6057 + }, + { + "epoch": 2.746146872166818, + "grad_norm": 0.29174255691405115, + "learning_rate": 4.0107599497736834e-05, + "loss": 0.8967, + "step": 6058 + }, + { + "epoch": 2.7466001813236627, + "grad_norm": 0.33195021402509256, + "learning_rate": 4.0094940758653466e-05, + "loss": 0.8881, + "step": 6059 + }, + { + "epoch": 2.747053490480508, + "grad_norm": 0.28131354211601683, + "learning_rate": 4.008228201006151e-05, + "loss": 0.8768, + "step": 6060 + }, + { + "epoch": 2.747506799637353, + "grad_norm": 0.2814999027192777, + "learning_rate": 4.006962325322876e-05, + "loss": 0.8808, + "step": 6061 + }, + { + "epoch": 2.7479601087941976, + "grad_norm": 0.26901645249685946, + "learning_rate": 4.005696448942305e-05, + "loss": 0.8859, + "step": 6062 + }, + { + "epoch": 2.748413417951043, + "grad_norm": 0.28180261488438185, + "learning_rate": 4.0044305719912166e-05, + "loss": 0.8804, + "step": 6063 + }, + { + "epoch": 2.7488667271078877, + "grad_norm": 0.29066273983994473, + "learning_rate": 4.003164694596394e-05, + "loss": 0.8687, + "step": 6064 + }, + { + "epoch": 2.7493200362647325, + "grad_norm": 0.3350927118538583, + "learning_rate": 4.001898816884618e-05, + "loss": 0.9013, + "step": 6065 + }, + { + "epoch": 2.7497733454215774, + "grad_norm": 0.32906964042047576, + "learning_rate": 4.000632938982669e-05, + "loss": 0.8744, + "step": 6066 + }, + { + "epoch": 2.7502266545784226, + "grad_norm": 0.30377577419761154, + "learning_rate": 3.999367061017332e-05, + "loss": 0.8796, + "step": 6067 + }, + { + "epoch": 2.7506799637352675, + "grad_norm": 0.26234732499845886, + "learning_rate": 3.9981011831153836e-05, + "loss": 0.878, + "step": 6068 + }, + { + "epoch": 2.7511332728921123, + "grad_norm": 0.27312636558240816, + "learning_rate": 3.996835305403608e-05, + "loss": 0.8655, + "step": 6069 + }, + { + "epoch": 2.751586582048957, + "grad_norm": 0.3158322359376824, + "learning_rate": 3.995569428008785e-05, + "loss": 0.8629, + "step": 6070 + }, + { + "epoch": 2.7520398912058024, + "grad_norm": 0.2814189862250625, + "learning_rate": 3.994303551057697e-05, + "loss": 0.891, + "step": 6071 + }, + { + "epoch": 2.752493200362647, + "grad_norm": 0.34044537512875755, + "learning_rate": 3.993037674677125e-05, + "loss": 0.8916, + "step": 6072 + }, + { + "epoch": 2.752946509519492, + "grad_norm": 0.48858575647914193, + "learning_rate": 3.9917717989938504e-05, + "loss": 0.8946, + "step": 6073 + }, + { + "epoch": 2.7533998186763373, + "grad_norm": 0.46308291304217525, + "learning_rate": 3.990505924134655e-05, + "loss": 0.884, + "step": 6074 + }, + { + "epoch": 2.753853127833182, + "grad_norm": 0.4208294606961422, + "learning_rate": 3.989240050226318e-05, + "loss": 0.8872, + "step": 6075 + }, + { + "epoch": 2.754306436990027, + "grad_norm": 0.31893873078045853, + "learning_rate": 3.9879741773956226e-05, + "loss": 0.8578, + "step": 6076 + }, + { + "epoch": 2.7547597461468722, + "grad_norm": 0.2711476877933372, + "learning_rate": 3.9867083057693484e-05, + "loss": 0.8823, + "step": 6077 + }, + { + "epoch": 2.755213055303717, + "grad_norm": 0.286007341675459, + "learning_rate": 3.985442435474275e-05, + "loss": 0.8895, + "step": 6078 + }, + { + "epoch": 2.755666364460562, + "grad_norm": 0.27013961638190637, + "learning_rate": 3.9841765666371864e-05, + "loss": 0.8831, + "step": 6079 + }, + { + "epoch": 2.756119673617407, + "grad_norm": 0.28091698928215575, + "learning_rate": 3.982910699384862e-05, + "loss": 0.8797, + "step": 6080 + }, + { + "epoch": 2.756572982774252, + "grad_norm": 0.3184092291863173, + "learning_rate": 3.9816448338440795e-05, + "loss": 0.89, + "step": 6081 + }, + { + "epoch": 2.757026291931097, + "grad_norm": 0.43354295808174187, + "learning_rate": 3.980378970141621e-05, + "loss": 0.8969, + "step": 6082 + }, + { + "epoch": 2.757479601087942, + "grad_norm": 0.4582094356694416, + "learning_rate": 3.979113108404266e-05, + "loss": 0.885, + "step": 6083 + }, + { + "epoch": 2.757932910244787, + "grad_norm": 0.4800149996733413, + "learning_rate": 3.977847248758795e-05, + "loss": 0.8679, + "step": 6084 + }, + { + "epoch": 2.7583862194016318, + "grad_norm": 0.5185873981838997, + "learning_rate": 3.9765813913319876e-05, + "loss": 0.8794, + "step": 6085 + }, + { + "epoch": 2.758839528558477, + "grad_norm": 0.5449660803330681, + "learning_rate": 3.9753155362506236e-05, + "loss": 0.8832, + "step": 6086 + }, + { + "epoch": 2.759292837715322, + "grad_norm": 1.61244897869741, + "learning_rate": 3.97404968364148e-05, + "loss": 0.8814, + "step": 6087 + }, + { + "epoch": 2.7597461468721667, + "grad_norm": 0.4149699311590512, + "learning_rate": 3.972783833631337e-05, + "loss": 0.8966, + "step": 6088 + }, + { + "epoch": 2.760199456029012, + "grad_norm": 0.9749459304727475, + "learning_rate": 3.971517986346972e-05, + "loss": 0.8984, + "step": 6089 + }, + { + "epoch": 2.760652765185857, + "grad_norm": 1.2112411355866706, + "learning_rate": 3.970252141915166e-05, + "loss": 0.8967, + "step": 6090 + }, + { + "epoch": 2.7611060743427016, + "grad_norm": 0.6375532886505565, + "learning_rate": 3.968986300462694e-05, + "loss": 0.8974, + "step": 6091 + }, + { + "epoch": 2.761559383499547, + "grad_norm": 0.48718199656381356, + "learning_rate": 3.9677204621163356e-05, + "loss": 0.8917, + "step": 6092 + }, + { + "epoch": 2.7620126926563917, + "grad_norm": 0.7600020928208014, + "learning_rate": 3.966454627002868e-05, + "loss": 0.8744, + "step": 6093 + }, + { + "epoch": 2.7624660018132365, + "grad_norm": 0.9781315396251884, + "learning_rate": 3.965188795249068e-05, + "loss": 0.8725, + "step": 6094 + }, + { + "epoch": 2.762919310970082, + "grad_norm": 1.0689970495876113, + "learning_rate": 3.963922966981713e-05, + "loss": 0.8886, + "step": 6095 + }, + { + "epoch": 2.7633726201269266, + "grad_norm": 0.7023912189087739, + "learning_rate": 3.9626571423275786e-05, + "loss": 0.8465, + "step": 6096 + }, + { + "epoch": 2.7638259292837715, + "grad_norm": 0.3962577348791257, + "learning_rate": 3.961391321413441e-05, + "loss": 0.8748, + "step": 6097 + }, + { + "epoch": 2.7642792384406167, + "grad_norm": 0.5829269724998474, + "learning_rate": 3.9601255043660754e-05, + "loss": 0.8666, + "step": 6098 + }, + { + "epoch": 2.7647325475974616, + "grad_norm": 0.8311273728198666, + "learning_rate": 3.958859691312259e-05, + "loss": 0.8663, + "step": 6099 + }, + { + "epoch": 2.7651858567543064, + "grad_norm": 0.8797845088640909, + "learning_rate": 3.9575938823787645e-05, + "loss": 0.8871, + "step": 6100 + }, + { + "epoch": 2.7656391659111517, + "grad_norm": 0.789216204999004, + "learning_rate": 3.956328077692367e-05, + "loss": 0.8889, + "step": 6101 + }, + { + "epoch": 2.7660924750679965, + "grad_norm": 0.6502077164978416, + "learning_rate": 3.955062277379842e-05, + "loss": 0.8974, + "step": 6102 + }, + { + "epoch": 2.7665457842248413, + "grad_norm": 0.5984201887237827, + "learning_rate": 3.9537964815679604e-05, + "loss": 0.8791, + "step": 6103 + }, + { + "epoch": 2.766999093381686, + "grad_norm": 0.5387715994898872, + "learning_rate": 3.952530690383498e-05, + "loss": 0.8736, + "step": 6104 + }, + { + "epoch": 2.7674524025385314, + "grad_norm": 0.4861922022254606, + "learning_rate": 3.951264903953228e-05, + "loss": 0.873, + "step": 6105 + }, + { + "epoch": 2.7679057116953762, + "grad_norm": 0.4148610397037762, + "learning_rate": 3.94999912240392e-05, + "loss": 0.8822, + "step": 6106 + }, + { + "epoch": 2.768359020852221, + "grad_norm": 0.38139518568773456, + "learning_rate": 3.948733345862346e-05, + "loss": 0.8744, + "step": 6107 + }, + { + "epoch": 2.7688123300090663, + "grad_norm": 0.4898444247025592, + "learning_rate": 3.9474675744552784e-05, + "loss": 0.8879, + "step": 6108 + }, + { + "epoch": 2.769265639165911, + "grad_norm": 0.4827180763493824, + "learning_rate": 3.946201808309487e-05, + "loss": 0.8696, + "step": 6109 + }, + { + "epoch": 2.769718948322756, + "grad_norm": 0.31831927196581833, + "learning_rate": 3.944936047551744e-05, + "loss": 0.879, + "step": 6110 + }, + { + "epoch": 2.770172257479601, + "grad_norm": 0.3605813220292502, + "learning_rate": 3.943670292308818e-05, + "loss": 0.8735, + "step": 6111 + }, + { + "epoch": 2.770625566636446, + "grad_norm": 0.45535451402338695, + "learning_rate": 3.942404542707476e-05, + "loss": 0.8862, + "step": 6112 + }, + { + "epoch": 2.771078875793291, + "grad_norm": 0.48364775537798727, + "learning_rate": 3.9411387988744884e-05, + "loss": 0.8919, + "step": 6113 + }, + { + "epoch": 2.7715321849501358, + "grad_norm": 0.3915572350916613, + "learning_rate": 3.939873060936624e-05, + "loss": 0.9184, + "step": 6114 + }, + { + "epoch": 2.771985494106981, + "grad_norm": 0.2546997901684052, + "learning_rate": 3.938607329020649e-05, + "loss": 0.8799, + "step": 6115 + }, + { + "epoch": 2.772438803263826, + "grad_norm": 0.35701423346277816, + "learning_rate": 3.9373416032533304e-05, + "loss": 0.8822, + "step": 6116 + }, + { + "epoch": 2.7728921124206707, + "grad_norm": 0.459885609354512, + "learning_rate": 3.936075883761434e-05, + "loss": 0.8649, + "step": 6117 + }, + { + "epoch": 2.773345421577516, + "grad_norm": 0.3895671268739787, + "learning_rate": 3.9348101706717264e-05, + "loss": 0.8765, + "step": 6118 + }, + { + "epoch": 2.773798730734361, + "grad_norm": 0.29711750235080303, + "learning_rate": 3.933544464110971e-05, + "loss": 0.8805, + "step": 6119 + }, + { + "epoch": 2.7742520398912056, + "grad_norm": 0.33102378608482685, + "learning_rate": 3.932278764205933e-05, + "loss": 0.8935, + "step": 6120 + }, + { + "epoch": 2.774705349048051, + "grad_norm": 0.387540393145134, + "learning_rate": 3.9310130710833766e-05, + "loss": 0.8839, + "step": 6121 + }, + { + "epoch": 2.7751586582048957, + "grad_norm": 0.31192713768429314, + "learning_rate": 3.9297473848700634e-05, + "loss": 0.8822, + "step": 6122 + }, + { + "epoch": 2.7756119673617405, + "grad_norm": 0.2943472179179411, + "learning_rate": 3.928481705692756e-05, + "loss": 0.8682, + "step": 6123 + }, + { + "epoch": 2.776065276518586, + "grad_norm": 0.30992521161847514, + "learning_rate": 3.927216033678216e-05, + "loss": 0.8698, + "step": 6124 + }, + { + "epoch": 2.7765185856754306, + "grad_norm": 0.312090975400174, + "learning_rate": 3.925950368953205e-05, + "loss": 0.902, + "step": 6125 + }, + { + "epoch": 2.7769718948322755, + "grad_norm": 0.36026862712502605, + "learning_rate": 3.9246847116444825e-05, + "loss": 0.8987, + "step": 6126 + }, + { + "epoch": 2.7774252039891207, + "grad_norm": 0.28153717326456584, + "learning_rate": 3.923419061878806e-05, + "loss": 0.8864, + "step": 6127 + }, + { + "epoch": 2.7778785131459656, + "grad_norm": 0.246974994925805, + "learning_rate": 3.922153419782936e-05, + "loss": 0.8801, + "step": 6128 + }, + { + "epoch": 2.7783318223028104, + "grad_norm": 0.41156265462301106, + "learning_rate": 3.920887785483629e-05, + "loss": 0.8659, + "step": 6129 + }, + { + "epoch": 2.7787851314596557, + "grad_norm": 0.2938772884923773, + "learning_rate": 3.9196221591076436e-05, + "loss": 0.868, + "step": 6130 + }, + { + "epoch": 2.7792384406165005, + "grad_norm": 0.24135577232290767, + "learning_rate": 3.918356540781735e-05, + "loss": 0.8866, + "step": 6131 + }, + { + "epoch": 2.7796917497733453, + "grad_norm": 0.3151626258550177, + "learning_rate": 3.9170909306326585e-05, + "loss": 0.8802, + "step": 6132 + }, + { + "epoch": 2.7801450589301906, + "grad_norm": 0.2714048701529538, + "learning_rate": 3.9158253287871695e-05, + "loss": 0.8665, + "step": 6133 + }, + { + "epoch": 2.7805983680870354, + "grad_norm": 0.28143568582548667, + "learning_rate": 3.9145597353720205e-05, + "loss": 0.8781, + "step": 6134 + }, + { + "epoch": 2.7810516772438802, + "grad_norm": 0.26700452760329224, + "learning_rate": 3.9132941505139657e-05, + "loss": 0.8828, + "step": 6135 + }, + { + "epoch": 2.7815049864007255, + "grad_norm": 0.29366619317950715, + "learning_rate": 3.9120285743397556e-05, + "loss": 0.8823, + "step": 6136 + }, + { + "epoch": 2.7819582955575703, + "grad_norm": 0.3681690020464256, + "learning_rate": 3.9107630069761444e-05, + "loss": 0.886, + "step": 6137 + }, + { + "epoch": 2.782411604714415, + "grad_norm": 0.335748527886002, + "learning_rate": 3.909497448549879e-05, + "loss": 0.885, + "step": 6138 + }, + { + "epoch": 2.7828649138712604, + "grad_norm": 0.27099686766519443, + "learning_rate": 3.90823189918771e-05, + "loss": 0.896, + "step": 6139 + }, + { + "epoch": 2.7833182230281053, + "grad_norm": 0.2938789639924382, + "learning_rate": 3.906966359016386e-05, + "loss": 0.8829, + "step": 6140 + }, + { + "epoch": 2.78377153218495, + "grad_norm": 0.3390834258039275, + "learning_rate": 3.9057008281626547e-05, + "loss": 0.883, + "step": 6141 + }, + { + "epoch": 2.7842248413417954, + "grad_norm": 0.410045986622981, + "learning_rate": 3.904435306753262e-05, + "loss": 0.8946, + "step": 6142 + }, + { + "epoch": 2.78467815049864, + "grad_norm": 0.4478655141436937, + "learning_rate": 3.903169794914955e-05, + "loss": 0.8764, + "step": 6143 + }, + { + "epoch": 2.785131459655485, + "grad_norm": 0.40303359472285216, + "learning_rate": 3.9019042927744794e-05, + "loss": 0.8894, + "step": 6144 + }, + { + "epoch": 2.78558476881233, + "grad_norm": 0.2771535642765404, + "learning_rate": 3.900638800458575e-05, + "loss": 0.8769, + "step": 6145 + }, + { + "epoch": 2.786038077969175, + "grad_norm": 0.2806444203616299, + "learning_rate": 3.899373318093987e-05, + "loss": 0.8942, + "step": 6146 + }, + { + "epoch": 2.78649138712602, + "grad_norm": 0.3123559904952473, + "learning_rate": 3.898107845807457e-05, + "loss": 0.8611, + "step": 6147 + }, + { + "epoch": 2.786944696282865, + "grad_norm": 0.3296136432659561, + "learning_rate": 3.896842383725725e-05, + "loss": 0.8687, + "step": 6148 + }, + { + "epoch": 2.7873980054397096, + "grad_norm": 0.23674826384797926, + "learning_rate": 3.895576931975532e-05, + "loss": 0.8612, + "step": 6149 + }, + { + "epoch": 2.787851314596555, + "grad_norm": 0.27717242273124054, + "learning_rate": 3.8943114906836165e-05, + "loss": 0.894, + "step": 6150 + }, + { + "epoch": 2.7883046237533997, + "grad_norm": 0.3448819430387344, + "learning_rate": 3.893046059976715e-05, + "loss": 0.8835, + "step": 6151 + }, + { + "epoch": 2.7887579329102445, + "grad_norm": 0.32659024885264465, + "learning_rate": 3.8917806399815663e-05, + "loss": 0.8669, + "step": 6152 + }, + { + "epoch": 2.78921124206709, + "grad_norm": 0.2710175205427032, + "learning_rate": 3.8905152308249035e-05, + "loss": 0.8561, + "step": 6153 + }, + { + "epoch": 2.7896645512239346, + "grad_norm": 0.25669349458594337, + "learning_rate": 3.8892498326334624e-05, + "loss": 0.8627, + "step": 6154 + }, + { + "epoch": 2.7901178603807795, + "grad_norm": 0.3140769530615051, + "learning_rate": 3.887984445533976e-05, + "loss": 0.8847, + "step": 6155 + }, + { + "epoch": 2.7905711695376247, + "grad_norm": 0.33710422621512065, + "learning_rate": 3.886719069653176e-05, + "loss": 0.8861, + "step": 6156 + }, + { + "epoch": 2.7910244786944696, + "grad_norm": 0.3234057570260941, + "learning_rate": 3.885453705117795e-05, + "loss": 0.8714, + "step": 6157 + }, + { + "epoch": 2.7914777878513144, + "grad_norm": 0.30133393000181624, + "learning_rate": 3.8841883520545614e-05, + "loss": 0.885, + "step": 6158 + }, + { + "epoch": 2.7919310970081597, + "grad_norm": 0.25134542832365, + "learning_rate": 3.882923010590205e-05, + "loss": 0.8825, + "step": 6159 + }, + { + "epoch": 2.7923844061650045, + "grad_norm": 0.23043113982136806, + "learning_rate": 3.881657680851453e-05, + "loss": 0.8864, + "step": 6160 + }, + { + "epoch": 2.7928377153218493, + "grad_norm": 0.29487972959925135, + "learning_rate": 3.880392362965032e-05, + "loss": 0.8939, + "step": 6161 + }, + { + "epoch": 2.7932910244786946, + "grad_norm": 0.332644057338735, + "learning_rate": 3.879127057057668e-05, + "loss": 0.8916, + "step": 6162 + }, + { + "epoch": 2.7937443336355394, + "grad_norm": 0.33783798896329925, + "learning_rate": 3.877861763256085e-05, + "loss": 0.8643, + "step": 6163 + }, + { + "epoch": 2.7941976427923843, + "grad_norm": 0.2773299515842344, + "learning_rate": 3.876596481687005e-05, + "loss": 0.8568, + "step": 6164 + }, + { + "epoch": 2.7946509519492295, + "grad_norm": 0.2805342412672011, + "learning_rate": 3.875331212477149e-05, + "loss": 0.8781, + "step": 6165 + }, + { + "epoch": 2.7951042611060744, + "grad_norm": 0.30141720781108783, + "learning_rate": 3.874065955753239e-05, + "loss": 0.8887, + "step": 6166 + }, + { + "epoch": 2.795557570262919, + "grad_norm": 0.3343178801376389, + "learning_rate": 3.872800711641994e-05, + "loss": 0.8694, + "step": 6167 + }, + { + "epoch": 2.7960108794197644, + "grad_norm": 0.35514662073302267, + "learning_rate": 3.8715354802701316e-05, + "loss": 0.8857, + "step": 6168 + }, + { + "epoch": 2.7964641885766093, + "grad_norm": 0.3379789257917202, + "learning_rate": 3.870270261764368e-05, + "loss": 0.8708, + "step": 6169 + }, + { + "epoch": 2.796917497733454, + "grad_norm": 0.2923678471476138, + "learning_rate": 3.869005056251419e-05, + "loss": 0.8907, + "step": 6170 + }, + { + "epoch": 2.7973708068902994, + "grad_norm": 0.2939770955730156, + "learning_rate": 3.867739863857998e-05, + "loss": 0.8885, + "step": 6171 + }, + { + "epoch": 2.797824116047144, + "grad_norm": 0.3090077960359177, + "learning_rate": 3.8664746847108186e-05, + "loss": 0.8849, + "step": 6172 + }, + { + "epoch": 2.798277425203989, + "grad_norm": 0.2596366671302071, + "learning_rate": 3.8652095189365927e-05, + "loss": 0.8797, + "step": 6173 + }, + { + "epoch": 2.7987307343608343, + "grad_norm": 0.25571024845196366, + "learning_rate": 3.863944366662029e-05, + "loss": 0.8797, + "step": 6174 + }, + { + "epoch": 2.799184043517679, + "grad_norm": 0.2692914071282436, + "learning_rate": 3.8626792280138365e-05, + "loss": 0.8873, + "step": 6175 + }, + { + "epoch": 2.799637352674524, + "grad_norm": 0.30313137173987253, + "learning_rate": 3.861414103118723e-05, + "loss": 0.897, + "step": 6176 + }, + { + "epoch": 2.8000906618313692, + "grad_norm": 0.2881362476597029, + "learning_rate": 3.860148992103395e-05, + "loss": 0.8943, + "step": 6177 + }, + { + "epoch": 2.800543970988214, + "grad_norm": 0.28451432846342567, + "learning_rate": 3.858883895094557e-05, + "loss": 0.8717, + "step": 6178 + }, + { + "epoch": 2.800997280145059, + "grad_norm": 0.262528505015299, + "learning_rate": 3.857618812218911e-05, + "loss": 0.8822, + "step": 6179 + }, + { + "epoch": 2.801450589301904, + "grad_norm": 0.2485769702666244, + "learning_rate": 3.856353743603159e-05, + "loss": 0.8685, + "step": 6180 + }, + { + "epoch": 2.801903898458749, + "grad_norm": 0.27653706007492673, + "learning_rate": 3.8550886893740017e-05, + "loss": 0.8883, + "step": 6181 + }, + { + "epoch": 2.802357207615594, + "grad_norm": 0.30835300270246063, + "learning_rate": 3.853823649658139e-05, + "loss": 0.8852, + "step": 6182 + }, + { + "epoch": 2.8028105167724386, + "grad_norm": 0.335182763315055, + "learning_rate": 3.8525586245822674e-05, + "loss": 0.8785, + "step": 6183 + }, + { + "epoch": 2.803263825929284, + "grad_norm": 0.3163777460339215, + "learning_rate": 3.851293614273082e-05, + "loss": 0.8838, + "step": 6184 + }, + { + "epoch": 2.8037171350861287, + "grad_norm": 0.29676650264929405, + "learning_rate": 3.8500286188572787e-05, + "loss": 0.8716, + "step": 6185 + }, + { + "epoch": 2.8041704442429736, + "grad_norm": 0.30821578475297845, + "learning_rate": 3.8487636384615495e-05, + "loss": 0.8612, + "step": 6186 + }, + { + "epoch": 2.804623753399819, + "grad_norm": 0.2796440368362066, + "learning_rate": 3.847498673212586e-05, + "loss": 0.8849, + "step": 6187 + }, + { + "epoch": 2.8050770625566637, + "grad_norm": 0.23638863451854217, + "learning_rate": 3.846233723237079e-05, + "loss": 0.8847, + "step": 6188 + }, + { + "epoch": 2.8055303717135085, + "grad_norm": 0.23022938303700224, + "learning_rate": 3.844968788661717e-05, + "loss": 0.8745, + "step": 6189 + }, + { + "epoch": 2.8059836808703533, + "grad_norm": 0.4368934830553695, + "learning_rate": 3.843703869613186e-05, + "loss": 0.8921, + "step": 6190 + }, + { + "epoch": 2.8064369900271986, + "grad_norm": 0.23328925312044513, + "learning_rate": 3.8424389662181716e-05, + "loss": 0.8889, + "step": 6191 + }, + { + "epoch": 2.8068902991840434, + "grad_norm": 0.29137212697585135, + "learning_rate": 3.841174078603358e-05, + "loss": 0.8841, + "step": 6192 + }, + { + "epoch": 2.8073436083408883, + "grad_norm": 0.2893179856620248, + "learning_rate": 3.839909206895428e-05, + "loss": 0.8744, + "step": 6193 + }, + { + "epoch": 2.8077969174977335, + "grad_norm": 0.3614889637573261, + "learning_rate": 3.838644351221061e-05, + "loss": 0.8823, + "step": 6194 + }, + { + "epoch": 2.8082502266545784, + "grad_norm": 0.4064216447541747, + "learning_rate": 3.8373795117069353e-05, + "loss": 0.8815, + "step": 6195 + }, + { + "epoch": 2.808703535811423, + "grad_norm": 0.4177631749934668, + "learning_rate": 3.836114688479731e-05, + "loss": 0.8812, + "step": 6196 + }, + { + "epoch": 2.8091568449682685, + "grad_norm": 0.38040884353135246, + "learning_rate": 3.834849881666121e-05, + "loss": 0.8871, + "step": 6197 + }, + { + "epoch": 2.8096101541251133, + "grad_norm": 0.2930489773960027, + "learning_rate": 3.833585091392781e-05, + "loss": 0.8609, + "step": 6198 + }, + { + "epoch": 2.810063463281958, + "grad_norm": 0.1999170860645616, + "learning_rate": 3.8323203177863835e-05, + "loss": 0.9078, + "step": 6199 + }, + { + "epoch": 2.8105167724388034, + "grad_norm": 0.2683079445666463, + "learning_rate": 3.831055560973599e-05, + "loss": 0.857, + "step": 6200 + }, + { + "epoch": 2.810970081595648, + "grad_norm": 0.428376504917596, + "learning_rate": 3.829790821081098e-05, + "loss": 0.8656, + "step": 6201 + }, + { + "epoch": 2.811423390752493, + "grad_norm": 0.5010854410041042, + "learning_rate": 3.828526098235545e-05, + "loss": 0.8831, + "step": 6202 + }, + { + "epoch": 2.8118766999093383, + "grad_norm": 0.5393705638104727, + "learning_rate": 3.827261392563607e-05, + "loss": 0.9013, + "step": 6203 + }, + { + "epoch": 2.812330009066183, + "grad_norm": 0.5392988909854911, + "learning_rate": 3.8259967041919484e-05, + "loss": 0.9039, + "step": 6204 + }, + { + "epoch": 2.812783318223028, + "grad_norm": 0.5095681789973199, + "learning_rate": 3.8247320332472314e-05, + "loss": 0.8655, + "step": 6205 + }, + { + "epoch": 2.8132366273798732, + "grad_norm": 0.3780104885461652, + "learning_rate": 3.823467379856116e-05, + "loss": 0.8754, + "step": 6206 + }, + { + "epoch": 2.813689936536718, + "grad_norm": 0.2917276115405788, + "learning_rate": 3.8222027441452615e-05, + "loss": 0.8898, + "step": 6207 + }, + { + "epoch": 2.814143245693563, + "grad_norm": 0.3022022508112911, + "learning_rate": 3.8209381262413255e-05, + "loss": 0.879, + "step": 6208 + }, + { + "epoch": 2.814596554850408, + "grad_norm": 0.34773465479375837, + "learning_rate": 3.819673526270962e-05, + "loss": 0.8652, + "step": 6209 + }, + { + "epoch": 2.815049864007253, + "grad_norm": 0.45927220068067093, + "learning_rate": 3.818408944360824e-05, + "loss": 0.8877, + "step": 6210 + }, + { + "epoch": 2.815503173164098, + "grad_norm": 0.4759305535510418, + "learning_rate": 3.8171443806375646e-05, + "loss": 0.8871, + "step": 6211 + }, + { + "epoch": 2.815956482320943, + "grad_norm": 0.44779298687251967, + "learning_rate": 3.8158798352278325e-05, + "loss": 0.8764, + "step": 6212 + }, + { + "epoch": 2.816409791477788, + "grad_norm": 0.2702659621621992, + "learning_rate": 3.814615308258277e-05, + "loss": 0.8767, + "step": 6213 + }, + { + "epoch": 2.8168631006346327, + "grad_norm": 0.2852363483009499, + "learning_rate": 3.813350799855542e-05, + "loss": 0.8909, + "step": 6214 + }, + { + "epoch": 2.817316409791478, + "grad_norm": 0.35091935373662, + "learning_rate": 3.812086310146273e-05, + "loss": 0.8853, + "step": 6215 + }, + { + "epoch": 2.817769718948323, + "grad_norm": 0.40681800255926726, + "learning_rate": 3.8108218392571126e-05, + "loss": 0.9045, + "step": 6216 + }, + { + "epoch": 2.8182230281051677, + "grad_norm": 0.3911211144816597, + "learning_rate": 3.8095573873147015e-05, + "loss": 0.8691, + "step": 6217 + }, + { + "epoch": 2.818676337262013, + "grad_norm": 0.3285192226252565, + "learning_rate": 3.808292954445677e-05, + "loss": 0.8549, + "step": 6218 + }, + { + "epoch": 2.8191296464188578, + "grad_norm": 0.27314813560742146, + "learning_rate": 3.8070285407766776e-05, + "loss": 0.8725, + "step": 6219 + }, + { + "epoch": 2.8195829555757026, + "grad_norm": 0.31124059788624614, + "learning_rate": 3.8057641464343365e-05, + "loss": 0.8897, + "step": 6220 + }, + { + "epoch": 2.820036264732548, + "grad_norm": 0.36702274758081815, + "learning_rate": 3.8044997715452875e-05, + "loss": 0.896, + "step": 6221 + }, + { + "epoch": 2.8204895738893927, + "grad_norm": 0.4996803018547748, + "learning_rate": 3.80323541623616e-05, + "loss": 0.8729, + "step": 6222 + }, + { + "epoch": 2.8209428830462375, + "grad_norm": 0.43250589029538605, + "learning_rate": 3.801971080633584e-05, + "loss": 0.8765, + "step": 6223 + }, + { + "epoch": 2.8213961922030824, + "grad_norm": 0.28847073333203394, + "learning_rate": 3.800706764864187e-05, + "loss": 0.8837, + "step": 6224 + }, + { + "epoch": 2.8218495013599276, + "grad_norm": 0.24532349484706276, + "learning_rate": 3.799442469054593e-05, + "loss": 0.8848, + "step": 6225 + }, + { + "epoch": 2.8223028105167725, + "grad_norm": 0.24532081050856266, + "learning_rate": 3.798178193331424e-05, + "loss": 0.8586, + "step": 6226 + }, + { + "epoch": 2.8227561196736173, + "grad_norm": 0.23437649157619464, + "learning_rate": 3.796913937821303e-05, + "loss": 0.8685, + "step": 6227 + }, + { + "epoch": 2.823209428830462, + "grad_norm": 0.291817385359558, + "learning_rate": 3.795649702650848e-05, + "loss": 0.8744, + "step": 6228 + }, + { + "epoch": 2.8236627379873074, + "grad_norm": 0.3280127296310442, + "learning_rate": 3.794385487946675e-05, + "loss": 0.8924, + "step": 6229 + }, + { + "epoch": 2.824116047144152, + "grad_norm": 0.2908610842445327, + "learning_rate": 3.7931212938354e-05, + "loss": 0.9065, + "step": 6230 + }, + { + "epoch": 2.824569356300997, + "grad_norm": 0.3007160205929872, + "learning_rate": 3.7918571204436356e-05, + "loss": 0.8898, + "step": 6231 + }, + { + "epoch": 2.8250226654578423, + "grad_norm": 0.2737593372447412, + "learning_rate": 3.790592967897992e-05, + "loss": 0.8761, + "step": 6232 + }, + { + "epoch": 2.825475974614687, + "grad_norm": 0.24773687036020287, + "learning_rate": 3.789328836325079e-05, + "loss": 0.889, + "step": 6233 + }, + { + "epoch": 2.825929283771532, + "grad_norm": 0.20511684350673856, + "learning_rate": 3.788064725851502e-05, + "loss": 0.8574, + "step": 6234 + }, + { + "epoch": 2.8263825929283772, + "grad_norm": 0.23218064487773796, + "learning_rate": 3.7868006366038655e-05, + "loss": 0.8814, + "step": 6235 + }, + { + "epoch": 2.826835902085222, + "grad_norm": 0.2599007257179044, + "learning_rate": 3.785536568708772e-05, + "loss": 0.9084, + "step": 6236 + }, + { + "epoch": 2.827289211242067, + "grad_norm": 0.24672088643078985, + "learning_rate": 3.784272522292821e-05, + "loss": 0.8897, + "step": 6237 + }, + { + "epoch": 2.827742520398912, + "grad_norm": 0.19809760665439363, + "learning_rate": 3.783008497482611e-05, + "loss": 0.8752, + "step": 6238 + }, + { + "epoch": 2.828195829555757, + "grad_norm": 0.27670828318173174, + "learning_rate": 3.781744494404737e-05, + "loss": 0.8824, + "step": 6239 + }, + { + "epoch": 2.828649138712602, + "grad_norm": 0.3131078938642687, + "learning_rate": 3.780480513185796e-05, + "loss": 0.8544, + "step": 6240 + }, + { + "epoch": 2.829102447869447, + "grad_norm": 0.36709082513300295, + "learning_rate": 3.7792165539523746e-05, + "loss": 0.8867, + "step": 6241 + }, + { + "epoch": 2.829555757026292, + "grad_norm": 0.34880320190774944, + "learning_rate": 3.777952616831064e-05, + "loss": 0.8775, + "step": 6242 + }, + { + "epoch": 2.8300090661831367, + "grad_norm": 0.2603695762823064, + "learning_rate": 3.776688701948452e-05, + "loss": 0.8823, + "step": 6243 + }, + { + "epoch": 2.830462375339982, + "grad_norm": 0.2060386247460911, + "learning_rate": 3.775424809431122e-05, + "loss": 0.8776, + "step": 6244 + }, + { + "epoch": 2.830915684496827, + "grad_norm": 0.2685247516109654, + "learning_rate": 3.7741609394056575e-05, + "loss": 0.8846, + "step": 6245 + }, + { + "epoch": 2.8313689936536717, + "grad_norm": 0.3386479515299825, + "learning_rate": 3.772897091998639e-05, + "loss": 0.8772, + "step": 6246 + }, + { + "epoch": 2.831822302810517, + "grad_norm": 0.3921982492768128, + "learning_rate": 3.7716332673366444e-05, + "loss": 0.8677, + "step": 6247 + }, + { + "epoch": 2.8322756119673618, + "grad_norm": 0.4310789573093802, + "learning_rate": 3.7703694655462494e-05, + "loss": 0.8955, + "step": 6248 + }, + { + "epoch": 2.8327289211242066, + "grad_norm": 0.38824678844625143, + "learning_rate": 3.769105686754027e-05, + "loss": 0.8852, + "step": 6249 + }, + { + "epoch": 2.833182230281052, + "grad_norm": 0.26469265223788385, + "learning_rate": 3.767841931086549e-05, + "loss": 0.8754, + "step": 6250 + }, + { + "epoch": 2.8336355394378967, + "grad_norm": 0.31789888068902666, + "learning_rate": 3.766578198670383e-05, + "loss": 0.8736, + "step": 6251 + }, + { + "epoch": 2.8340888485947415, + "grad_norm": 0.4011095908883563, + "learning_rate": 3.765314489632097e-05, + "loss": 0.8681, + "step": 6252 + }, + { + "epoch": 2.834542157751587, + "grad_norm": 0.3573363892956795, + "learning_rate": 3.7640508040982546e-05, + "loss": 0.8883, + "step": 6253 + }, + { + "epoch": 2.8349954669084316, + "grad_norm": 0.34144527486145515, + "learning_rate": 3.762787142195417e-05, + "loss": 0.8761, + "step": 6254 + }, + { + "epoch": 2.8354487760652765, + "grad_norm": 0.29531961191026723, + "learning_rate": 3.761523504050145e-05, + "loss": 0.877, + "step": 6255 + }, + { + "epoch": 2.8359020852221217, + "grad_norm": 0.26980909636436434, + "learning_rate": 3.760259889788995e-05, + "loss": 0.8821, + "step": 6256 + }, + { + "epoch": 2.8363553943789666, + "grad_norm": 0.2996649452134225, + "learning_rate": 3.7589962995385205e-05, + "loss": 0.8665, + "step": 6257 + }, + { + "epoch": 2.8368087035358114, + "grad_norm": 0.27870481897620536, + "learning_rate": 3.7577327334252764e-05, + "loss": 0.8614, + "step": 6258 + }, + { + "epoch": 2.8372620126926567, + "grad_norm": 0.3348327288805529, + "learning_rate": 3.7564691915758116e-05, + "loss": 0.892, + "step": 6259 + }, + { + "epoch": 2.8377153218495015, + "grad_norm": 0.42172275808060417, + "learning_rate": 3.755205674116672e-05, + "loss": 0.8881, + "step": 6260 + }, + { + "epoch": 2.8381686310063463, + "grad_norm": 0.36075592528277733, + "learning_rate": 3.753942181174403e-05, + "loss": 0.8745, + "step": 6261 + }, + { + "epoch": 2.838621940163191, + "grad_norm": 0.2812676544712938, + "learning_rate": 3.7526787128755485e-05, + "loss": 0.8934, + "step": 6262 + }, + { + "epoch": 2.8390752493200364, + "grad_norm": 0.5061876298899771, + "learning_rate": 3.7514152693466466e-05, + "loss": 0.8931, + "step": 6263 + }, + { + "epoch": 2.8395285584768812, + "grad_norm": 0.3477799639927694, + "learning_rate": 3.750151850714237e-05, + "loss": 0.8805, + "step": 6264 + }, + { + "epoch": 2.839981867633726, + "grad_norm": 0.35464610370986027, + "learning_rate": 3.748888457104854e-05, + "loss": 0.8876, + "step": 6265 + }, + { + "epoch": 2.840435176790571, + "grad_norm": 0.2738637658997471, + "learning_rate": 3.74762508864503e-05, + "loss": 0.8968, + "step": 6266 + }, + { + "epoch": 2.840888485947416, + "grad_norm": 0.29829559191317934, + "learning_rate": 3.746361745461295e-05, + "loss": 0.8811, + "step": 6267 + }, + { + "epoch": 2.841341795104261, + "grad_norm": 0.34403429940806535, + "learning_rate": 3.745098427680176e-05, + "loss": 0.8829, + "step": 6268 + }, + { + "epoch": 2.841795104261106, + "grad_norm": 0.3941126470873288, + "learning_rate": 3.743835135428198e-05, + "loss": 0.8674, + "step": 6269 + }, + { + "epoch": 2.842248413417951, + "grad_norm": 0.3668782995004719, + "learning_rate": 3.742571868831886e-05, + "loss": 0.8857, + "step": 6270 + }, + { + "epoch": 2.842701722574796, + "grad_norm": 0.3214644948270158, + "learning_rate": 3.7413086280177565e-05, + "loss": 0.895, + "step": 6271 + }, + { + "epoch": 2.8431550317316407, + "grad_norm": 0.29563649227104344, + "learning_rate": 3.7400454131123285e-05, + "loss": 0.884, + "step": 6272 + }, + { + "epoch": 2.843608340888486, + "grad_norm": 0.2878551864573817, + "learning_rate": 3.738782224242116e-05, + "loss": 0.8807, + "step": 6273 + }, + { + "epoch": 2.844061650045331, + "grad_norm": 0.2696224241260502, + "learning_rate": 3.737519061533632e-05, + "loss": 0.8685, + "step": 6274 + }, + { + "epoch": 2.8445149592021757, + "grad_norm": 0.33592952048804314, + "learning_rate": 3.736255925113386e-05, + "loss": 0.8943, + "step": 6275 + }, + { + "epoch": 2.844968268359021, + "grad_norm": 0.40597289648771173, + "learning_rate": 3.734992815107884e-05, + "loss": 0.8914, + "step": 6276 + }, + { + "epoch": 2.8454215775158658, + "grad_norm": 0.37716085120558634, + "learning_rate": 3.73372973164363e-05, + "loss": 0.8591, + "step": 6277 + }, + { + "epoch": 2.8458748866727106, + "grad_norm": 0.2897183110481195, + "learning_rate": 3.7324666748471264e-05, + "loss": 0.8859, + "step": 6278 + }, + { + "epoch": 2.846328195829556, + "grad_norm": 0.2576485231785989, + "learning_rate": 3.731203644844871e-05, + "loss": 0.8927, + "step": 6279 + }, + { + "epoch": 2.8467815049864007, + "grad_norm": 0.2452021030567583, + "learning_rate": 3.729940641763361e-05, + "loss": 0.8758, + "step": 6280 + }, + { + "epoch": 2.8472348141432455, + "grad_norm": 0.28607442556225193, + "learning_rate": 3.728677665729089e-05, + "loss": 0.8896, + "step": 6281 + }, + { + "epoch": 2.847688123300091, + "grad_norm": 0.3214490384250522, + "learning_rate": 3.7274147168685464e-05, + "loss": 0.8927, + "step": 6282 + }, + { + "epoch": 2.8481414324569356, + "grad_norm": 0.37352354712729513, + "learning_rate": 3.7261517953082206e-05, + "loss": 0.8759, + "step": 6283 + }, + { + "epoch": 2.8485947416137805, + "grad_norm": 0.27539596545444023, + "learning_rate": 3.724888901174598e-05, + "loss": 0.8652, + "step": 6284 + }, + { + "epoch": 2.8490480507706257, + "grad_norm": 0.2652966719311011, + "learning_rate": 3.7236260345941597e-05, + "loss": 0.8806, + "step": 6285 + }, + { + "epoch": 2.8495013599274706, + "grad_norm": 0.3163811564094791, + "learning_rate": 3.7223631956933865e-05, + "loss": 0.8878, + "step": 6286 + }, + { + "epoch": 2.8499546690843154, + "grad_norm": 0.37719536039829077, + "learning_rate": 3.7211003845987554e-05, + "loss": 0.8634, + "step": 6287 + }, + { + "epoch": 2.8504079782411607, + "grad_norm": 0.46281040638736143, + "learning_rate": 3.71983760143674e-05, + "loss": 0.8866, + "step": 6288 + }, + { + "epoch": 2.8508612873980055, + "grad_norm": 0.43791821127773856, + "learning_rate": 3.7185748463338125e-05, + "loss": 0.8761, + "step": 6289 + }, + { + "epoch": 2.8513145965548503, + "grad_norm": 0.38288676416212447, + "learning_rate": 3.717312119416441e-05, + "loss": 0.9054, + "step": 6290 + }, + { + "epoch": 2.8517679057116956, + "grad_norm": 0.262836324048374, + "learning_rate": 3.716049420811093e-05, + "loss": 0.8841, + "step": 6291 + }, + { + "epoch": 2.8522212148685404, + "grad_norm": 0.22946279365217115, + "learning_rate": 3.714786750644229e-05, + "loss": 0.868, + "step": 6292 + }, + { + "epoch": 2.8526745240253852, + "grad_norm": 0.2952245452247361, + "learning_rate": 3.71352410904231e-05, + "loss": 0.8795, + "step": 6293 + }, + { + "epoch": 2.8531278331822305, + "grad_norm": 0.3115938555648703, + "learning_rate": 3.7122614961317933e-05, + "loss": 0.8658, + "step": 6294 + }, + { + "epoch": 2.8535811423390753, + "grad_norm": 0.3404640950903566, + "learning_rate": 3.710998912039133e-05, + "loss": 0.8914, + "step": 6295 + }, + { + "epoch": 2.85403445149592, + "grad_norm": 0.3267461483452965, + "learning_rate": 3.7097363568907816e-05, + "loss": 0.8929, + "step": 6296 + }, + { + "epoch": 2.8544877606527654, + "grad_norm": 0.29753155805147846, + "learning_rate": 3.708473830813189e-05, + "loss": 0.8632, + "step": 6297 + }, + { + "epoch": 2.8549410698096103, + "grad_norm": 0.20472107916473223, + "learning_rate": 3.7072113339327974e-05, + "loss": 0.8735, + "step": 6298 + }, + { + "epoch": 2.855394378966455, + "grad_norm": 0.27413961151924865, + "learning_rate": 3.70594886637605e-05, + "loss": 0.8727, + "step": 6299 + }, + { + "epoch": 2.8558476881233004, + "grad_norm": 0.35177233379527045, + "learning_rate": 3.704686428269389e-05, + "loss": 0.8583, + "step": 6300 + }, + { + "epoch": 2.856300997280145, + "grad_norm": 0.41580167868503376, + "learning_rate": 3.70342401973925e-05, + "loss": 0.9026, + "step": 6301 + }, + { + "epoch": 2.85675430643699, + "grad_norm": 0.439719704858271, + "learning_rate": 3.702161640912067e-05, + "loss": 0.8731, + "step": 6302 + }, + { + "epoch": 2.857207615593835, + "grad_norm": 0.4799418083369069, + "learning_rate": 3.700899291914271e-05, + "loss": 0.8898, + "step": 6303 + }, + { + "epoch": 2.85766092475068, + "grad_norm": 0.46536076797512116, + "learning_rate": 3.6996369728722894e-05, + "loss": 0.8783, + "step": 6304 + }, + { + "epoch": 2.858114233907525, + "grad_norm": 0.3477333549154172, + "learning_rate": 3.6983746839125484e-05, + "loss": 0.8688, + "step": 6305 + }, + { + "epoch": 2.8585675430643698, + "grad_norm": 0.23541193593965412, + "learning_rate": 3.697112425161469e-05, + "loss": 0.8931, + "step": 6306 + }, + { + "epoch": 2.8590208522212146, + "grad_norm": 0.21850855811286388, + "learning_rate": 3.695850196745471e-05, + "loss": 0.869, + "step": 6307 + }, + { + "epoch": 2.85947416137806, + "grad_norm": 0.2927248837224789, + "learning_rate": 3.694587998790969e-05, + "loss": 0.8809, + "step": 6308 + }, + { + "epoch": 2.8599274705349047, + "grad_norm": 0.3466992101047666, + "learning_rate": 3.693325831424377e-05, + "loss": 0.8915, + "step": 6309 + }, + { + "epoch": 2.8603807796917495, + "grad_norm": 0.3289405722423622, + "learning_rate": 3.692063694772103e-05, + "loss": 0.8834, + "step": 6310 + }, + { + "epoch": 2.860834088848595, + "grad_norm": 0.3356347253659286, + "learning_rate": 3.690801588960556e-05, + "loss": 0.8776, + "step": 6311 + }, + { + "epoch": 2.8612873980054396, + "grad_norm": 0.370906959629084, + "learning_rate": 3.689539514116138e-05, + "loss": 0.8795, + "step": 6312 + }, + { + "epoch": 2.8617407071622845, + "grad_norm": 0.3480855504001318, + "learning_rate": 3.688277470365251e-05, + "loss": 0.8923, + "step": 6313 + }, + { + "epoch": 2.8621940163191297, + "grad_norm": 0.2487543819913017, + "learning_rate": 3.687015457834291e-05, + "loss": 0.8667, + "step": 6314 + }, + { + "epoch": 2.8626473254759746, + "grad_norm": 0.22926115375153683, + "learning_rate": 3.685753476649652e-05, + "loss": 0.8893, + "step": 6315 + }, + { + "epoch": 2.8631006346328194, + "grad_norm": 0.26712157164907346, + "learning_rate": 3.684491526937727e-05, + "loss": 0.8811, + "step": 6316 + }, + { + "epoch": 2.8635539437896647, + "grad_norm": 0.2495154246269076, + "learning_rate": 3.683229608824904e-05, + "loss": 0.8725, + "step": 6317 + }, + { + "epoch": 2.8640072529465095, + "grad_norm": 0.24827682003181847, + "learning_rate": 3.681967722437565e-05, + "loss": 0.9063, + "step": 6318 + }, + { + "epoch": 2.8644605621033543, + "grad_norm": 0.23707730356135825, + "learning_rate": 3.680705867902094e-05, + "loss": 0.8763, + "step": 6319 + }, + { + "epoch": 2.8649138712601996, + "grad_norm": 0.22896748830063648, + "learning_rate": 3.679444045344868e-05, + "loss": 0.881, + "step": 6320 + }, + { + "epoch": 2.8653671804170444, + "grad_norm": 0.28462730993425167, + "learning_rate": 3.678182254892263e-05, + "loss": 0.8881, + "step": 6321 + }, + { + "epoch": 2.8658204895738892, + "grad_norm": 0.30289393149063654, + "learning_rate": 3.676920496670652e-05, + "loss": 0.8735, + "step": 6322 + }, + { + "epoch": 2.8662737987307345, + "grad_norm": 0.29819103012196185, + "learning_rate": 3.675658770806402e-05, + "loss": 0.887, + "step": 6323 + }, + { + "epoch": 2.8667271078875793, + "grad_norm": 0.28584942376146233, + "learning_rate": 3.674397077425878e-05, + "loss": 0.8998, + "step": 6324 + }, + { + "epoch": 2.867180417044424, + "grad_norm": 0.28785731620865845, + "learning_rate": 3.673135416655445e-05, + "loss": 0.8991, + "step": 6325 + }, + { + "epoch": 2.8676337262012694, + "grad_norm": 0.27771398714984835, + "learning_rate": 3.6718737886214603e-05, + "loss": 0.8835, + "step": 6326 + }, + { + "epoch": 2.8680870353581143, + "grad_norm": 0.23394322669575454, + "learning_rate": 3.67061219345028e-05, + "loss": 0.8751, + "step": 6327 + }, + { + "epoch": 2.868540344514959, + "grad_norm": 0.216354254893085, + "learning_rate": 3.669350631268257e-05, + "loss": 0.8789, + "step": 6328 + }, + { + "epoch": 2.8689936536718044, + "grad_norm": 0.2196127411437509, + "learning_rate": 3.6680891022017393e-05, + "loss": 0.8801, + "step": 6329 + }, + { + "epoch": 2.869446962828649, + "grad_norm": 0.25009155676805717, + "learning_rate": 3.666827606377074e-05, + "loss": 0.8968, + "step": 6330 + }, + { + "epoch": 2.869900271985494, + "grad_norm": 0.22181081780704917, + "learning_rate": 3.665566143920603e-05, + "loss": 0.8631, + "step": 6331 + }, + { + "epoch": 2.8703535811423393, + "grad_norm": 0.2401624691008328, + "learning_rate": 3.664304714958666e-05, + "loss": 0.8786, + "step": 6332 + }, + { + "epoch": 2.870806890299184, + "grad_norm": 0.2530652788361371, + "learning_rate": 3.6630433196175986e-05, + "loss": 0.8612, + "step": 6333 + }, + { + "epoch": 2.871260199456029, + "grad_norm": 0.288148201724063, + "learning_rate": 3.661781958023732e-05, + "loss": 0.9066, + "step": 6334 + }, + { + "epoch": 2.871713508612874, + "grad_norm": 0.2953000109058463, + "learning_rate": 3.660520630303397e-05, + "loss": 0.8666, + "step": 6335 + }, + { + "epoch": 2.872166817769719, + "grad_norm": 0.3480165814656687, + "learning_rate": 3.659259336582919e-05, + "loss": 0.8924, + "step": 6336 + }, + { + "epoch": 2.872620126926564, + "grad_norm": 0.3496121385249228, + "learning_rate": 3.65799807698862e-05, + "loss": 0.8746, + "step": 6337 + }, + { + "epoch": 2.873073436083409, + "grad_norm": 0.25649684481088075, + "learning_rate": 3.656736851646818e-05, + "loss": 0.8881, + "step": 6338 + }, + { + "epoch": 2.873526745240254, + "grad_norm": 0.3076757980472284, + "learning_rate": 3.655475660683829e-05, + "loss": 0.866, + "step": 6339 + }, + { + "epoch": 2.873980054397099, + "grad_norm": 0.28855880526667743, + "learning_rate": 3.6542145042259646e-05, + "loss": 0.8705, + "step": 6340 + }, + { + "epoch": 2.8744333635539436, + "grad_norm": 0.3246766033441759, + "learning_rate": 3.6529533823995345e-05, + "loss": 0.8576, + "step": 6341 + }, + { + "epoch": 2.874886672710789, + "grad_norm": 0.3841388912452113, + "learning_rate": 3.651692295330843e-05, + "loss": 0.8605, + "step": 6342 + }, + { + "epoch": 2.8753399818676337, + "grad_norm": 0.3158763249914843, + "learning_rate": 3.6504312431461915e-05, + "loss": 0.8621, + "step": 6343 + }, + { + "epoch": 2.8757932910244786, + "grad_norm": 0.27794266589594857, + "learning_rate": 3.649170225971879e-05, + "loss": 0.8772, + "step": 6344 + }, + { + "epoch": 2.8762466001813234, + "grad_norm": 0.21065640038123754, + "learning_rate": 3.647909243934199e-05, + "loss": 0.9021, + "step": 6345 + }, + { + "epoch": 2.8766999093381687, + "grad_norm": 0.3168525273267963, + "learning_rate": 3.646648297159443e-05, + "loss": 0.8695, + "step": 6346 + }, + { + "epoch": 2.8771532184950135, + "grad_norm": 0.2774121546152305, + "learning_rate": 3.645387385773899e-05, + "loss": 0.8723, + "step": 6347 + }, + { + "epoch": 2.8776065276518583, + "grad_norm": 0.33754360912613807, + "learning_rate": 3.6441265099038505e-05, + "loss": 0.9026, + "step": 6348 + }, + { + "epoch": 2.8780598368087036, + "grad_norm": 0.35199113383002884, + "learning_rate": 3.6428656696755776e-05, + "loss": 0.8909, + "step": 6349 + }, + { + "epoch": 2.8785131459655484, + "grad_norm": 0.36792467466337675, + "learning_rate": 3.641604865215357e-05, + "loss": 0.902, + "step": 6350 + }, + { + "epoch": 2.8789664551223932, + "grad_norm": 0.41492917775958943, + "learning_rate": 3.640344096649463e-05, + "loss": 0.8908, + "step": 6351 + }, + { + "epoch": 2.8794197642792385, + "grad_norm": 0.32883697188020866, + "learning_rate": 3.639083364104165e-05, + "loss": 0.9063, + "step": 6352 + }, + { + "epoch": 2.8798730734360833, + "grad_norm": 0.27831450452523354, + "learning_rate": 3.6378226677057275e-05, + "loss": 0.8853, + "step": 6353 + }, + { + "epoch": 2.880326382592928, + "grad_norm": 0.29400043060434755, + "learning_rate": 3.6365620075804154e-05, + "loss": 0.8934, + "step": 6354 + }, + { + "epoch": 2.8807796917497734, + "grad_norm": 0.29764652047053286, + "learning_rate": 3.6353013838544875e-05, + "loss": 0.8786, + "step": 6355 + }, + { + "epoch": 2.8812330009066183, + "grad_norm": 0.3247965590856455, + "learning_rate": 3.6340407966541965e-05, + "loss": 0.8671, + "step": 6356 + }, + { + "epoch": 2.881686310063463, + "grad_norm": 0.3650130929798007, + "learning_rate": 3.6327802461057957e-05, + "loss": 0.8819, + "step": 6357 + }, + { + "epoch": 2.8821396192203084, + "grad_norm": 0.3791008337664259, + "learning_rate": 3.6315197323355315e-05, + "loss": 0.8778, + "step": 6358 + }, + { + "epoch": 2.882592928377153, + "grad_norm": 0.3432307584957783, + "learning_rate": 3.630259255469649e-05, + "loss": 0.8764, + "step": 6359 + }, + { + "epoch": 2.883046237533998, + "grad_norm": 0.29557551701593, + "learning_rate": 3.62899881563439e-05, + "loss": 0.8814, + "step": 6360 + }, + { + "epoch": 2.8834995466908433, + "grad_norm": 0.24959614340291658, + "learning_rate": 3.6277384129559885e-05, + "loss": 0.8888, + "step": 6361 + }, + { + "epoch": 2.883952855847688, + "grad_norm": 0.23213880849470264, + "learning_rate": 3.62647804756068e-05, + "loss": 0.871, + "step": 6362 + }, + { + "epoch": 2.884406165004533, + "grad_norm": 0.30701338939245953, + "learning_rate": 3.625217719574694e-05, + "loss": 0.8872, + "step": 6363 + }, + { + "epoch": 2.8848594741613782, + "grad_norm": 0.32319995782543204, + "learning_rate": 3.623957429124253e-05, + "loss": 0.8788, + "step": 6364 + }, + { + "epoch": 2.885312783318223, + "grad_norm": 0.38715919946982913, + "learning_rate": 3.622697176335581e-05, + "loss": 0.896, + "step": 6365 + }, + { + "epoch": 2.885766092475068, + "grad_norm": 0.42892896615837206, + "learning_rate": 3.621436961334895e-05, + "loss": 0.9029, + "step": 6366 + }, + { + "epoch": 2.886219401631913, + "grad_norm": 0.44017872638992084, + "learning_rate": 3.620176784248411e-05, + "loss": 0.8867, + "step": 6367 + }, + { + "epoch": 2.886672710788758, + "grad_norm": 0.308218895376885, + "learning_rate": 3.618916645202338e-05, + "loss": 0.8743, + "step": 6368 + }, + { + "epoch": 2.887126019945603, + "grad_norm": 0.29151149170787855, + "learning_rate": 3.617656544322883e-05, + "loss": 0.8815, + "step": 6369 + }, + { + "epoch": 2.887579329102448, + "grad_norm": 0.30140716819053076, + "learning_rate": 3.616396481736248e-05, + "loss": 0.8896, + "step": 6370 + }, + { + "epoch": 2.888032638259293, + "grad_norm": 0.2779971764988013, + "learning_rate": 3.615136457568633e-05, + "loss": 0.8818, + "step": 6371 + }, + { + "epoch": 2.8884859474161377, + "grad_norm": 0.380363988564186, + "learning_rate": 3.613876471946233e-05, + "loss": 0.9106, + "step": 6372 + }, + { + "epoch": 2.888939256572983, + "grad_norm": 0.3806783958355455, + "learning_rate": 3.612616524995239e-05, + "loss": 0.9036, + "step": 6373 + }, + { + "epoch": 2.889392565729828, + "grad_norm": 0.33805340697311, + "learning_rate": 3.611356616841841e-05, + "loss": 0.884, + "step": 6374 + }, + { + "epoch": 2.8898458748866727, + "grad_norm": 0.30890674844812716, + "learning_rate": 3.610096747612218e-05, + "loss": 0.8829, + "step": 6375 + }, + { + "epoch": 2.890299184043518, + "grad_norm": 0.28887758729643676, + "learning_rate": 3.608836917432552e-05, + "loss": 0.869, + "step": 6376 + }, + { + "epoch": 2.8907524932003628, + "grad_norm": 0.31048455154557336, + "learning_rate": 3.6075771264290175e-05, + "loss": 0.8784, + "step": 6377 + }, + { + "epoch": 2.8912058023572076, + "grad_norm": 0.2993933942534631, + "learning_rate": 3.606317374727789e-05, + "loss": 0.8837, + "step": 6378 + }, + { + "epoch": 2.891659111514053, + "grad_norm": 0.2753874464467583, + "learning_rate": 3.60505766245503e-05, + "loss": 0.9017, + "step": 6379 + }, + { + "epoch": 2.8921124206708977, + "grad_norm": 0.24541447399100424, + "learning_rate": 3.603797989736908e-05, + "loss": 0.8865, + "step": 6380 + }, + { + "epoch": 2.8925657298277425, + "grad_norm": 0.27285323655088006, + "learning_rate": 3.6025383566995814e-05, + "loss": 0.8757, + "step": 6381 + }, + { + "epoch": 2.8930190389845873, + "grad_norm": 0.28166966992657017, + "learning_rate": 3.6012787634692067e-05, + "loss": 0.8705, + "step": 6382 + }, + { + "epoch": 2.8934723481414326, + "grad_norm": 0.20567609403983206, + "learning_rate": 3.600019210171935e-05, + "loss": 0.8817, + "step": 6383 + }, + { + "epoch": 2.8939256572982774, + "grad_norm": 0.2898402138421751, + "learning_rate": 3.5987596969339155e-05, + "loss": 0.857, + "step": 6384 + }, + { + "epoch": 2.8943789664551223, + "grad_norm": 0.32718548200030195, + "learning_rate": 3.5975002238812915e-05, + "loss": 0.8759, + "step": 6385 + }, + { + "epoch": 2.894832275611967, + "grad_norm": 0.2646266896284652, + "learning_rate": 3.596240791140203e-05, + "loss": 0.8996, + "step": 6386 + }, + { + "epoch": 2.8952855847688124, + "grad_norm": 0.3151603110243066, + "learning_rate": 3.594981398836786e-05, + "loss": 0.8775, + "step": 6387 + }, + { + "epoch": 2.895738893925657, + "grad_norm": 0.2566423396483705, + "learning_rate": 3.593722047097172e-05, + "loss": 0.8458, + "step": 6388 + }, + { + "epoch": 2.896192203082502, + "grad_norm": 0.22397654715690227, + "learning_rate": 3.59246273604749e-05, + "loss": 0.8712, + "step": 6389 + }, + { + "epoch": 2.8966455122393473, + "grad_norm": 0.3095059507777026, + "learning_rate": 3.5912034658138614e-05, + "loss": 0.8639, + "step": 6390 + }, + { + "epoch": 2.897098821396192, + "grad_norm": 0.27376865650650495, + "learning_rate": 3.5899442365224085e-05, + "loss": 0.8953, + "step": 6391 + }, + { + "epoch": 2.897552130553037, + "grad_norm": 0.30369811464748847, + "learning_rate": 3.588685048299244e-05, + "loss": 0.8903, + "step": 6392 + }, + { + "epoch": 2.8980054397098822, + "grad_norm": 0.21928197081046782, + "learning_rate": 3.587425901270482e-05, + "loss": 0.8576, + "step": 6393 + }, + { + "epoch": 2.898458748866727, + "grad_norm": 0.3793386461073858, + "learning_rate": 3.5861667955622285e-05, + "loss": 0.881, + "step": 6394 + }, + { + "epoch": 2.898912058023572, + "grad_norm": 0.4035355649482683, + "learning_rate": 3.584907731300586e-05, + "loss": 0.8946, + "step": 6395 + }, + { + "epoch": 2.899365367180417, + "grad_norm": 0.34515478344247774, + "learning_rate": 3.5836487086116545e-05, + "loss": 0.8767, + "step": 6396 + }, + { + "epoch": 2.899818676337262, + "grad_norm": 0.33815331941033583, + "learning_rate": 3.5823897276215275e-05, + "loss": 0.8802, + "step": 6397 + }, + { + "epoch": 2.900271985494107, + "grad_norm": 0.3311395030125725, + "learning_rate": 3.581130788456297e-05, + "loss": 0.8838, + "step": 6398 + }, + { + "epoch": 2.900725294650952, + "grad_norm": 0.317354991862968, + "learning_rate": 3.57987189124205e-05, + "loss": 0.8746, + "step": 6399 + }, + { + "epoch": 2.901178603807797, + "grad_norm": 0.2716284646236348, + "learning_rate": 3.578613036104867e-05, + "loss": 0.8681, + "step": 6400 + }, + { + "epoch": 2.9016319129646417, + "grad_norm": 0.24803530680959643, + "learning_rate": 3.577354223170827e-05, + "loss": 0.8724, + "step": 6401 + }, + { + "epoch": 2.902085222121487, + "grad_norm": 0.33963048419794684, + "learning_rate": 3.5760954525660034e-05, + "loss": 0.874, + "step": 6402 + }, + { + "epoch": 2.902538531278332, + "grad_norm": 0.36402417196410397, + "learning_rate": 3.574836724416466e-05, + "loss": 0.8794, + "step": 6403 + }, + { + "epoch": 2.9029918404351767, + "grad_norm": 0.3503129974976808, + "learning_rate": 3.5735780388482814e-05, + "loss": 0.8778, + "step": 6404 + }, + { + "epoch": 2.903445149592022, + "grad_norm": 0.23457880740039175, + "learning_rate": 3.572319395987508e-05, + "loss": 0.8825, + "step": 6405 + }, + { + "epoch": 2.9038984587488668, + "grad_norm": 0.30207153492196326, + "learning_rate": 3.571060795960205e-05, + "loss": 0.8678, + "step": 6406 + }, + { + "epoch": 2.9043517679057116, + "grad_norm": 0.34292672151823855, + "learning_rate": 3.5698022388924234e-05, + "loss": 0.8795, + "step": 6407 + }, + { + "epoch": 2.904805077062557, + "grad_norm": 0.33893742598845283, + "learning_rate": 3.568543724910212e-05, + "loss": 0.8804, + "step": 6408 + }, + { + "epoch": 2.9052583862194017, + "grad_norm": 0.27523417127446637, + "learning_rate": 3.567285254139614e-05, + "loss": 0.88, + "step": 6409 + }, + { + "epoch": 2.9057116953762465, + "grad_norm": 0.3549873360601669, + "learning_rate": 3.5660268267066704e-05, + "loss": 0.883, + "step": 6410 + }, + { + "epoch": 2.906165004533092, + "grad_norm": 0.3961023841704148, + "learning_rate": 3.564768442737415e-05, + "loss": 0.8741, + "step": 6411 + }, + { + "epoch": 2.9066183136899366, + "grad_norm": 0.27088835819864204, + "learning_rate": 3.563510102357879e-05, + "loss": 0.8975, + "step": 6412 + }, + { + "epoch": 2.9070716228467814, + "grad_norm": 0.41831031464303836, + "learning_rate": 3.562251805694092e-05, + "loss": 0.8778, + "step": 6413 + }, + { + "epoch": 2.9075249320036267, + "grad_norm": 0.4356867228770331, + "learning_rate": 3.56099355287207e-05, + "loss": 0.8728, + "step": 6414 + }, + { + "epoch": 2.9079782411604715, + "grad_norm": 0.3214463454165222, + "learning_rate": 3.559735344017834e-05, + "loss": 0.8892, + "step": 6415 + }, + { + "epoch": 2.9084315503173164, + "grad_norm": 0.32201918389461204, + "learning_rate": 3.558477179257398e-05, + "loss": 0.8872, + "step": 6416 + }, + { + "epoch": 2.9088848594741616, + "grad_norm": 0.32681153159712895, + "learning_rate": 3.55721905871677e-05, + "loss": 0.877, + "step": 6417 + }, + { + "epoch": 2.9093381686310065, + "grad_norm": 0.36667003254771235, + "learning_rate": 3.555960982521955e-05, + "loss": 0.8547, + "step": 6418 + }, + { + "epoch": 2.9097914777878513, + "grad_norm": 0.2994164021643649, + "learning_rate": 3.5547029507989514e-05, + "loss": 0.8722, + "step": 6419 + }, + { + "epoch": 2.910244786944696, + "grad_norm": 0.26927911246212644, + "learning_rate": 3.5534449636737574e-05, + "loss": 0.8899, + "step": 6420 + }, + { + "epoch": 2.9106980961015414, + "grad_norm": 0.3300970441425962, + "learning_rate": 3.552187021272362e-05, + "loss": 0.9058, + "step": 6421 + }, + { + "epoch": 2.9111514052583862, + "grad_norm": 0.3894729980846933, + "learning_rate": 3.550929123720752e-05, + "loss": 0.8681, + "step": 6422 + }, + { + "epoch": 2.911604714415231, + "grad_norm": 0.34176334918939694, + "learning_rate": 3.54967127114491e-05, + "loss": 0.886, + "step": 6423 + }, + { + "epoch": 2.912058023572076, + "grad_norm": 0.35737407972166374, + "learning_rate": 3.548413463670814e-05, + "loss": 0.8861, + "step": 6424 + }, + { + "epoch": 2.912511332728921, + "grad_norm": 0.32542371867146713, + "learning_rate": 3.5471557014244374e-05, + "loss": 0.9, + "step": 6425 + }, + { + "epoch": 2.912964641885766, + "grad_norm": 0.3186876140364769, + "learning_rate": 3.545897984531748e-05, + "loss": 0.8624, + "step": 6426 + }, + { + "epoch": 2.913417951042611, + "grad_norm": 0.3300064008147913, + "learning_rate": 3.5446403131187096e-05, + "loss": 0.8815, + "step": 6427 + }, + { + "epoch": 2.913871260199456, + "grad_norm": 0.2275166425921681, + "learning_rate": 3.5433826873112825e-05, + "loss": 0.8812, + "step": 6428 + }, + { + "epoch": 2.914324569356301, + "grad_norm": 0.2715178095290377, + "learning_rate": 3.542125107235421e-05, + "loss": 0.8512, + "step": 6429 + }, + { + "epoch": 2.9147778785131457, + "grad_norm": 0.25898933743105096, + "learning_rate": 3.540867573017076e-05, + "loss": 0.8738, + "step": 6430 + }, + { + "epoch": 2.915231187669991, + "grad_norm": 0.258385520701323, + "learning_rate": 3.5396100847821926e-05, + "loss": 0.8837, + "step": 6431 + }, + { + "epoch": 2.915684496826836, + "grad_norm": 0.2632362791364958, + "learning_rate": 3.538352642656713e-05, + "loss": 0.8728, + "step": 6432 + }, + { + "epoch": 2.9161378059836807, + "grad_norm": 0.28445866835784195, + "learning_rate": 3.537095246766573e-05, + "loss": 0.8836, + "step": 6433 + }, + { + "epoch": 2.916591115140526, + "grad_norm": 0.33024389982193597, + "learning_rate": 3.535837897237703e-05, + "loss": 0.8784, + "step": 6434 + }, + { + "epoch": 2.9170444242973708, + "grad_norm": 0.3386311507459742, + "learning_rate": 3.534580594196033e-05, + "loss": 0.8943, + "step": 6435 + }, + { + "epoch": 2.9174977334542156, + "grad_norm": 0.29164658915367797, + "learning_rate": 3.533323337767484e-05, + "loss": 0.8734, + "step": 6436 + }, + { + "epoch": 2.917951042611061, + "grad_norm": 0.25295522028057926, + "learning_rate": 3.532066128077975e-05, + "loss": 0.8799, + "step": 6437 + }, + { + "epoch": 2.9184043517679057, + "grad_norm": 0.18969304589536276, + "learning_rate": 3.530808965253417e-05, + "loss": 0.8907, + "step": 6438 + }, + { + "epoch": 2.9188576609247505, + "grad_norm": 0.2780606186215746, + "learning_rate": 3.529551849419721e-05, + "loss": 0.8604, + "step": 6439 + }, + { + "epoch": 2.919310970081596, + "grad_norm": 0.24413917023290496, + "learning_rate": 3.528294780702789e-05, + "loss": 0.8826, + "step": 6440 + }, + { + "epoch": 2.9197642792384406, + "grad_norm": 0.23046913452223125, + "learning_rate": 3.527037759228522e-05, + "loss": 0.8732, + "step": 6441 + }, + { + "epoch": 2.9202175883952854, + "grad_norm": 0.30132926086009726, + "learning_rate": 3.5257807851228124e-05, + "loss": 0.8927, + "step": 6442 + }, + { + "epoch": 2.9206708975521307, + "grad_norm": 0.3309530568749677, + "learning_rate": 3.5245238585115516e-05, + "loss": 0.8799, + "step": 6443 + }, + { + "epoch": 2.9211242067089755, + "grad_norm": 0.33417389195789926, + "learning_rate": 3.5232669795206234e-05, + "loss": 0.8874, + "step": 6444 + }, + { + "epoch": 2.9215775158658204, + "grad_norm": 0.3186769422208477, + "learning_rate": 3.522010148275909e-05, + "loss": 0.86, + "step": 6445 + }, + { + "epoch": 2.9220308250226656, + "grad_norm": 0.2766732833796192, + "learning_rate": 3.520753364903284e-05, + "loss": 0.873, + "step": 6446 + }, + { + "epoch": 2.9224841341795105, + "grad_norm": 0.26589909832121783, + "learning_rate": 3.519496629528616e-05, + "loss": 0.8833, + "step": 6447 + }, + { + "epoch": 2.9229374433363553, + "grad_norm": 0.22973576910848856, + "learning_rate": 3.518239942277773e-05, + "loss": 0.8855, + "step": 6448 + }, + { + "epoch": 2.9233907524932006, + "grad_norm": 0.30160472775862907, + "learning_rate": 3.516983303276616e-05, + "loss": 0.8824, + "step": 6449 + }, + { + "epoch": 2.9238440616500454, + "grad_norm": 0.3821567894698147, + "learning_rate": 3.515726712651001e-05, + "loss": 0.8803, + "step": 6450 + }, + { + "epoch": 2.9242973708068902, + "grad_norm": 0.3625653477757634, + "learning_rate": 3.5144701705267806e-05, + "loss": 0.8772, + "step": 6451 + }, + { + "epoch": 2.9247506799637355, + "grad_norm": 0.29061210049838543, + "learning_rate": 3.5132136770297977e-05, + "loss": 0.889, + "step": 6452 + }, + { + "epoch": 2.9252039891205803, + "grad_norm": 0.19838058252142846, + "learning_rate": 3.511957232285895e-05, + "loss": 0.8684, + "step": 6453 + }, + { + "epoch": 2.925657298277425, + "grad_norm": 0.2366551246398075, + "learning_rate": 3.510700836420911e-05, + "loss": 0.8883, + "step": 6454 + }, + { + "epoch": 2.9261106074342704, + "grad_norm": 0.3355602945349227, + "learning_rate": 3.509444489560675e-05, + "loss": 0.8731, + "step": 6455 + }, + { + "epoch": 2.9265639165911153, + "grad_norm": 0.3820506441063505, + "learning_rate": 3.508188191831016e-05, + "loss": 0.8762, + "step": 6456 + }, + { + "epoch": 2.92701722574796, + "grad_norm": 0.343597135716649, + "learning_rate": 3.506931943357755e-05, + "loss": 0.8684, + "step": 6457 + }, + { + "epoch": 2.927470534904805, + "grad_norm": 0.328180144108336, + "learning_rate": 3.5056757442667084e-05, + "loss": 0.8796, + "step": 6458 + }, + { + "epoch": 2.92792384406165, + "grad_norm": 0.3418191136501641, + "learning_rate": 3.5044195946836886e-05, + "loss": 0.8808, + "step": 6459 + }, + { + "epoch": 2.928377153218495, + "grad_norm": 0.2972731756983337, + "learning_rate": 3.503163494734504e-05, + "loss": 0.8837, + "step": 6460 + }, + { + "epoch": 2.92883046237534, + "grad_norm": 0.3633853697863633, + "learning_rate": 3.501907444544955e-05, + "loss": 0.8778, + "step": 6461 + }, + { + "epoch": 2.929283771532185, + "grad_norm": 0.31179456247022513, + "learning_rate": 3.50065144424084e-05, + "loss": 0.8798, + "step": 6462 + }, + { + "epoch": 2.92973708068903, + "grad_norm": 0.2835402164428079, + "learning_rate": 3.499395493947949e-05, + "loss": 0.8796, + "step": 6463 + }, + { + "epoch": 2.9301903898458748, + "grad_norm": 0.2727225567340506, + "learning_rate": 3.498139593792072e-05, + "loss": 0.873, + "step": 6464 + }, + { + "epoch": 2.9306436990027196, + "grad_norm": 0.2419535166034816, + "learning_rate": 3.4968837438989886e-05, + "loss": 0.8835, + "step": 6465 + }, + { + "epoch": 2.931097008159565, + "grad_norm": 0.25281810478796696, + "learning_rate": 3.495627944394477e-05, + "loss": 0.8642, + "step": 6466 + }, + { + "epoch": 2.9315503173164097, + "grad_norm": 0.22416206098743827, + "learning_rate": 3.494372195404309e-05, + "loss": 0.8604, + "step": 6467 + }, + { + "epoch": 2.9320036264732545, + "grad_norm": 0.26246182727274975, + "learning_rate": 3.493116497054252e-05, + "loss": 0.8876, + "step": 6468 + }, + { + "epoch": 2.9324569356301, + "grad_norm": 0.25406053204010337, + "learning_rate": 3.491860849470067e-05, + "loss": 0.8654, + "step": 6469 + }, + { + "epoch": 2.9329102447869446, + "grad_norm": 0.24027442949370648, + "learning_rate": 3.490605252777514e-05, + "loss": 0.8674, + "step": 6470 + }, + { + "epoch": 2.9333635539437894, + "grad_norm": 0.23561531463647947, + "learning_rate": 3.489349707102339e-05, + "loss": 0.8877, + "step": 6471 + }, + { + "epoch": 2.9338168631006347, + "grad_norm": 0.3127458491015143, + "learning_rate": 3.488094212570293e-05, + "loss": 0.8804, + "step": 6472 + }, + { + "epoch": 2.9342701722574795, + "grad_norm": 0.2790463365079401, + "learning_rate": 3.4868387693071154e-05, + "loss": 0.891, + "step": 6473 + }, + { + "epoch": 2.9347234814143244, + "grad_norm": 0.2799173073654266, + "learning_rate": 3.485583377438543e-05, + "loss": 0.8867, + "step": 6474 + }, + { + "epoch": 2.9351767905711696, + "grad_norm": 0.2685514628955899, + "learning_rate": 3.4843280370903074e-05, + "loss": 0.8933, + "step": 6475 + }, + { + "epoch": 2.9356300997280145, + "grad_norm": 0.240818857866496, + "learning_rate": 3.483072748388136e-05, + "loss": 0.892, + "step": 6476 + }, + { + "epoch": 2.9360834088848593, + "grad_norm": 0.2968604746793986, + "learning_rate": 3.481817511457746e-05, + "loss": 0.8746, + "step": 6477 + }, + { + "epoch": 2.9365367180417046, + "grad_norm": 0.27533974327061195, + "learning_rate": 3.480562326424855e-05, + "loss": 0.8978, + "step": 6478 + }, + { + "epoch": 2.9369900271985494, + "grad_norm": 0.2703441846893393, + "learning_rate": 3.479307193415175e-05, + "loss": 0.8903, + "step": 6479 + }, + { + "epoch": 2.9374433363553942, + "grad_norm": 0.2905876235447651, + "learning_rate": 3.478052112554409e-05, + "loss": 0.8821, + "step": 6480 + }, + { + "epoch": 2.9378966455122395, + "grad_norm": 0.20368182257882275, + "learning_rate": 3.476797083968258e-05, + "loss": 0.8773, + "step": 6481 + }, + { + "epoch": 2.9383499546690843, + "grad_norm": 0.29152279467646164, + "learning_rate": 3.475542107782417e-05, + "loss": 0.9051, + "step": 6482 + }, + { + "epoch": 2.938803263825929, + "grad_norm": 0.31517234450353043, + "learning_rate": 3.474287184122575e-05, + "loss": 0.8772, + "step": 6483 + }, + { + "epoch": 2.9392565729827744, + "grad_norm": 0.27541391871415555, + "learning_rate": 3.473032313114416e-05, + "loss": 0.8737, + "step": 6484 + }, + { + "epoch": 2.9397098821396193, + "grad_norm": 0.2531834674404645, + "learning_rate": 3.47177749488362e-05, + "loss": 0.8798, + "step": 6485 + }, + { + "epoch": 2.940163191296464, + "grad_norm": 0.20629099175195328, + "learning_rate": 3.47052272955586e-05, + "loss": 0.8739, + "step": 6486 + }, + { + "epoch": 2.9406165004533094, + "grad_norm": 0.23621884170497576, + "learning_rate": 3.469268017256807e-05, + "loss": 0.8809, + "step": 6487 + }, + { + "epoch": 2.941069809610154, + "grad_norm": 0.28052615100694767, + "learning_rate": 3.4680133581121194e-05, + "loss": 0.8862, + "step": 6488 + }, + { + "epoch": 2.941523118766999, + "grad_norm": 0.26712683348639177, + "learning_rate": 3.4667587522474585e-05, + "loss": 0.8759, + "step": 6489 + }, + { + "epoch": 2.9419764279238443, + "grad_norm": 0.2690903217708349, + "learning_rate": 3.4655041997884756e-05, + "loss": 0.887, + "step": 6490 + }, + { + "epoch": 2.942429737080689, + "grad_norm": 0.29636199422420134, + "learning_rate": 3.4642497008608177e-05, + "loss": 0.8911, + "step": 6491 + }, + { + "epoch": 2.942883046237534, + "grad_norm": 0.271655945381079, + "learning_rate": 3.462995255590128e-05, + "loss": 0.8878, + "step": 6492 + }, + { + "epoch": 2.943336355394379, + "grad_norm": 0.20230405490862358, + "learning_rate": 3.46174086410204e-05, + "loss": 0.8906, + "step": 6493 + }, + { + "epoch": 2.943789664551224, + "grad_norm": 0.2596677712045442, + "learning_rate": 3.460486526522187e-05, + "loss": 0.8887, + "step": 6494 + }, + { + "epoch": 2.944242973708069, + "grad_norm": 0.26241341680567376, + "learning_rate": 3.4592322429761937e-05, + "loss": 0.8818, + "step": 6495 + }, + { + "epoch": 2.944696282864914, + "grad_norm": 0.23887953601441644, + "learning_rate": 3.45797801358968e-05, + "loss": 0.8694, + "step": 6496 + }, + { + "epoch": 2.945149592021759, + "grad_norm": 0.25036054829158944, + "learning_rate": 3.4567238384882624e-05, + "loss": 0.9025, + "step": 6497 + }, + { + "epoch": 2.945602901178604, + "grad_norm": 0.2689774238592585, + "learning_rate": 3.455469717797549e-05, + "loss": 0.8672, + "step": 6498 + }, + { + "epoch": 2.9460562103354486, + "grad_norm": 0.26456926098979167, + "learning_rate": 3.454215651643143e-05, + "loss": 0.8918, + "step": 6499 + }, + { + "epoch": 2.946509519492294, + "grad_norm": 0.26869864909425284, + "learning_rate": 3.452961640150643e-05, + "loss": 0.8728, + "step": 6500 + }, + { + "epoch": 2.9469628286491387, + "grad_norm": 0.24173958923693353, + "learning_rate": 3.4517076834456435e-05, + "loss": 0.8751, + "step": 6501 + }, + { + "epoch": 2.9474161378059835, + "grad_norm": 0.3027534460448017, + "learning_rate": 3.450453781653731e-05, + "loss": 0.8793, + "step": 6502 + }, + { + "epoch": 2.9478694469628284, + "grad_norm": 0.34974462548645985, + "learning_rate": 3.4491999349004874e-05, + "loss": 0.8825, + "step": 6503 + }, + { + "epoch": 2.9483227561196736, + "grad_norm": 0.28629564413301806, + "learning_rate": 3.447946143311488e-05, + "loss": 0.8648, + "step": 6504 + }, + { + "epoch": 2.9487760652765185, + "grad_norm": 0.2412523212040073, + "learning_rate": 3.446692407012306e-05, + "loss": 0.8811, + "step": 6505 + }, + { + "epoch": 2.9492293744333633, + "grad_norm": 0.26056106539267954, + "learning_rate": 3.445438726128505e-05, + "loss": 0.8682, + "step": 6506 + }, + { + "epoch": 2.9496826835902086, + "grad_norm": 0.2516991668039863, + "learning_rate": 3.444185100785645e-05, + "loss": 0.8709, + "step": 6507 + }, + { + "epoch": 2.9501359927470534, + "grad_norm": 0.21404289443480706, + "learning_rate": 3.442931531109281e-05, + "loss": 0.8686, + "step": 6508 + }, + { + "epoch": 2.9505893019038982, + "grad_norm": 0.25161822731129607, + "learning_rate": 3.4416780172249636e-05, + "loss": 0.9019, + "step": 6509 + }, + { + "epoch": 2.9510426110607435, + "grad_norm": 0.32408478445471306, + "learning_rate": 3.440424559258231e-05, + "loss": 0.8863, + "step": 6510 + }, + { + "epoch": 2.9514959202175883, + "grad_norm": 0.3261910884655477, + "learning_rate": 3.4391711573346236e-05, + "loss": 0.8805, + "step": 6511 + }, + { + "epoch": 2.951949229374433, + "grad_norm": 0.21609934163344785, + "learning_rate": 3.437917811579673e-05, + "loss": 0.8919, + "step": 6512 + }, + { + "epoch": 2.9524025385312784, + "grad_norm": 0.26702812801736076, + "learning_rate": 3.436664522118906e-05, + "loss": 0.877, + "step": 6513 + }, + { + "epoch": 2.9528558476881233, + "grad_norm": 0.35241623363690733, + "learning_rate": 3.435411289077843e-05, + "loss": 0.8815, + "step": 6514 + }, + { + "epoch": 2.953309156844968, + "grad_norm": 0.27234376495155926, + "learning_rate": 3.434158112581998e-05, + "loss": 0.8667, + "step": 6515 + }, + { + "epoch": 2.9537624660018134, + "grad_norm": 0.23098884976644277, + "learning_rate": 3.432904992756881e-05, + "loss": 0.8883, + "step": 6516 + }, + { + "epoch": 2.954215775158658, + "grad_norm": 0.29856515092207025, + "learning_rate": 3.4316519297279956e-05, + "loss": 0.8962, + "step": 6517 + }, + { + "epoch": 2.954669084315503, + "grad_norm": 0.28734792950495636, + "learning_rate": 3.430398923620841e-05, + "loss": 0.8562, + "step": 6518 + }, + { + "epoch": 2.9551223934723483, + "grad_norm": 0.24999850410665997, + "learning_rate": 3.4291459745609076e-05, + "loss": 0.8808, + "step": 6519 + }, + { + "epoch": 2.955575702629193, + "grad_norm": 0.2363092762507418, + "learning_rate": 3.4278930826736815e-05, + "loss": 0.8638, + "step": 6520 + }, + { + "epoch": 2.956029011786038, + "grad_norm": 0.2733818443109773, + "learning_rate": 3.4266402480846455e-05, + "loss": 0.8871, + "step": 6521 + }, + { + "epoch": 2.956482320942883, + "grad_norm": 0.24492814392639028, + "learning_rate": 3.425387470919273e-05, + "loss": 0.8815, + "step": 6522 + }, + { + "epoch": 2.956935630099728, + "grad_norm": 0.20925473711238948, + "learning_rate": 3.424134751303035e-05, + "loss": 0.8786, + "step": 6523 + }, + { + "epoch": 2.957388939256573, + "grad_norm": 0.239788972496802, + "learning_rate": 3.422882089361394e-05, + "loss": 0.8927, + "step": 6524 + }, + { + "epoch": 2.957842248413418, + "grad_norm": 0.25580484875673054, + "learning_rate": 3.421629485219807e-05, + "loss": 0.8833, + "step": 6525 + }, + { + "epoch": 2.958295557570263, + "grad_norm": 0.2742829350155543, + "learning_rate": 3.4203769390037274e-05, + "loss": 0.8759, + "step": 6526 + }, + { + "epoch": 2.958748866727108, + "grad_norm": 0.2844468762774366, + "learning_rate": 3.4191244508386e-05, + "loss": 0.8774, + "step": 6527 + }, + { + "epoch": 2.959202175883953, + "grad_norm": 0.32306820088030347, + "learning_rate": 3.417872020849869e-05, + "loss": 0.8756, + "step": 6528 + }, + { + "epoch": 2.959655485040798, + "grad_norm": 0.37004166665614363, + "learning_rate": 3.416619649162964e-05, + "loss": 0.9089, + "step": 6529 + }, + { + "epoch": 2.9601087941976427, + "grad_norm": 0.38340134459664443, + "learning_rate": 3.415367335903315e-05, + "loss": 0.8778, + "step": 6530 + }, + { + "epoch": 2.960562103354488, + "grad_norm": 0.34011606112963383, + "learning_rate": 3.414115081196346e-05, + "loss": 0.8877, + "step": 6531 + }, + { + "epoch": 2.961015412511333, + "grad_norm": 0.25132368168406033, + "learning_rate": 3.4128628851674736e-05, + "loss": 0.8935, + "step": 6532 + }, + { + "epoch": 2.9614687216681777, + "grad_norm": 0.2606057998561131, + "learning_rate": 3.411610747942109e-05, + "loss": 0.8787, + "step": 6533 + }, + { + "epoch": 2.961922030825023, + "grad_norm": 0.28974582600979065, + "learning_rate": 3.410358669645657e-05, + "loss": 0.8822, + "step": 6534 + }, + { + "epoch": 2.9623753399818678, + "grad_norm": 0.33504998361819505, + "learning_rate": 3.409106650403517e-05, + "loss": 0.8657, + "step": 6535 + }, + { + "epoch": 2.9628286491387126, + "grad_norm": 0.39895892683610473, + "learning_rate": 3.4078546903410825e-05, + "loss": 0.8761, + "step": 6536 + }, + { + "epoch": 2.9632819582955574, + "grad_norm": 0.43155543521112216, + "learning_rate": 3.406602789583741e-05, + "loss": 0.8759, + "step": 6537 + }, + { + "epoch": 2.9637352674524027, + "grad_norm": 0.39501896075336135, + "learning_rate": 3.4053509482568744e-05, + "loss": 0.8727, + "step": 6538 + }, + { + "epoch": 2.9641885766092475, + "grad_norm": 0.27884842836099927, + "learning_rate": 3.404099166485858e-05, + "loss": 0.8827, + "step": 6539 + }, + { + "epoch": 2.9646418857660923, + "grad_norm": 0.3132673438579099, + "learning_rate": 3.4028474443960613e-05, + "loss": 0.8823, + "step": 6540 + }, + { + "epoch": 2.9650951949229376, + "grad_norm": 0.2972338850907511, + "learning_rate": 3.4015957821128474e-05, + "loss": 0.8561, + "step": 6541 + }, + { + "epoch": 2.9655485040797824, + "grad_norm": 0.2781815711655358, + "learning_rate": 3.400344179761575e-05, + "loss": 0.8673, + "step": 6542 + }, + { + "epoch": 2.9660018132366273, + "grad_norm": 0.31033717103892894, + "learning_rate": 3.3990926374675955e-05, + "loss": 0.8687, + "step": 6543 + }, + { + "epoch": 2.966455122393472, + "grad_norm": 0.3705863234535467, + "learning_rate": 3.3978411553562557e-05, + "loss": 0.8845, + "step": 6544 + }, + { + "epoch": 2.9669084315503174, + "grad_norm": 0.32207809071345045, + "learning_rate": 3.396589733552892e-05, + "loss": 0.8718, + "step": 6545 + }, + { + "epoch": 2.967361740707162, + "grad_norm": 0.26876780470532624, + "learning_rate": 3.395338372182841e-05, + "loss": 0.8888, + "step": 6546 + }, + { + "epoch": 2.967815049864007, + "grad_norm": 0.3293226607790775, + "learning_rate": 3.3940870713714295e-05, + "loss": 0.883, + "step": 6547 + }, + { + "epoch": 2.9682683590208523, + "grad_norm": 0.26606550297267023, + "learning_rate": 3.392835831243978e-05, + "loss": 0.8819, + "step": 6548 + }, + { + "epoch": 2.968721668177697, + "grad_norm": 0.2567413064325023, + "learning_rate": 3.391584651925802e-05, + "loss": 0.886, + "step": 6549 + }, + { + "epoch": 2.969174977334542, + "grad_norm": 0.3013982991145944, + "learning_rate": 3.390333533542211e-05, + "loss": 0.8808, + "step": 6550 + }, + { + "epoch": 2.969628286491387, + "grad_norm": 0.2985465467814195, + "learning_rate": 3.389082476218509e-05, + "loss": 0.8788, + "step": 6551 + }, + { + "epoch": 2.970081595648232, + "grad_norm": 0.2720962421111112, + "learning_rate": 3.387831480079992e-05, + "loss": 0.8899, + "step": 6552 + }, + { + "epoch": 2.970534904805077, + "grad_norm": 0.2831213326737512, + "learning_rate": 3.3865805452519516e-05, + "loss": 0.8948, + "step": 6553 + }, + { + "epoch": 2.970988213961922, + "grad_norm": 0.2184731203603584, + "learning_rate": 3.385329671859672e-05, + "loss": 0.8803, + "step": 6554 + }, + { + "epoch": 2.971441523118767, + "grad_norm": 0.22405475348287116, + "learning_rate": 3.3840788600284325e-05, + "loss": 0.9051, + "step": 6555 + }, + { + "epoch": 2.971894832275612, + "grad_norm": 0.2619120595248383, + "learning_rate": 3.3828281098835054e-05, + "loss": 0.8965, + "step": 6556 + }, + { + "epoch": 2.972348141432457, + "grad_norm": 0.2680598995949106, + "learning_rate": 3.381577421550157e-05, + "loss": 0.8673, + "step": 6557 + }, + { + "epoch": 2.972801450589302, + "grad_norm": 0.28575751760087786, + "learning_rate": 3.380326795153647e-05, + "loss": 0.8809, + "step": 6558 + }, + { + "epoch": 2.9732547597461467, + "grad_norm": 0.30043629931103066, + "learning_rate": 3.3790762308192305e-05, + "loss": 0.9112, + "step": 6559 + }, + { + "epoch": 2.973708068902992, + "grad_norm": 0.24689134245629102, + "learning_rate": 3.377825728672154e-05, + "loss": 0.8647, + "step": 6560 + }, + { + "epoch": 2.974161378059837, + "grad_norm": 0.2842602548256748, + "learning_rate": 3.376575288837659e-05, + "loss": 0.886, + "step": 6561 + }, + { + "epoch": 2.9746146872166817, + "grad_norm": 0.22269196744245537, + "learning_rate": 3.3753249114409805e-05, + "loss": 0.8693, + "step": 6562 + }, + { + "epoch": 2.975067996373527, + "grad_norm": 0.2040247720223917, + "learning_rate": 3.374074596607349e-05, + "loss": 0.872, + "step": 6563 + }, + { + "epoch": 2.9755213055303718, + "grad_norm": 0.26650214569408814, + "learning_rate": 3.372824344461986e-05, + "loss": 0.8739, + "step": 6564 + }, + { + "epoch": 2.9759746146872166, + "grad_norm": 0.2361811317243839, + "learning_rate": 3.3715741551301076e-05, + "loss": 0.8854, + "step": 6565 + }, + { + "epoch": 2.976427923844062, + "grad_norm": 0.2214742952622839, + "learning_rate": 3.3703240287369265e-05, + "loss": 0.8799, + "step": 6566 + }, + { + "epoch": 2.9768812330009067, + "grad_norm": 0.24650868104336557, + "learning_rate": 3.369073965407643e-05, + "loss": 0.8999, + "step": 6567 + }, + { + "epoch": 2.9773345421577515, + "grad_norm": 0.2274664842890826, + "learning_rate": 3.3678239652674555e-05, + "loss": 0.8813, + "step": 6568 + }, + { + "epoch": 2.977787851314597, + "grad_norm": 0.2283322614096848, + "learning_rate": 3.3665740284415566e-05, + "loss": 0.899, + "step": 6569 + }, + { + "epoch": 2.9782411604714416, + "grad_norm": 0.28137709030461067, + "learning_rate": 3.3653241550551294e-05, + "loss": 0.8861, + "step": 6570 + }, + { + "epoch": 2.9786944696282864, + "grad_norm": 0.2844834054619371, + "learning_rate": 3.364074345233354e-05, + "loss": 0.8886, + "step": 6571 + }, + { + "epoch": 2.9791477787851317, + "grad_norm": 0.24658422758424312, + "learning_rate": 3.362824599101402e-05, + "loss": 0.8918, + "step": 6572 + }, + { + "epoch": 2.9796010879419765, + "grad_norm": 0.34390647691604737, + "learning_rate": 3.361574916784439e-05, + "loss": 0.8741, + "step": 6573 + }, + { + "epoch": 2.9800543970988214, + "grad_norm": 0.4843914824510498, + "learning_rate": 3.3603252984076243e-05, + "loss": 0.8859, + "step": 6574 + }, + { + "epoch": 2.9805077062556666, + "grad_norm": 0.5268503570301065, + "learning_rate": 3.3590757440961117e-05, + "loss": 0.8653, + "step": 6575 + }, + { + "epoch": 2.9809610154125115, + "grad_norm": 0.48638751103940214, + "learning_rate": 3.357826253975046e-05, + "loss": 0.8609, + "step": 6576 + }, + { + "epoch": 2.9814143245693563, + "grad_norm": 0.38567778484926746, + "learning_rate": 3.356576828169568e-05, + "loss": 0.8887, + "step": 6577 + }, + { + "epoch": 2.981867633726201, + "grad_norm": 0.306114188735612, + "learning_rate": 3.355327466804812e-05, + "loss": 0.8888, + "step": 6578 + }, + { + "epoch": 2.9823209428830464, + "grad_norm": 0.4064204534513284, + "learning_rate": 3.3540781700059054e-05, + "loss": 0.8797, + "step": 6579 + }, + { + "epoch": 2.982774252039891, + "grad_norm": 0.38529465769321747, + "learning_rate": 3.352828937897967e-05, + "loss": 0.8749, + "step": 6580 + }, + { + "epoch": 2.983227561196736, + "grad_norm": 0.41667025980963945, + "learning_rate": 3.351579770606113e-05, + "loss": 0.895, + "step": 6581 + }, + { + "epoch": 2.983680870353581, + "grad_norm": 0.4340750143629695, + "learning_rate": 3.3503306682554515e-05, + "loss": 0.8806, + "step": 6582 + }, + { + "epoch": 2.984134179510426, + "grad_norm": 0.40486483279650326, + "learning_rate": 3.3490816309710826e-05, + "loss": 0.878, + "step": 6583 + }, + { + "epoch": 2.984587488667271, + "grad_norm": 0.28478419072545974, + "learning_rate": 3.347832658878101e-05, + "loss": 0.8788, + "step": 6584 + }, + { + "epoch": 2.985040797824116, + "grad_norm": 0.29235633460386085, + "learning_rate": 3.346583752101597e-05, + "loss": 0.8687, + "step": 6585 + }, + { + "epoch": 2.985494106980961, + "grad_norm": 0.37688574759109894, + "learning_rate": 3.3453349107666495e-05, + "loss": 0.885, + "step": 6586 + }, + { + "epoch": 2.985947416137806, + "grad_norm": 0.3442058285392243, + "learning_rate": 3.344086134998334e-05, + "loss": 0.87, + "step": 6587 + }, + { + "epoch": 2.9864007252946507, + "grad_norm": 0.3429015063797836, + "learning_rate": 3.34283742492172e-05, + "loss": 0.9086, + "step": 6588 + }, + { + "epoch": 2.986854034451496, + "grad_norm": 0.4755717216302537, + "learning_rate": 3.341588780661869e-05, + "loss": 0.8764, + "step": 6589 + }, + { + "epoch": 2.987307343608341, + "grad_norm": 0.47421943339945194, + "learning_rate": 3.340340202343838e-05, + "loss": 0.911, + "step": 6590 + }, + { + "epoch": 2.9877606527651857, + "grad_norm": 0.3938441639819891, + "learning_rate": 3.3390916900926736e-05, + "loss": 0.8863, + "step": 6591 + }, + { + "epoch": 2.988213961922031, + "grad_norm": 0.26462295061963054, + "learning_rate": 3.337843244033419e-05, + "loss": 0.8953, + "step": 6592 + }, + { + "epoch": 2.9886672710788758, + "grad_norm": 0.33992230867075274, + "learning_rate": 3.336594864291109e-05, + "loss": 0.8831, + "step": 6593 + }, + { + "epoch": 2.9891205802357206, + "grad_norm": 0.3993311744244858, + "learning_rate": 3.335346550990773e-05, + "loss": 0.9166, + "step": 6594 + }, + { + "epoch": 2.989573889392566, + "grad_norm": 0.41821670178330056, + "learning_rate": 3.334098304257434e-05, + "loss": 0.8605, + "step": 6595 + }, + { + "epoch": 2.9900271985494107, + "grad_norm": 0.5476385397592922, + "learning_rate": 3.3328501242161055e-05, + "loss": 0.8935, + "step": 6596 + }, + { + "epoch": 2.9904805077062555, + "grad_norm": 0.26235229265599325, + "learning_rate": 3.331602010991799e-05, + "loss": 0.865, + "step": 6597 + }, + { + "epoch": 2.990933816863101, + "grad_norm": 0.32902572402215463, + "learning_rate": 3.3303539647095154e-05, + "loss": 0.8989, + "step": 6598 + }, + { + "epoch": 2.9913871260199456, + "grad_norm": 0.37421502648932015, + "learning_rate": 3.32910598549425e-05, + "loss": 0.8689, + "step": 6599 + }, + { + "epoch": 2.9918404351767904, + "grad_norm": 0.3306939206065631, + "learning_rate": 3.327858073470993e-05, + "loss": 0.8807, + "step": 6600 + }, + { + "epoch": 2.9922937443336357, + "grad_norm": 0.23265372020179048, + "learning_rate": 3.326610228764724e-05, + "loss": 0.8588, + "step": 6601 + }, + { + "epoch": 2.9927470534904805, + "grad_norm": 0.19740322160704557, + "learning_rate": 3.32536245150042e-05, + "loss": 0.8764, + "step": 6602 + }, + { + "epoch": 2.9932003626473254, + "grad_norm": 0.2117471870519739, + "learning_rate": 3.324114741803049e-05, + "loss": 0.893, + "step": 6603 + }, + { + "epoch": 2.9936536718041706, + "grad_norm": 0.26676658897211664, + "learning_rate": 3.322867099797573e-05, + "loss": 0.905, + "step": 6604 + }, + { + "epoch": 2.9941069809610155, + "grad_norm": 0.311638834321624, + "learning_rate": 3.321619525608949e-05, + "loss": 0.8859, + "step": 6605 + }, + { + "epoch": 2.9945602901178603, + "grad_norm": 0.3602172717595824, + "learning_rate": 3.320372019362121e-05, + "loss": 0.8652, + "step": 6606 + }, + { + "epoch": 2.9950135992747056, + "grad_norm": 0.331745667636148, + "learning_rate": 3.319124581182033e-05, + "loss": 0.8769, + "step": 6607 + }, + { + "epoch": 2.9954669084315504, + "grad_norm": 0.31156131832821554, + "learning_rate": 3.3178772111936195e-05, + "loss": 0.8718, + "step": 6608 + }, + { + "epoch": 2.995920217588395, + "grad_norm": 0.2769527648094418, + "learning_rate": 3.3166299095218076e-05, + "loss": 0.8816, + "step": 6609 + }, + { + "epoch": 2.9963735267452405, + "grad_norm": 0.3254666184428981, + "learning_rate": 3.315382676291519e-05, + "loss": 0.8873, + "step": 6610 + }, + { + "epoch": 2.9968268359020853, + "grad_norm": 0.3543448268156201, + "learning_rate": 3.314135511627667e-05, + "loss": 0.8815, + "step": 6611 + }, + { + "epoch": 2.99728014505893, + "grad_norm": 0.3698237809187057, + "learning_rate": 3.3128884156551594e-05, + "loss": 0.8838, + "step": 6612 + }, + { + "epoch": 2.9977334542157754, + "grad_norm": 0.3255710803683088, + "learning_rate": 3.3116413884988964e-05, + "loss": 0.903, + "step": 6613 + }, + { + "epoch": 2.9981867633726202, + "grad_norm": 0.33446641609932964, + "learning_rate": 3.310394430283772e-05, + "loss": 0.8765, + "step": 6614 + }, + { + "epoch": 2.998640072529465, + "grad_norm": 0.3331726930293665, + "learning_rate": 3.309147541134671e-05, + "loss": 0.8765, + "step": 6615 + }, + { + "epoch": 2.99909338168631, + "grad_norm": 0.24003905212895507, + "learning_rate": 3.3079007211764754e-05, + "loss": 0.8827, + "step": 6616 + }, + { + "epoch": 2.999546690843155, + "grad_norm": 0.31741214913625665, + "learning_rate": 3.306653970534056e-05, + "loss": 0.8832, + "step": 6617 + }, + { + "epoch": 3.0, + "grad_norm": 0.36119418274495274, + "learning_rate": 3.305407289332279e-05, + "loss": 0.8865, + "step": 6618 + }, + { + "epoch": 3.000453309156845, + "grad_norm": 0.3427513870582783, + "learning_rate": 3.3041606776960035e-05, + "loss": 0.8588, + "step": 6619 + }, + { + "epoch": 3.00090661831369, + "grad_norm": 0.29687491490764717, + "learning_rate": 3.302914135750081e-05, + "loss": 0.894, + "step": 6620 + }, + { + "epoch": 3.001359927470535, + "grad_norm": 0.2910542471668288, + "learning_rate": 3.3016676636193565e-05, + "loss": 0.8575, + "step": 6621 + }, + { + "epoch": 3.0018132366273798, + "grad_norm": 0.33723523466260036, + "learning_rate": 3.300421261428668e-05, + "loss": 0.8653, + "step": 6622 + }, + { + "epoch": 3.002266545784225, + "grad_norm": 0.30760435236582284, + "learning_rate": 3.299174929302846e-05, + "loss": 0.8686, + "step": 6623 + }, + { + "epoch": 3.00271985494107, + "grad_norm": 0.29241794836477125, + "learning_rate": 3.297928667366716e-05, + "loss": 0.8751, + "step": 6624 + }, + { + "epoch": 3.0031731640979147, + "grad_norm": 0.3413778681467983, + "learning_rate": 3.296682475745092e-05, + "loss": 0.8721, + "step": 6625 + }, + { + "epoch": 3.00362647325476, + "grad_norm": 0.27871702255072955, + "learning_rate": 3.295436354562785e-05, + "loss": 0.8724, + "step": 6626 + }, + { + "epoch": 3.004079782411605, + "grad_norm": 0.246302658681288, + "learning_rate": 3.2941903039445984e-05, + "loss": 0.8602, + "step": 6627 + }, + { + "epoch": 3.0045330915684496, + "grad_norm": 0.23813197473731113, + "learning_rate": 3.292944324015326e-05, + "loss": 0.8807, + "step": 6628 + }, + { + "epoch": 3.0049864007252944, + "grad_norm": 0.26018928273904224, + "learning_rate": 3.291698414899758e-05, + "loss": 0.8661, + "step": 6629 + }, + { + "epoch": 3.0054397098821397, + "grad_norm": 0.25997164301283227, + "learning_rate": 3.2904525767226755e-05, + "loss": 0.8658, + "step": 6630 + }, + { + "epoch": 3.0058930190389845, + "grad_norm": 0.26304228731733087, + "learning_rate": 3.289206809608854e-05, + "loss": 0.8822, + "step": 6631 + }, + { + "epoch": 3.0063463281958294, + "grad_norm": 0.24866880130486704, + "learning_rate": 3.287961113683058e-05, + "loss": 0.8749, + "step": 6632 + }, + { + "epoch": 3.0067996373526746, + "grad_norm": 0.23610086775712671, + "learning_rate": 3.2867154890700494e-05, + "loss": 0.8515, + "step": 6633 + }, + { + "epoch": 3.0072529465095195, + "grad_norm": 0.20954519677937103, + "learning_rate": 3.285469935894581e-05, + "loss": 0.8682, + "step": 6634 + }, + { + "epoch": 3.0077062556663643, + "grad_norm": 0.21447477148578786, + "learning_rate": 3.284224454281398e-05, + "loss": 0.8701, + "step": 6635 + }, + { + "epoch": 3.0081595648232096, + "grad_norm": 0.2189035293562137, + "learning_rate": 3.2829790443552396e-05, + "loss": 0.874, + "step": 6636 + }, + { + "epoch": 3.0086128739800544, + "grad_norm": 0.18208221363822405, + "learning_rate": 3.2817337062408374e-05, + "loss": 0.8823, + "step": 6637 + }, + { + "epoch": 3.009066183136899, + "grad_norm": 0.22347231324639383, + "learning_rate": 3.280488440062916e-05, + "loss": 0.8545, + "step": 6638 + }, + { + "epoch": 3.0095194922937445, + "grad_norm": 0.25593847365872185, + "learning_rate": 3.279243245946191e-05, + "loss": 0.8533, + "step": 6639 + }, + { + "epoch": 3.0099728014505893, + "grad_norm": 0.2350071599815082, + "learning_rate": 3.277998124015374e-05, + "loss": 0.8598, + "step": 6640 + }, + { + "epoch": 3.010426110607434, + "grad_norm": 0.29604040450071, + "learning_rate": 3.276753074395166e-05, + "loss": 0.8535, + "step": 6641 + }, + { + "epoch": 3.0108794197642794, + "grad_norm": 0.25949700824492383, + "learning_rate": 3.275508097210265e-05, + "loss": 0.8766, + "step": 6642 + }, + { + "epoch": 3.0113327289211242, + "grad_norm": 0.23165809651591504, + "learning_rate": 3.274263192585357e-05, + "loss": 0.8726, + "step": 6643 + }, + { + "epoch": 3.011786038077969, + "grad_norm": 0.45595200440790534, + "learning_rate": 3.273018360645122e-05, + "loss": 0.8584, + "step": 6644 + }, + { + "epoch": 3.0122393472348143, + "grad_norm": 0.2404210575738737, + "learning_rate": 3.2717736015142346e-05, + "loss": 0.8682, + "step": 6645 + }, + { + "epoch": 3.012692656391659, + "grad_norm": 0.2891761684863227, + "learning_rate": 3.270528915317362e-05, + "loss": 0.8722, + "step": 6646 + }, + { + "epoch": 3.013145965548504, + "grad_norm": 0.3510353268291218, + "learning_rate": 3.269284302179162e-05, + "loss": 0.8564, + "step": 6647 + }, + { + "epoch": 3.013599274705349, + "grad_norm": 0.3773347017293069, + "learning_rate": 3.268039762224286e-05, + "loss": 0.8649, + "step": 6648 + }, + { + "epoch": 3.014052583862194, + "grad_norm": 0.37446029936552694, + "learning_rate": 3.2667952955773805e-05, + "loss": 0.8842, + "step": 6649 + }, + { + "epoch": 3.014505893019039, + "grad_norm": 0.3422048707079327, + "learning_rate": 3.26555090236308e-05, + "loss": 0.8744, + "step": 6650 + }, + { + "epoch": 3.0149592021758838, + "grad_norm": 0.33035208888546347, + "learning_rate": 3.2643065827060144e-05, + "loss": 0.871, + "step": 6651 + }, + { + "epoch": 3.015412511332729, + "grad_norm": 0.30543325203782223, + "learning_rate": 3.2630623367308074e-05, + "loss": 0.8707, + "step": 6652 + }, + { + "epoch": 3.015865820489574, + "grad_norm": 0.35531049925181185, + "learning_rate": 3.261818164562074e-05, + "loss": 0.8675, + "step": 6653 + }, + { + "epoch": 3.0163191296464187, + "grad_norm": 0.3588206226857358, + "learning_rate": 3.26057406632442e-05, + "loss": 0.8662, + "step": 6654 + }, + { + "epoch": 3.016772438803264, + "grad_norm": 0.2860124228921052, + "learning_rate": 3.259330042142446e-05, + "loss": 0.8675, + "step": 6655 + }, + { + "epoch": 3.017225747960109, + "grad_norm": 0.36286269197688004, + "learning_rate": 3.258086092140746e-05, + "loss": 0.8778, + "step": 6656 + }, + { + "epoch": 3.0176790571169536, + "grad_norm": 0.2673797230391106, + "learning_rate": 3.2568422164439044e-05, + "loss": 0.8572, + "step": 6657 + }, + { + "epoch": 3.018132366273799, + "grad_norm": 0.24603533299109567, + "learning_rate": 3.255598415176499e-05, + "loss": 0.856, + "step": 6658 + }, + { + "epoch": 3.0185856754306437, + "grad_norm": 0.5526211652638051, + "learning_rate": 3.2543546884630995e-05, + "loss": 0.8727, + "step": 6659 + }, + { + "epoch": 3.0190389845874885, + "grad_norm": 0.23766965005764204, + "learning_rate": 3.253111036428269e-05, + "loss": 0.8669, + "step": 6660 + }, + { + "epoch": 3.019492293744334, + "grad_norm": 0.25472088905346957, + "learning_rate": 3.251867459196564e-05, + "loss": 0.8909, + "step": 6661 + }, + { + "epoch": 3.0199456029011786, + "grad_norm": 0.22818541608560458, + "learning_rate": 3.250623956892533e-05, + "loss": 0.86, + "step": 6662 + }, + { + "epoch": 3.0203989120580235, + "grad_norm": 0.24245283290839337, + "learning_rate": 3.2493805296407136e-05, + "loss": 0.8735, + "step": 6663 + }, + { + "epoch": 3.0208522212148687, + "grad_norm": 0.3170746078602302, + "learning_rate": 3.24813717756564e-05, + "loss": 0.8822, + "step": 6664 + }, + { + "epoch": 3.0213055303717136, + "grad_norm": 0.3657375160422951, + "learning_rate": 3.246893900791838e-05, + "loss": 0.8879, + "step": 6665 + }, + { + "epoch": 3.0217588395285584, + "grad_norm": 0.33528572027073683, + "learning_rate": 3.245650699443824e-05, + "loss": 0.8632, + "step": 6666 + }, + { + "epoch": 3.0222121486854037, + "grad_norm": 0.3272138373838642, + "learning_rate": 3.244407573646111e-05, + "loss": 0.8804, + "step": 6667 + }, + { + "epoch": 3.0226654578422485, + "grad_norm": 0.24330116129969015, + "learning_rate": 3.243164523523199e-05, + "loss": 0.8703, + "step": 6668 + }, + { + "epoch": 3.0231187669990933, + "grad_norm": 0.27347677206754123, + "learning_rate": 3.241921549199585e-05, + "loss": 0.8658, + "step": 6669 + }, + { + "epoch": 3.023572076155938, + "grad_norm": 0.32387095372015273, + "learning_rate": 3.240678650799756e-05, + "loss": 0.8773, + "step": 6670 + }, + { + "epoch": 3.0240253853127834, + "grad_norm": 0.3012960431432415, + "learning_rate": 3.239435828448191e-05, + "loss": 0.8662, + "step": 6671 + }, + { + "epoch": 3.0244786944696282, + "grad_norm": 0.23935595716596855, + "learning_rate": 3.238193082269365e-05, + "loss": 0.8504, + "step": 6672 + }, + { + "epoch": 3.024932003626473, + "grad_norm": 0.309670880910683, + "learning_rate": 3.236950412387739e-05, + "loss": 0.8757, + "step": 6673 + }, + { + "epoch": 3.0253853127833183, + "grad_norm": 0.26966552464401855, + "learning_rate": 3.2357078189277713e-05, + "loss": 0.8807, + "step": 6674 + }, + { + "epoch": 3.025838621940163, + "grad_norm": 0.2949868879584435, + "learning_rate": 3.2344653020139123e-05, + "loss": 0.8581, + "step": 6675 + }, + { + "epoch": 3.026291931097008, + "grad_norm": 0.3516005204076819, + "learning_rate": 3.233222861770603e-05, + "loss": 0.871, + "step": 6676 + }, + { + "epoch": 3.0267452402538533, + "grad_norm": 0.32385515177853386, + "learning_rate": 3.231980498322278e-05, + "loss": 0.8565, + "step": 6677 + }, + { + "epoch": 3.027198549410698, + "grad_norm": 0.2610150503970392, + "learning_rate": 3.230738211793363e-05, + "loss": 0.8762, + "step": 6678 + }, + { + "epoch": 3.027651858567543, + "grad_norm": 0.20507242790101662, + "learning_rate": 3.229496002308277e-05, + "loss": 0.8693, + "step": 6679 + }, + { + "epoch": 3.028105167724388, + "grad_norm": 0.21078917486858387, + "learning_rate": 3.228253869991431e-05, + "loss": 0.8632, + "step": 6680 + }, + { + "epoch": 3.028558476881233, + "grad_norm": 0.23939708854114447, + "learning_rate": 3.227011814967229e-05, + "loss": 0.8856, + "step": 6681 + }, + { + "epoch": 3.029011786038078, + "grad_norm": 0.24412735734841068, + "learning_rate": 3.225769837360065e-05, + "loss": 0.8495, + "step": 6682 + }, + { + "epoch": 3.029465095194923, + "grad_norm": 0.3100403177985203, + "learning_rate": 3.2245279372943264e-05, + "loss": 0.8612, + "step": 6683 + }, + { + "epoch": 3.029918404351768, + "grad_norm": 0.32796926619982547, + "learning_rate": 3.2232861148943935e-05, + "loss": 0.8721, + "step": 6684 + }, + { + "epoch": 3.030371713508613, + "grad_norm": 0.3258228132759127, + "learning_rate": 3.22204437028464e-05, + "loss": 0.871, + "step": 6685 + }, + { + "epoch": 3.030825022665458, + "grad_norm": 0.2609336337646969, + "learning_rate": 3.220802703589429e-05, + "loss": 0.8603, + "step": 6686 + }, + { + "epoch": 3.031278331822303, + "grad_norm": 0.20663520178881023, + "learning_rate": 3.219561114933116e-05, + "loss": 0.8626, + "step": 6687 + }, + { + "epoch": 3.0317316409791477, + "grad_norm": 0.2514316896768957, + "learning_rate": 3.218319604440053e-05, + "loss": 0.8662, + "step": 6688 + }, + { + "epoch": 3.0321849501359925, + "grad_norm": 0.2687393787786191, + "learning_rate": 3.217078172234578e-05, + "loss": 0.8445, + "step": 6689 + }, + { + "epoch": 3.032638259292838, + "grad_norm": 0.236979786468636, + "learning_rate": 3.2158368184410246e-05, + "loss": 0.8587, + "step": 6690 + }, + { + "epoch": 3.0330915684496826, + "grad_norm": 0.22289273991926387, + "learning_rate": 3.2145955431837194e-05, + "loss": 0.8465, + "step": 6691 + }, + { + "epoch": 3.0335448776065275, + "grad_norm": 0.25154570638005525, + "learning_rate": 3.213354346586978e-05, + "loss": 0.8733, + "step": 6692 + }, + { + "epoch": 3.0339981867633727, + "grad_norm": 0.32574470457920934, + "learning_rate": 3.2121132287751106e-05, + "loss": 0.8682, + "step": 6693 + }, + { + "epoch": 3.0344514959202176, + "grad_norm": 0.3456159802044545, + "learning_rate": 3.2108721898724194e-05, + "loss": 0.8745, + "step": 6694 + }, + { + "epoch": 3.0349048050770624, + "grad_norm": 0.29310988994991205, + "learning_rate": 3.209631230003197e-05, + "loss": 0.849, + "step": 6695 + }, + { + "epoch": 3.0353581142339077, + "grad_norm": 0.32131298180568957, + "learning_rate": 3.2083903492917306e-05, + "loss": 0.8733, + "step": 6696 + }, + { + "epoch": 3.0358114233907525, + "grad_norm": 0.29111748843731894, + "learning_rate": 3.2071495478622966e-05, + "loss": 0.8479, + "step": 6697 + }, + { + "epoch": 3.0362647325475973, + "grad_norm": 0.2485975554403887, + "learning_rate": 3.205908825839166e-05, + "loss": 0.8588, + "step": 6698 + }, + { + "epoch": 3.0367180417044426, + "grad_norm": 0.3200467320799245, + "learning_rate": 3.2046681833466e-05, + "loss": 0.8737, + "step": 6699 + }, + { + "epoch": 3.0371713508612874, + "grad_norm": 0.4373118980735874, + "learning_rate": 3.203427620508853e-05, + "loss": 0.8617, + "step": 6700 + }, + { + "epoch": 3.0376246600181322, + "grad_norm": 0.4699262386278818, + "learning_rate": 3.202187137450171e-05, + "loss": 0.8413, + "step": 6701 + }, + { + "epoch": 3.0380779691749775, + "grad_norm": 0.4086497677175102, + "learning_rate": 3.200946734294792e-05, + "loss": 0.8687, + "step": 6702 + }, + { + "epoch": 3.0385312783318223, + "grad_norm": 0.3530842586933905, + "learning_rate": 3.199706411166946e-05, + "loss": 0.8575, + "step": 6703 + }, + { + "epoch": 3.038984587488667, + "grad_norm": 0.35092766569563255, + "learning_rate": 3.198466168190855e-05, + "loss": 0.8591, + "step": 6704 + }, + { + "epoch": 3.0394378966455124, + "grad_norm": 0.3479597013899658, + "learning_rate": 3.197226005490732e-05, + "loss": 0.8717, + "step": 6705 + }, + { + "epoch": 3.0398912058023573, + "grad_norm": 0.3825536090684345, + "learning_rate": 3.195985923190785e-05, + "loss": 0.8669, + "step": 6706 + }, + { + "epoch": 3.040344514959202, + "grad_norm": 0.33058667987959894, + "learning_rate": 3.1947459214152094e-05, + "loss": 0.8545, + "step": 6707 + }, + { + "epoch": 3.040797824116047, + "grad_norm": 0.2799138561403128, + "learning_rate": 3.193506000288197e-05, + "loss": 0.8537, + "step": 6708 + }, + { + "epoch": 3.041251133272892, + "grad_norm": 0.27158628117619266, + "learning_rate": 3.192266159933929e-05, + "loss": 0.8498, + "step": 6709 + }, + { + "epoch": 3.041704442429737, + "grad_norm": 0.2645282957861585, + "learning_rate": 3.191026400476579e-05, + "loss": 0.8686, + "step": 6710 + }, + { + "epoch": 3.042157751586582, + "grad_norm": 0.34217845020508875, + "learning_rate": 3.189786722040312e-05, + "loss": 0.8688, + "step": 6711 + }, + { + "epoch": 3.042611060743427, + "grad_norm": 0.444871916290934, + "learning_rate": 3.188547124749287e-05, + "loss": 0.8702, + "step": 6712 + }, + { + "epoch": 3.043064369900272, + "grad_norm": 0.4170443628805125, + "learning_rate": 3.187307608727651e-05, + "loss": 0.8742, + "step": 6713 + }, + { + "epoch": 3.043517679057117, + "grad_norm": 0.2713541064464956, + "learning_rate": 3.186068174099548e-05, + "loss": 0.8649, + "step": 6714 + }, + { + "epoch": 3.043970988213962, + "grad_norm": 0.263120752244134, + "learning_rate": 3.184828820989108e-05, + "loss": 0.8662, + "step": 6715 + }, + { + "epoch": 3.044424297370807, + "grad_norm": 0.35081852262487395, + "learning_rate": 3.183589549520458e-05, + "loss": 0.8633, + "step": 6716 + }, + { + "epoch": 3.0448776065276517, + "grad_norm": 0.30588537516623093, + "learning_rate": 3.182350359817714e-05, + "loss": 0.8632, + "step": 6717 + }, + { + "epoch": 3.045330915684497, + "grad_norm": 0.283485184864662, + "learning_rate": 3.181111252004985e-05, + "loss": 0.8995, + "step": 6718 + }, + { + "epoch": 3.045784224841342, + "grad_norm": 0.26999905983242595, + "learning_rate": 3.17987222620637e-05, + "loss": 0.8696, + "step": 6719 + }, + { + "epoch": 3.0462375339981866, + "grad_norm": 0.2507771580144439, + "learning_rate": 3.178633282545963e-05, + "loss": 0.8641, + "step": 6720 + }, + { + "epoch": 3.046690843155032, + "grad_norm": 0.22674899876283372, + "learning_rate": 3.177394421147846e-05, + "loss": 0.8618, + "step": 6721 + }, + { + "epoch": 3.0471441523118767, + "grad_norm": 0.21132055775106098, + "learning_rate": 3.176155642136095e-05, + "loss": 0.8786, + "step": 6722 + }, + { + "epoch": 3.0475974614687216, + "grad_norm": 0.24181677280227717, + "learning_rate": 3.1749169456347775e-05, + "loss": 0.8663, + "step": 6723 + }, + { + "epoch": 3.048050770625567, + "grad_norm": 0.22427388600785875, + "learning_rate": 3.173678331767953e-05, + "loss": 0.8773, + "step": 6724 + }, + { + "epoch": 3.0485040797824117, + "grad_norm": 0.24835904761373784, + "learning_rate": 3.172439800659672e-05, + "loss": 0.8779, + "step": 6725 + }, + { + "epoch": 3.0489573889392565, + "grad_norm": 0.19602302580347095, + "learning_rate": 3.1712013524339774e-05, + "loss": 0.8828, + "step": 6726 + }, + { + "epoch": 3.0494106980961013, + "grad_norm": 0.2518246582184627, + "learning_rate": 3.169962987214903e-05, + "loss": 0.8794, + "step": 6727 + }, + { + "epoch": 3.0498640072529466, + "grad_norm": 0.25193542606093694, + "learning_rate": 3.1687247051264744e-05, + "loss": 0.8579, + "step": 6728 + }, + { + "epoch": 3.0503173164097914, + "grad_norm": 0.20890757433552287, + "learning_rate": 3.167486506292711e-05, + "loss": 0.8609, + "step": 6729 + }, + { + "epoch": 3.0507706255666363, + "grad_norm": 0.21986981776141404, + "learning_rate": 3.1662483908376195e-05, + "loss": 0.8669, + "step": 6730 + }, + { + "epoch": 3.0512239347234815, + "grad_norm": 0.24525741890410738, + "learning_rate": 3.1650103588852025e-05, + "loss": 0.8521, + "step": 6731 + }, + { + "epoch": 3.0516772438803264, + "grad_norm": 0.20798514758050998, + "learning_rate": 3.163772410559451e-05, + "loss": 0.8676, + "step": 6732 + }, + { + "epoch": 3.052130553037171, + "grad_norm": 0.22467663141054062, + "learning_rate": 3.1625345459843514e-05, + "loss": 0.8796, + "step": 6733 + }, + { + "epoch": 3.0525838621940165, + "grad_norm": 0.21438806490850498, + "learning_rate": 3.161296765283878e-05, + "loss": 0.8814, + "step": 6734 + }, + { + "epoch": 3.0530371713508613, + "grad_norm": 0.21534688998913726, + "learning_rate": 3.160059068581999e-05, + "loss": 0.8709, + "step": 6735 + }, + { + "epoch": 3.053490480507706, + "grad_norm": 0.20869449764861464, + "learning_rate": 3.158821456002672e-05, + "loss": 0.8753, + "step": 6736 + }, + { + "epoch": 3.0539437896645514, + "grad_norm": 0.21494139267052598, + "learning_rate": 3.1575839276698474e-05, + "loss": 0.8771, + "step": 6737 + }, + { + "epoch": 3.054397098821396, + "grad_norm": 0.275035463554469, + "learning_rate": 3.15634648370747e-05, + "loss": 0.8603, + "step": 6738 + }, + { + "epoch": 3.054850407978241, + "grad_norm": 0.2716266419644364, + "learning_rate": 3.1551091242394726e-05, + "loss": 0.8735, + "step": 6739 + }, + { + "epoch": 3.0553037171350863, + "grad_norm": 0.2539460167247427, + "learning_rate": 3.153871849389778e-05, + "loss": 0.8857, + "step": 6740 + }, + { + "epoch": 3.055757026291931, + "grad_norm": 0.2187160088718892, + "learning_rate": 3.152634659282305e-05, + "loss": 0.8626, + "step": 6741 + }, + { + "epoch": 3.056210335448776, + "grad_norm": 0.188029512790935, + "learning_rate": 3.15139755404096e-05, + "loss": 0.8735, + "step": 6742 + }, + { + "epoch": 3.0566636446056212, + "grad_norm": 0.2465515250303918, + "learning_rate": 3.150160533789644e-05, + "loss": 0.8825, + "step": 6743 + }, + { + "epoch": 3.057116953762466, + "grad_norm": 0.21502519166106424, + "learning_rate": 3.14892359865225e-05, + "loss": 0.8743, + "step": 6744 + }, + { + "epoch": 3.057570262919311, + "grad_norm": 0.2293548296359317, + "learning_rate": 3.147686748752658e-05, + "loss": 0.863, + "step": 6745 + }, + { + "epoch": 3.0580235720761557, + "grad_norm": 0.23965198175725475, + "learning_rate": 3.1464499842147424e-05, + "loss": 0.858, + "step": 6746 + }, + { + "epoch": 3.058476881233001, + "grad_norm": 0.24011312586749217, + "learning_rate": 3.14521330516237e-05, + "loss": 0.8623, + "step": 6747 + }, + { + "epoch": 3.058930190389846, + "grad_norm": 0.23511051919904577, + "learning_rate": 3.143976711719397e-05, + "loss": 0.8669, + "step": 6748 + }, + { + "epoch": 3.0593834995466906, + "grad_norm": 0.279583751988113, + "learning_rate": 3.142740204009671e-05, + "loss": 0.8658, + "step": 6749 + }, + { + "epoch": 3.059836808703536, + "grad_norm": 0.25481072534588156, + "learning_rate": 3.141503782157035e-05, + "loss": 0.8812, + "step": 6750 + }, + { + "epoch": 3.0602901178603807, + "grad_norm": 0.2180233016209016, + "learning_rate": 3.1402674462853164e-05, + "loss": 0.8685, + "step": 6751 + }, + { + "epoch": 3.0607434270172256, + "grad_norm": 0.27888857533477246, + "learning_rate": 3.139031196518341e-05, + "loss": 0.884, + "step": 6752 + }, + { + "epoch": 3.061196736174071, + "grad_norm": 0.2559658022227573, + "learning_rate": 3.137795032979922e-05, + "loss": 0.8778, + "step": 6753 + }, + { + "epoch": 3.0616500453309157, + "grad_norm": 0.2713691791663324, + "learning_rate": 3.136558955793863e-05, + "loss": 0.8612, + "step": 6754 + }, + { + "epoch": 3.0621033544877605, + "grad_norm": 0.34376295093821213, + "learning_rate": 3.1353229650839635e-05, + "loss": 0.8534, + "step": 6755 + }, + { + "epoch": 3.0625566636446058, + "grad_norm": 0.2974694040281412, + "learning_rate": 3.1340870609740104e-05, + "loss": 0.8474, + "step": 6756 + }, + { + "epoch": 3.0630099728014506, + "grad_norm": 0.28312295862102116, + "learning_rate": 3.132851243587782e-05, + "loss": 0.8494, + "step": 6757 + }, + { + "epoch": 3.0634632819582954, + "grad_norm": 0.23205934417820312, + "learning_rate": 3.131615513049051e-05, + "loss": 0.8631, + "step": 6758 + }, + { + "epoch": 3.0639165911151407, + "grad_norm": 0.2554512529013222, + "learning_rate": 3.130379869481578e-05, + "loss": 0.8527, + "step": 6759 + }, + { + "epoch": 3.0643699002719855, + "grad_norm": 0.25158765321869175, + "learning_rate": 3.129144313009118e-05, + "loss": 0.8589, + "step": 6760 + }, + { + "epoch": 3.0648232094288304, + "grad_norm": 0.281872158669947, + "learning_rate": 3.1279088437554124e-05, + "loss": 0.8511, + "step": 6761 + }, + { + "epoch": 3.0652765185856756, + "grad_norm": 0.34480079058763563, + "learning_rate": 3.1266734618442e-05, + "loss": 0.8784, + "step": 6762 + }, + { + "epoch": 3.0657298277425205, + "grad_norm": 0.3596951866366137, + "learning_rate": 3.125438167399207e-05, + "loss": 0.8798, + "step": 6763 + }, + { + "epoch": 3.0661831368993653, + "grad_norm": 0.3307677562570365, + "learning_rate": 3.1242029605441515e-05, + "loss": 0.8375, + "step": 6764 + }, + { + "epoch": 3.0666364460562106, + "grad_norm": 0.2778718993836123, + "learning_rate": 3.122967841402744e-05, + "loss": 0.8572, + "step": 6765 + }, + { + "epoch": 3.0670897552130554, + "grad_norm": 0.2671070914465489, + "learning_rate": 3.121732810098684e-05, + "loss": 0.8916, + "step": 6766 + }, + { + "epoch": 3.0675430643699, + "grad_norm": 0.25663876191191215, + "learning_rate": 3.120497866755665e-05, + "loss": 0.871, + "step": 6767 + }, + { + "epoch": 3.067996373526745, + "grad_norm": 0.22022628875884473, + "learning_rate": 3.119263011497369e-05, + "loss": 0.8547, + "step": 6768 + }, + { + "epoch": 3.0684496826835903, + "grad_norm": 0.33159800429536174, + "learning_rate": 3.1180282444474706e-05, + "loss": 0.8698, + "step": 6769 + }, + { + "epoch": 3.068902991840435, + "grad_norm": 0.33985993996552205, + "learning_rate": 3.1167935657296374e-05, + "loss": 0.8538, + "step": 6770 + }, + { + "epoch": 3.06935630099728, + "grad_norm": 0.2906384918099532, + "learning_rate": 3.1155589754675226e-05, + "loss": 0.8618, + "step": 6771 + }, + { + "epoch": 3.0698096101541252, + "grad_norm": 0.22472147547669347, + "learning_rate": 3.114324473784776e-05, + "loss": 0.8581, + "step": 6772 + }, + { + "epoch": 3.07026291931097, + "grad_norm": 0.20501334517293598, + "learning_rate": 3.1130900608050364e-05, + "loss": 0.881, + "step": 6773 + }, + { + "epoch": 3.070716228467815, + "grad_norm": 0.30259799641408597, + "learning_rate": 3.1118557366519335e-05, + "loss": 0.8519, + "step": 6774 + }, + { + "epoch": 3.07116953762466, + "grad_norm": 0.39089966854646074, + "learning_rate": 3.110621501449089e-05, + "loss": 0.8774, + "step": 6775 + }, + { + "epoch": 3.071622846781505, + "grad_norm": 0.3869260766776637, + "learning_rate": 3.109387355320115e-05, + "loss": 0.8598, + "step": 6776 + }, + { + "epoch": 3.07207615593835, + "grad_norm": 0.2369311024203603, + "learning_rate": 3.108153298388616e-05, + "loss": 0.8455, + "step": 6777 + }, + { + "epoch": 3.072529465095195, + "grad_norm": 0.19702397504559563, + "learning_rate": 3.106919330778184e-05, + "loss": 0.8687, + "step": 6778 + }, + { + "epoch": 3.07298277425204, + "grad_norm": 0.2445466720302129, + "learning_rate": 3.105685452612406e-05, + "loss": 0.9014, + "step": 6779 + }, + { + "epoch": 3.0734360834088847, + "grad_norm": 0.30777546600132955, + "learning_rate": 3.104451664014858e-05, + "loss": 0.8579, + "step": 6780 + }, + { + "epoch": 3.07388939256573, + "grad_norm": 0.37916253153046603, + "learning_rate": 3.103217965109107e-05, + "loss": 0.8676, + "step": 6781 + }, + { + "epoch": 3.074342701722575, + "grad_norm": 0.3219578970098882, + "learning_rate": 3.101984356018714e-05, + "loss": 0.8821, + "step": 6782 + }, + { + "epoch": 3.0747960108794197, + "grad_norm": 0.22656272095002375, + "learning_rate": 3.1007508368672255e-05, + "loss": 0.8638, + "step": 6783 + }, + { + "epoch": 3.075249320036265, + "grad_norm": 0.2721147590378564, + "learning_rate": 3.099517407778184e-05, + "loss": 0.8657, + "step": 6784 + }, + { + "epoch": 3.0757026291931098, + "grad_norm": 0.2801104930863645, + "learning_rate": 3.098284068875121e-05, + "loss": 0.8576, + "step": 6785 + }, + { + "epoch": 3.0761559383499546, + "grad_norm": 0.29389991773917234, + "learning_rate": 3.0970508202815595e-05, + "loss": 0.8876, + "step": 6786 + }, + { + "epoch": 3.0766092475067994, + "grad_norm": 0.25660024479018345, + "learning_rate": 3.095817662121011e-05, + "loss": 0.8682, + "step": 6787 + }, + { + "epoch": 3.0770625566636447, + "grad_norm": 0.23113169305422215, + "learning_rate": 3.0945845945169806e-05, + "loss": 0.8755, + "step": 6788 + }, + { + "epoch": 3.0775158658204895, + "grad_norm": 0.26741389644533653, + "learning_rate": 3.0933516175929646e-05, + "loss": 0.8889, + "step": 6789 + }, + { + "epoch": 3.0779691749773344, + "grad_norm": 0.321211726324061, + "learning_rate": 3.0921187314724486e-05, + "loss": 0.87, + "step": 6790 + }, + { + "epoch": 3.0784224841341796, + "grad_norm": 0.2884657927696219, + "learning_rate": 3.090885936278909e-05, + "loss": 0.8548, + "step": 6791 + }, + { + "epoch": 3.0788757932910245, + "grad_norm": 0.2734440470734266, + "learning_rate": 3.089653232135815e-05, + "loss": 0.8787, + "step": 6792 + }, + { + "epoch": 3.0793291024478693, + "grad_norm": 0.3457316901955548, + "learning_rate": 3.0884206191666264e-05, + "loss": 0.8507, + "step": 6793 + }, + { + "epoch": 3.0797824116047146, + "grad_norm": 0.3475809435760021, + "learning_rate": 3.087188097494791e-05, + "loss": 0.8767, + "step": 6794 + }, + { + "epoch": 3.0802357207615594, + "grad_norm": 0.32299222124258553, + "learning_rate": 3.0859556672437507e-05, + "loss": 0.8642, + "step": 6795 + }, + { + "epoch": 3.080689029918404, + "grad_norm": 0.30149421657925407, + "learning_rate": 3.0847233285369366e-05, + "loss": 0.8605, + "step": 6796 + }, + { + "epoch": 3.0811423390752495, + "grad_norm": 0.2323490755996479, + "learning_rate": 3.0834910814977716e-05, + "loss": 0.8741, + "step": 6797 + }, + { + "epoch": 3.0815956482320943, + "grad_norm": 0.2592935781536578, + "learning_rate": 3.0822589262496685e-05, + "loss": 0.8738, + "step": 6798 + }, + { + "epoch": 3.082048957388939, + "grad_norm": 0.2849352562102695, + "learning_rate": 3.08102686291603e-05, + "loss": 0.8518, + "step": 6799 + }, + { + "epoch": 3.0825022665457844, + "grad_norm": 0.2536403146790516, + "learning_rate": 3.079794891620252e-05, + "loss": 0.8773, + "step": 6800 + }, + { + "epoch": 3.0829555757026292, + "grad_norm": 0.27245306849528644, + "learning_rate": 3.078563012485722e-05, + "loss": 0.856, + "step": 6801 + }, + { + "epoch": 3.083408884859474, + "grad_norm": 0.28374882079483804, + "learning_rate": 3.077331225635812e-05, + "loss": 0.8676, + "step": 6802 + }, + { + "epoch": 3.0838621940163193, + "grad_norm": 0.2995497432608759, + "learning_rate": 3.0760995311938916e-05, + "loss": 0.8647, + "step": 6803 + }, + { + "epoch": 3.084315503173164, + "grad_norm": 0.38580238306447145, + "learning_rate": 3.0748679292833197e-05, + "loss": 0.8531, + "step": 6804 + }, + { + "epoch": 3.084768812330009, + "grad_norm": 0.35071350904459303, + "learning_rate": 3.073636420027443e-05, + "loss": 0.8618, + "step": 6805 + }, + { + "epoch": 3.0852221214868543, + "grad_norm": 0.27843892042802165, + "learning_rate": 3.072405003549601e-05, + "loss": 0.8922, + "step": 6806 + }, + { + "epoch": 3.085675430643699, + "grad_norm": 0.21875036802264353, + "learning_rate": 3.0711736799731244e-05, + "loss": 0.8557, + "step": 6807 + }, + { + "epoch": 3.086128739800544, + "grad_norm": 0.2649735591384151, + "learning_rate": 3.069942449421333e-05, + "loss": 0.8742, + "step": 6808 + }, + { + "epoch": 3.0865820489573887, + "grad_norm": 0.2960817306309814, + "learning_rate": 3.068711312017539e-05, + "loss": 0.8628, + "step": 6809 + }, + { + "epoch": 3.087035358114234, + "grad_norm": 0.3700182735228946, + "learning_rate": 3.067480267885044e-05, + "loss": 0.8541, + "step": 6810 + }, + { + "epoch": 3.087488667271079, + "grad_norm": 0.4423318544911209, + "learning_rate": 3.066249317147141e-05, + "loss": 0.8594, + "step": 6811 + }, + { + "epoch": 3.0879419764279237, + "grad_norm": 0.4192059070514196, + "learning_rate": 3.0650184599271144e-05, + "loss": 0.8588, + "step": 6812 + }, + { + "epoch": 3.088395285584769, + "grad_norm": 0.293938115986666, + "learning_rate": 3.0637876963482354e-05, + "loss": 0.8716, + "step": 6813 + }, + { + "epoch": 3.0888485947416138, + "grad_norm": 0.22587641925547594, + "learning_rate": 3.0625570265337704e-05, + "loss": 0.8679, + "step": 6814 + }, + { + "epoch": 3.0893019038984586, + "grad_norm": 0.4120045515471505, + "learning_rate": 3.0613264506069736e-05, + "loss": 0.8603, + "step": 6815 + }, + { + "epoch": 3.089755213055304, + "grad_norm": 0.43371531811405617, + "learning_rate": 3.0600959686910945e-05, + "loss": 0.8736, + "step": 6816 + }, + { + "epoch": 3.0902085222121487, + "grad_norm": 0.28241899628509554, + "learning_rate": 3.0588655809093634e-05, + "loss": 0.8785, + "step": 6817 + }, + { + "epoch": 3.0906618313689935, + "grad_norm": 0.3213665916410064, + "learning_rate": 3.057635287385011e-05, + "loss": 0.8756, + "step": 6818 + }, + { + "epoch": 3.091115140525839, + "grad_norm": 0.3184283888041792, + "learning_rate": 3.056405088241254e-05, + "loss": 0.8653, + "step": 6819 + }, + { + "epoch": 3.0915684496826836, + "grad_norm": 0.24828307499568866, + "learning_rate": 3.055174983601301e-05, + "loss": 0.8605, + "step": 6820 + }, + { + "epoch": 3.0920217588395285, + "grad_norm": 0.27511479345344436, + "learning_rate": 3.05394497358835e-05, + "loss": 0.8611, + "step": 6821 + }, + { + "epoch": 3.0924750679963737, + "grad_norm": 0.3206367546513383, + "learning_rate": 3.0527150583255894e-05, + "loss": 0.8906, + "step": 6822 + }, + { + "epoch": 3.0929283771532186, + "grad_norm": 0.2723101010624193, + "learning_rate": 3.051485237936201e-05, + "loss": 0.863, + "step": 6823 + }, + { + "epoch": 3.0933816863100634, + "grad_norm": 0.21277987931940595, + "learning_rate": 3.050255512543353e-05, + "loss": 0.8578, + "step": 6824 + }, + { + "epoch": 3.093834995466908, + "grad_norm": 0.2593615636408207, + "learning_rate": 3.049025882270207e-05, + "loss": 0.8569, + "step": 6825 + }, + { + "epoch": 3.0942883046237535, + "grad_norm": 0.2569106489740553, + "learning_rate": 3.047796347239913e-05, + "loss": 0.8695, + "step": 6826 + }, + { + "epoch": 3.0947416137805983, + "grad_norm": 0.25642120103404265, + "learning_rate": 3.0465669075756147e-05, + "loss": 0.8585, + "step": 6827 + }, + { + "epoch": 3.095194922937443, + "grad_norm": 0.3045619657923179, + "learning_rate": 3.045337563400442e-05, + "loss": 0.8516, + "step": 6828 + }, + { + "epoch": 3.0956482320942884, + "grad_norm": 0.33117620573644785, + "learning_rate": 3.0441083148375177e-05, + "loss": 0.8651, + "step": 6829 + }, + { + "epoch": 3.0961015412511332, + "grad_norm": 0.3844423726419092, + "learning_rate": 3.042879162009955e-05, + "loss": 0.8734, + "step": 6830 + }, + { + "epoch": 3.096554850407978, + "grad_norm": 0.31604546131645905, + "learning_rate": 3.0416501050408573e-05, + "loss": 0.8637, + "step": 6831 + }, + { + "epoch": 3.0970081595648233, + "grad_norm": 0.879526639381843, + "learning_rate": 3.0404211440533182e-05, + "loss": 0.8569, + "step": 6832 + }, + { + "epoch": 3.097461468721668, + "grad_norm": 0.24245485349842658, + "learning_rate": 3.0391922791704216e-05, + "loss": 0.8554, + "step": 6833 + }, + { + "epoch": 3.097914777878513, + "grad_norm": 0.23635504568142499, + "learning_rate": 3.0379635105152417e-05, + "loss": 0.8555, + "step": 6834 + }, + { + "epoch": 3.0983680870353583, + "grad_norm": 0.35372935677364364, + "learning_rate": 3.036734838210846e-05, + "loss": 0.8765, + "step": 6835 + }, + { + "epoch": 3.098821396192203, + "grad_norm": 0.2811755392874431, + "learning_rate": 3.0355062623802848e-05, + "loss": 0.8816, + "step": 6836 + }, + { + "epoch": 3.099274705349048, + "grad_norm": 0.2632277497800812, + "learning_rate": 3.0342777831466065e-05, + "loss": 0.8621, + "step": 6837 + }, + { + "epoch": 3.099728014505893, + "grad_norm": 0.35722221465524673, + "learning_rate": 3.0330494006328458e-05, + "loss": 0.8935, + "step": 6838 + }, + { + "epoch": 3.100181323662738, + "grad_norm": 0.3556399812491913, + "learning_rate": 3.0318211149620304e-05, + "loss": 0.8813, + "step": 6839 + }, + { + "epoch": 3.100634632819583, + "grad_norm": 0.26161295223227843, + "learning_rate": 3.0305929262571747e-05, + "loss": 0.8777, + "step": 6840 + }, + { + "epoch": 3.101087941976428, + "grad_norm": 0.3355366947884857, + "learning_rate": 3.0293648346412872e-05, + "loss": 0.8549, + "step": 6841 + }, + { + "epoch": 3.101541251133273, + "grad_norm": 0.4443215306244027, + "learning_rate": 3.0281368402373647e-05, + "loss": 0.8667, + "step": 6842 + }, + { + "epoch": 3.1019945602901178, + "grad_norm": 0.3643025077238908, + "learning_rate": 3.0269089431683932e-05, + "loss": 0.8671, + "step": 6843 + }, + { + "epoch": 3.102447869446963, + "grad_norm": 0.3141456991150922, + "learning_rate": 3.0256811435573508e-05, + "loss": 0.8681, + "step": 6844 + }, + { + "epoch": 3.102901178603808, + "grad_norm": 0.2532473306658191, + "learning_rate": 3.0244534415272056e-05, + "loss": 0.8842, + "step": 6845 + }, + { + "epoch": 3.1033544877606527, + "grad_norm": 0.22159979574702565, + "learning_rate": 3.0232258372009153e-05, + "loss": 0.8855, + "step": 6846 + }, + { + "epoch": 3.1038077969174975, + "grad_norm": 0.21725910367517562, + "learning_rate": 3.0219983307014277e-05, + "loss": 0.8728, + "step": 6847 + }, + { + "epoch": 3.104261106074343, + "grad_norm": 0.2818716071817324, + "learning_rate": 3.0207709221516815e-05, + "loss": 0.8778, + "step": 6848 + }, + { + "epoch": 3.1047144152311876, + "grad_norm": 0.3701584554484048, + "learning_rate": 3.019543611674606e-05, + "loss": 0.8914, + "step": 6849 + }, + { + "epoch": 3.1051677243880325, + "grad_norm": 0.2735425954876108, + "learning_rate": 3.018316399393119e-05, + "loss": 0.8581, + "step": 6850 + }, + { + "epoch": 3.1056210335448777, + "grad_norm": 0.2152253826677586, + "learning_rate": 3.0170892854301294e-05, + "loss": 0.8831, + "step": 6851 + }, + { + "epoch": 3.1060743427017226, + "grad_norm": 0.27921545078287807, + "learning_rate": 3.0158622699085368e-05, + "loss": 0.8765, + "step": 6852 + }, + { + "epoch": 3.1065276518585674, + "grad_norm": 0.2863374074039194, + "learning_rate": 3.0146353529512315e-05, + "loss": 0.8741, + "step": 6853 + }, + { + "epoch": 3.1069809610154127, + "grad_norm": 0.31208843083877, + "learning_rate": 3.0134085346810907e-05, + "loss": 0.8761, + "step": 6854 + }, + { + "epoch": 3.1074342701722575, + "grad_norm": 0.2097766456913727, + "learning_rate": 3.012181815220984e-05, + "loss": 0.8723, + "step": 6855 + }, + { + "epoch": 3.1078875793291023, + "grad_norm": 0.27810299827040563, + "learning_rate": 3.0109551946937722e-05, + "loss": 0.8822, + "step": 6856 + }, + { + "epoch": 3.1083408884859476, + "grad_norm": 0.20210495366346679, + "learning_rate": 3.0097286732223038e-05, + "loss": 0.8755, + "step": 6857 + }, + { + "epoch": 3.1087941976427924, + "grad_norm": 0.26804787417995574, + "learning_rate": 3.0085022509294207e-05, + "loss": 0.8656, + "step": 6858 + }, + { + "epoch": 3.1092475067996372, + "grad_norm": 0.3211735849155731, + "learning_rate": 3.00727592793795e-05, + "loss": 0.8545, + "step": 6859 + }, + { + "epoch": 3.1097008159564825, + "grad_norm": 0.2637493716534845, + "learning_rate": 3.0060497043707123e-05, + "loss": 0.8724, + "step": 6860 + }, + { + "epoch": 3.1101541251133273, + "grad_norm": 0.19725349138691253, + "learning_rate": 3.004823580350518e-05, + "loss": 0.8618, + "step": 6861 + }, + { + "epoch": 3.110607434270172, + "grad_norm": 0.28347910242781066, + "learning_rate": 3.003597556000167e-05, + "loss": 0.8843, + "step": 6862 + }, + { + "epoch": 3.1110607434270174, + "grad_norm": 0.2384525941155289, + "learning_rate": 3.002371631442449e-05, + "loss": 0.8753, + "step": 6863 + }, + { + "epoch": 3.1115140525838623, + "grad_norm": 0.2759544014509861, + "learning_rate": 3.0011458068001446e-05, + "loss": 0.8822, + "step": 6864 + }, + { + "epoch": 3.111967361740707, + "grad_norm": 0.2358599500585341, + "learning_rate": 2.9999200821960226e-05, + "loss": 0.8606, + "step": 6865 + }, + { + "epoch": 3.112420670897552, + "grad_norm": 0.23302433639714829, + "learning_rate": 2.998694457752844e-05, + "loss": 0.8778, + "step": 6866 + }, + { + "epoch": 3.112873980054397, + "grad_norm": 0.2299689364446447, + "learning_rate": 2.997468933593358e-05, + "loss": 0.8681, + "step": 6867 + }, + { + "epoch": 3.113327289211242, + "grad_norm": 0.23938443684096947, + "learning_rate": 2.9962435098403055e-05, + "loss": 0.8611, + "step": 6868 + }, + { + "epoch": 3.113780598368087, + "grad_norm": 0.2838973860665008, + "learning_rate": 2.995018186616415e-05, + "loss": 0.8637, + "step": 6869 + }, + { + "epoch": 3.114233907524932, + "grad_norm": 0.21807596184496125, + "learning_rate": 2.9937929640444064e-05, + "loss": 0.8766, + "step": 6870 + }, + { + "epoch": 3.114687216681777, + "grad_norm": 0.25863876368555316, + "learning_rate": 2.992567842246989e-05, + "loss": 0.8561, + "step": 6871 + }, + { + "epoch": 3.1151405258386218, + "grad_norm": 0.3548794123765615, + "learning_rate": 2.991342821346864e-05, + "loss": 0.8525, + "step": 6872 + }, + { + "epoch": 3.115593834995467, + "grad_norm": 0.28561014570436977, + "learning_rate": 2.9901179014667213e-05, + "loss": 0.8894, + "step": 6873 + }, + { + "epoch": 3.116047144152312, + "grad_norm": 0.23178391763206013, + "learning_rate": 2.988893082729237e-05, + "loss": 0.8732, + "step": 6874 + }, + { + "epoch": 3.1165004533091567, + "grad_norm": 0.24311129148069888, + "learning_rate": 2.987668365257082e-05, + "loss": 0.8613, + "step": 6875 + }, + { + "epoch": 3.116953762466002, + "grad_norm": 0.2832967871461975, + "learning_rate": 2.986443749172915e-05, + "loss": 0.8699, + "step": 6876 + }, + { + "epoch": 3.117407071622847, + "grad_norm": 0.2695449057594331, + "learning_rate": 2.9852192345993857e-05, + "loss": 0.8463, + "step": 6877 + }, + { + "epoch": 3.1178603807796916, + "grad_norm": 0.22375048019439067, + "learning_rate": 2.9839948216591326e-05, + "loss": 0.8714, + "step": 6878 + }, + { + "epoch": 3.118313689936537, + "grad_norm": 0.19805607525623262, + "learning_rate": 2.982770510474784e-05, + "loss": 0.8731, + "step": 6879 + }, + { + "epoch": 3.1187669990933817, + "grad_norm": 0.20963506475752908, + "learning_rate": 2.981546301168958e-05, + "loss": 0.8879, + "step": 6880 + }, + { + "epoch": 3.1192203082502266, + "grad_norm": 0.3017468522612877, + "learning_rate": 2.9803221938642637e-05, + "loss": 0.8548, + "step": 6881 + }, + { + "epoch": 3.119673617407072, + "grad_norm": 0.30333019751281254, + "learning_rate": 2.979098188683298e-05, + "loss": 0.871, + "step": 6882 + }, + { + "epoch": 3.1201269265639167, + "grad_norm": 0.20599326589431943, + "learning_rate": 2.9778742857486487e-05, + "loss": 0.8688, + "step": 6883 + }, + { + "epoch": 3.1205802357207615, + "grad_norm": 0.23675134142309234, + "learning_rate": 2.976650485182895e-05, + "loss": 0.871, + "step": 6884 + }, + { + "epoch": 3.1210335448776063, + "grad_norm": 0.1882228290424865, + "learning_rate": 2.9754267871086024e-05, + "loss": 0.842, + "step": 6885 + }, + { + "epoch": 3.1214868540344516, + "grad_norm": 0.2705946780293412, + "learning_rate": 2.9742031916483278e-05, + "loss": 0.8592, + "step": 6886 + }, + { + "epoch": 3.1219401631912964, + "grad_norm": 0.2936303932848504, + "learning_rate": 2.9729796989246185e-05, + "loss": 0.8499, + "step": 6887 + }, + { + "epoch": 3.1223934723481412, + "grad_norm": 0.33988270868072173, + "learning_rate": 2.9717563090600108e-05, + "loss": 0.8745, + "step": 6888 + }, + { + "epoch": 3.1228467815049865, + "grad_norm": 0.29108229119724444, + "learning_rate": 2.97053302217703e-05, + "loss": 0.8765, + "step": 6889 + }, + { + "epoch": 3.1233000906618313, + "grad_norm": 0.3076975484607649, + "learning_rate": 2.969309838398193e-05, + "loss": 0.8517, + "step": 6890 + }, + { + "epoch": 3.123753399818676, + "grad_norm": 0.2982449980796256, + "learning_rate": 2.968086757846005e-05, + "loss": 0.8625, + "step": 6891 + }, + { + "epoch": 3.1242067089755214, + "grad_norm": 0.20199119287479758, + "learning_rate": 2.9668637806429607e-05, + "loss": 0.8717, + "step": 6892 + }, + { + "epoch": 3.1246600181323663, + "grad_norm": 0.23056446457146773, + "learning_rate": 2.9656409069115467e-05, + "loss": 0.8667, + "step": 6893 + }, + { + "epoch": 3.125113327289211, + "grad_norm": 0.3505697114335065, + "learning_rate": 2.964418136774234e-05, + "loss": 0.857, + "step": 6894 + }, + { + "epoch": 3.1255666364460564, + "grad_norm": 0.2653925856809053, + "learning_rate": 2.963195470353489e-05, + "loss": 0.862, + "step": 6895 + }, + { + "epoch": 3.126019945602901, + "grad_norm": 0.25406560786069815, + "learning_rate": 2.9619729077717644e-05, + "loss": 0.8627, + "step": 6896 + }, + { + "epoch": 3.126473254759746, + "grad_norm": 0.3538304461659733, + "learning_rate": 2.9607504491515037e-05, + "loss": 0.8521, + "step": 6897 + }, + { + "epoch": 3.1269265639165913, + "grad_norm": 0.30162802017571055, + "learning_rate": 2.9595280946151404e-05, + "loss": 0.8776, + "step": 6898 + }, + { + "epoch": 3.127379873073436, + "grad_norm": 0.2837298016404098, + "learning_rate": 2.9583058442850966e-05, + "loss": 0.8559, + "step": 6899 + }, + { + "epoch": 3.127833182230281, + "grad_norm": 0.3118800284909707, + "learning_rate": 2.9570836982837828e-05, + "loss": 0.8632, + "step": 6900 + }, + { + "epoch": 3.1282864913871262, + "grad_norm": 0.2380000070623176, + "learning_rate": 2.9558616567336026e-05, + "loss": 0.856, + "step": 6901 + }, + { + "epoch": 3.128739800543971, + "grad_norm": 0.22886988593491253, + "learning_rate": 2.9546397197569456e-05, + "loss": 0.8653, + "step": 6902 + }, + { + "epoch": 3.129193109700816, + "grad_norm": 0.22762245043049278, + "learning_rate": 2.9534178874761932e-05, + "loss": 0.8703, + "step": 6903 + }, + { + "epoch": 3.1296464188576607, + "grad_norm": 0.21923899710381167, + "learning_rate": 2.9521961600137147e-05, + "loss": 0.8713, + "step": 6904 + }, + { + "epoch": 3.130099728014506, + "grad_norm": 0.2558153359715103, + "learning_rate": 2.9509745374918707e-05, + "loss": 0.8638, + "step": 6905 + }, + { + "epoch": 3.130553037171351, + "grad_norm": 0.2869348030842721, + "learning_rate": 2.9497530200330094e-05, + "loss": 0.8668, + "step": 6906 + }, + { + "epoch": 3.1310063463281956, + "grad_norm": 0.24925063269858783, + "learning_rate": 2.9485316077594707e-05, + "loss": 0.8629, + "step": 6907 + }, + { + "epoch": 3.131459655485041, + "grad_norm": 0.24097197756682648, + "learning_rate": 2.9473103007935803e-05, + "loss": 0.8623, + "step": 6908 + }, + { + "epoch": 3.1319129646418857, + "grad_norm": 0.2964704253887973, + "learning_rate": 2.946089099257658e-05, + "loss": 0.8644, + "step": 6909 + }, + { + "epoch": 3.1323662737987306, + "grad_norm": 0.24902393049512392, + "learning_rate": 2.9448680032740104e-05, + "loss": 0.8746, + "step": 6910 + }, + { + "epoch": 3.132819582955576, + "grad_norm": 0.24108034620988752, + "learning_rate": 2.9436470129649322e-05, + "loss": 0.8635, + "step": 6911 + }, + { + "epoch": 3.1332728921124207, + "grad_norm": 0.30161949580671366, + "learning_rate": 2.9424261284527117e-05, + "loss": 0.8638, + "step": 6912 + }, + { + "epoch": 3.1337262012692655, + "grad_norm": 0.21399402689817354, + "learning_rate": 2.941205349859621e-05, + "loss": 0.8737, + "step": 6913 + }, + { + "epoch": 3.1341795104261108, + "grad_norm": 0.27094288476685724, + "learning_rate": 2.939984677307927e-05, + "loss": 0.8877, + "step": 6914 + }, + { + "epoch": 3.1346328195829556, + "grad_norm": 0.31247049854310194, + "learning_rate": 2.9387641109198832e-05, + "loss": 0.8686, + "step": 6915 + }, + { + "epoch": 3.1350861287398004, + "grad_norm": 0.19539652709328992, + "learning_rate": 2.9375436508177317e-05, + "loss": 0.8548, + "step": 6916 + }, + { + "epoch": 3.1355394378966457, + "grad_norm": 0.24278909466095572, + "learning_rate": 2.9363232971237062e-05, + "loss": 0.8492, + "step": 6917 + }, + { + "epoch": 3.1359927470534905, + "grad_norm": 0.30110492259604843, + "learning_rate": 2.9351030499600286e-05, + "loss": 0.8699, + "step": 6918 + }, + { + "epoch": 3.1364460562103353, + "grad_norm": 0.29210823241251854, + "learning_rate": 2.9338829094489104e-05, + "loss": 0.8642, + "step": 6919 + }, + { + "epoch": 3.1368993653671806, + "grad_norm": 0.26470458692419097, + "learning_rate": 2.9326628757125517e-05, + "loss": 0.8502, + "step": 6920 + }, + { + "epoch": 3.1373526745240254, + "grad_norm": 0.27699128918947663, + "learning_rate": 2.9314429488731432e-05, + "loss": 0.8522, + "step": 6921 + }, + { + "epoch": 3.1378059836808703, + "grad_norm": 0.28893211730729623, + "learning_rate": 2.9302231290528635e-05, + "loss": 0.8649, + "step": 6922 + }, + { + "epoch": 3.1382592928377155, + "grad_norm": 0.2871337408216156, + "learning_rate": 2.9290034163738816e-05, + "loss": 0.8718, + "step": 6923 + }, + { + "epoch": 3.1387126019945604, + "grad_norm": 0.2061338152733656, + "learning_rate": 2.9277838109583553e-05, + "loss": 0.8799, + "step": 6924 + }, + { + "epoch": 3.139165911151405, + "grad_norm": 0.26905839298579576, + "learning_rate": 2.9265643129284317e-05, + "loss": 0.8741, + "step": 6925 + }, + { + "epoch": 3.13961922030825, + "grad_norm": 0.19164297169077849, + "learning_rate": 2.9253449224062472e-05, + "loss": 0.8458, + "step": 6926 + }, + { + "epoch": 3.1400725294650953, + "grad_norm": 0.2604966871007538, + "learning_rate": 2.9241256395139264e-05, + "loss": 0.8773, + "step": 6927 + }, + { + "epoch": 3.14052583862194, + "grad_norm": 0.2883671430918621, + "learning_rate": 2.9229064643735844e-05, + "loss": 0.8669, + "step": 6928 + }, + { + "epoch": 3.140979147778785, + "grad_norm": 0.268920614337893, + "learning_rate": 2.9216873971073263e-05, + "loss": 0.8761, + "step": 6929 + }, + { + "epoch": 3.1414324569356302, + "grad_norm": 0.24565496436619613, + "learning_rate": 2.920468437837244e-05, + "loss": 0.8767, + "step": 6930 + }, + { + "epoch": 3.141885766092475, + "grad_norm": 0.19426042430093968, + "learning_rate": 2.919249586685422e-05, + "loss": 0.8604, + "step": 6931 + }, + { + "epoch": 3.14233907524932, + "grad_norm": 0.17796039708324474, + "learning_rate": 2.918030843773929e-05, + "loss": 0.8571, + "step": 6932 + }, + { + "epoch": 3.142792384406165, + "grad_norm": 0.2421543320507795, + "learning_rate": 2.9168122092248264e-05, + "loss": 0.8767, + "step": 6933 + }, + { + "epoch": 3.14324569356301, + "grad_norm": 0.28116250681669, + "learning_rate": 2.9155936831601645e-05, + "loss": 0.864, + "step": 6934 + }, + { + "epoch": 3.143699002719855, + "grad_norm": 0.2523367884387069, + "learning_rate": 2.914375265701982e-05, + "loss": 0.8905, + "step": 6935 + }, + { + "epoch": 3.1441523118767, + "grad_norm": 0.19207956740480206, + "learning_rate": 2.9131569569723077e-05, + "loss": 0.8634, + "step": 6936 + }, + { + "epoch": 3.144605621033545, + "grad_norm": 0.24673915044478562, + "learning_rate": 2.9119387570931583e-05, + "loss": 0.8498, + "step": 6937 + }, + { + "epoch": 3.1450589301903897, + "grad_norm": 0.22934436090036062, + "learning_rate": 2.9107206661865393e-05, + "loss": 0.8593, + "step": 6938 + }, + { + "epoch": 3.145512239347235, + "grad_norm": 0.22207664731420784, + "learning_rate": 2.9095026843744477e-05, + "loss": 0.8718, + "step": 6939 + }, + { + "epoch": 3.14596554850408, + "grad_norm": 0.20847860716562241, + "learning_rate": 2.9082848117788683e-05, + "loss": 0.893, + "step": 6940 + }, + { + "epoch": 3.1464188576609247, + "grad_norm": 0.2365687492431127, + "learning_rate": 2.9070670485217722e-05, + "loss": 0.8663, + "step": 6941 + }, + { + "epoch": 3.1468721668177695, + "grad_norm": 0.198926620615235, + "learning_rate": 2.905849394725123e-05, + "loss": 0.8593, + "step": 6942 + }, + { + "epoch": 3.1473254759746148, + "grad_norm": 0.2464997330089236, + "learning_rate": 2.9046318505108727e-05, + "loss": 0.8584, + "step": 6943 + }, + { + "epoch": 3.1477787851314596, + "grad_norm": 0.19274254057838472, + "learning_rate": 2.9034144160009624e-05, + "loss": 0.8717, + "step": 6944 + }, + { + "epoch": 3.1482320942883044, + "grad_norm": 0.2455409167510631, + "learning_rate": 2.9021970913173202e-05, + "loss": 0.8802, + "step": 6945 + }, + { + "epoch": 3.1486854034451497, + "grad_norm": 0.24017678561981254, + "learning_rate": 2.9009798765818665e-05, + "loss": 0.8358, + "step": 6946 + }, + { + "epoch": 3.1491387126019945, + "grad_norm": 0.21186281780275526, + "learning_rate": 2.899762771916508e-05, + "loss": 0.8746, + "step": 6947 + }, + { + "epoch": 3.1495920217588393, + "grad_norm": 0.23794500124084023, + "learning_rate": 2.898545777443141e-05, + "loss": 0.8779, + "step": 6948 + }, + { + "epoch": 3.1500453309156846, + "grad_norm": 0.25705285942466, + "learning_rate": 2.8973288932836517e-05, + "loss": 0.845, + "step": 6949 + }, + { + "epoch": 3.1504986400725294, + "grad_norm": 0.2637930197674585, + "learning_rate": 2.8961121195599166e-05, + "loss": 0.8641, + "step": 6950 + }, + { + "epoch": 3.1509519492293743, + "grad_norm": 0.20625380347430503, + "learning_rate": 2.894895456393795e-05, + "loss": 0.8322, + "step": 6951 + }, + { + "epoch": 3.1514052583862195, + "grad_norm": 0.27409985533010894, + "learning_rate": 2.8936789039071408e-05, + "loss": 0.8728, + "step": 6952 + }, + { + "epoch": 3.1518585675430644, + "grad_norm": 0.29902423439141684, + "learning_rate": 2.892462462221796e-05, + "loss": 0.8768, + "step": 6953 + }, + { + "epoch": 3.152311876699909, + "grad_norm": 0.2603711946019893, + "learning_rate": 2.891246131459591e-05, + "loss": 0.8629, + "step": 6954 + }, + { + "epoch": 3.1527651858567545, + "grad_norm": 0.22087000658575628, + "learning_rate": 2.890029911742345e-05, + "loss": 0.8631, + "step": 6955 + }, + { + "epoch": 3.1532184950135993, + "grad_norm": 0.2318627341134867, + "learning_rate": 2.8888138031918647e-05, + "loss": 0.8582, + "step": 6956 + }, + { + "epoch": 3.153671804170444, + "grad_norm": 0.27081878757212274, + "learning_rate": 2.8875978059299478e-05, + "loss": 0.8515, + "step": 6957 + }, + { + "epoch": 3.1541251133272894, + "grad_norm": 0.28043712343437543, + "learning_rate": 2.8863819200783802e-05, + "loss": 0.8524, + "step": 6958 + }, + { + "epoch": 3.1545784224841342, + "grad_norm": 0.2562062438291131, + "learning_rate": 2.8851661457589357e-05, + "loss": 0.88, + "step": 6959 + }, + { + "epoch": 3.155031731640979, + "grad_norm": 0.21739776601864433, + "learning_rate": 2.8839504830933783e-05, + "loss": 0.8772, + "step": 6960 + }, + { + "epoch": 3.1554850407978243, + "grad_norm": 0.1883227322180692, + "learning_rate": 2.8827349322034596e-05, + "loss": 0.8752, + "step": 6961 + }, + { + "epoch": 3.155938349954669, + "grad_norm": 0.2637718834560836, + "learning_rate": 2.8815194932109216e-05, + "loss": 0.8892, + "step": 6962 + }, + { + "epoch": 3.156391659111514, + "grad_norm": 0.3055369425840871, + "learning_rate": 2.880304166237493e-05, + "loss": 0.8906, + "step": 6963 + }, + { + "epoch": 3.1568449682683593, + "grad_norm": 0.2687555341827451, + "learning_rate": 2.8790889514048936e-05, + "loss": 0.873, + "step": 6964 + }, + { + "epoch": 3.157298277425204, + "grad_norm": 0.20049359599370142, + "learning_rate": 2.8778738488348294e-05, + "loss": 0.8688, + "step": 6965 + }, + { + "epoch": 3.157751586582049, + "grad_norm": 0.25929527341187325, + "learning_rate": 2.8766588586489984e-05, + "loss": 0.8566, + "step": 6966 + }, + { + "epoch": 3.1582048957388937, + "grad_norm": 0.2549711915983522, + "learning_rate": 2.875443980969083e-05, + "loss": 0.8705, + "step": 6967 + }, + { + "epoch": 3.158658204895739, + "grad_norm": 0.2480678725472705, + "learning_rate": 2.8742292159167583e-05, + "loss": 0.88, + "step": 6968 + }, + { + "epoch": 3.159111514052584, + "grad_norm": 0.2933891121473232, + "learning_rate": 2.8730145636136866e-05, + "loss": 0.8781, + "step": 6969 + }, + { + "epoch": 3.1595648232094287, + "grad_norm": 0.27849999837111644, + "learning_rate": 2.871800024181518e-05, + "loss": 0.8611, + "step": 6970 + }, + { + "epoch": 3.160018132366274, + "grad_norm": 0.21029203569735633, + "learning_rate": 2.8705855977418927e-05, + "loss": 0.8891, + "step": 6971 + }, + { + "epoch": 3.1604714415231188, + "grad_norm": 0.2975711840370749, + "learning_rate": 2.869371284416439e-05, + "loss": 0.8531, + "step": 6972 + }, + { + "epoch": 3.1609247506799636, + "grad_norm": 0.3610182089677336, + "learning_rate": 2.8681570843267745e-05, + "loss": 0.8715, + "step": 6973 + }, + { + "epoch": 3.161378059836809, + "grad_norm": 0.3089989178348143, + "learning_rate": 2.8669429975945036e-05, + "loss": 0.8651, + "step": 6974 + }, + { + "epoch": 3.1618313689936537, + "grad_norm": 0.2466358012750128, + "learning_rate": 2.8657290243412217e-05, + "loss": 0.8505, + "step": 6975 + }, + { + "epoch": 3.1622846781504985, + "grad_norm": 0.28684692878251467, + "learning_rate": 2.8645151646885117e-05, + "loss": 0.8599, + "step": 6976 + }, + { + "epoch": 3.162737987307344, + "grad_norm": 0.2870145980034456, + "learning_rate": 2.8633014187579448e-05, + "loss": 0.8557, + "step": 6977 + }, + { + "epoch": 3.1631912964641886, + "grad_norm": 0.3287629815622556, + "learning_rate": 2.8620877866710816e-05, + "loss": 0.8811, + "step": 6978 + }, + { + "epoch": 3.1636446056210334, + "grad_norm": 0.2098727755082707, + "learning_rate": 2.860874268549471e-05, + "loss": 0.8533, + "step": 6979 + }, + { + "epoch": 3.1640979147778787, + "grad_norm": 0.27343561685517065, + "learning_rate": 2.8596608645146502e-05, + "loss": 0.8491, + "step": 6980 + }, + { + "epoch": 3.1645512239347235, + "grad_norm": 0.25700855470520845, + "learning_rate": 2.858447574688146e-05, + "loss": 0.8624, + "step": 6981 + }, + { + "epoch": 3.1650045330915684, + "grad_norm": 0.19862276437408424, + "learning_rate": 2.8572343991914713e-05, + "loss": 0.852, + "step": 6982 + }, + { + "epoch": 3.165457842248413, + "grad_norm": 0.2548989110723322, + "learning_rate": 2.85602133814613e-05, + "loss": 0.8796, + "step": 6983 + }, + { + "epoch": 3.1659111514052585, + "grad_norm": 0.3387943267318919, + "learning_rate": 2.854808391673614e-05, + "loss": 0.8688, + "step": 6984 + }, + { + "epoch": 3.1663644605621033, + "grad_norm": 0.2220967457759192, + "learning_rate": 2.8535955598954025e-05, + "loss": 0.8498, + "step": 6985 + }, + { + "epoch": 3.166817769718948, + "grad_norm": 0.247587230959362, + "learning_rate": 2.852382842932966e-05, + "loss": 0.8558, + "step": 6986 + }, + { + "epoch": 3.1672710788757934, + "grad_norm": 0.2799334187030419, + "learning_rate": 2.85117024090776e-05, + "loss": 0.8834, + "step": 6987 + }, + { + "epoch": 3.1677243880326382, + "grad_norm": 0.21661665103927905, + "learning_rate": 2.8499577539412303e-05, + "loss": 0.8837, + "step": 6988 + }, + { + "epoch": 3.168177697189483, + "grad_norm": 0.23548526065761005, + "learning_rate": 2.8487453821548138e-05, + "loss": 0.8825, + "step": 6989 + }, + { + "epoch": 3.1686310063463283, + "grad_norm": 0.31261066287782124, + "learning_rate": 2.847533125669929e-05, + "loss": 0.8658, + "step": 6990 + }, + { + "epoch": 3.169084315503173, + "grad_norm": 0.21016991904446714, + "learning_rate": 2.8463209846079886e-05, + "loss": 0.8695, + "step": 6991 + }, + { + "epoch": 3.169537624660018, + "grad_norm": 0.288324197450245, + "learning_rate": 2.845108959090392e-05, + "loss": 0.8717, + "step": 6992 + }, + { + "epoch": 3.1699909338168633, + "grad_norm": 0.3284733319247781, + "learning_rate": 2.8438970492385275e-05, + "loss": 0.8634, + "step": 6993 + }, + { + "epoch": 3.170444242973708, + "grad_norm": 0.25002844734240737, + "learning_rate": 2.842685255173771e-05, + "loss": 0.8446, + "step": 6994 + }, + { + "epoch": 3.170897552130553, + "grad_norm": 0.348679939161987, + "learning_rate": 2.841473577017488e-05, + "loss": 0.8646, + "step": 6995 + }, + { + "epoch": 3.171350861287398, + "grad_norm": 0.3684462226257516, + "learning_rate": 2.840262014891031e-05, + "loss": 0.8676, + "step": 6996 + }, + { + "epoch": 3.171804170444243, + "grad_norm": 0.22709302657326774, + "learning_rate": 2.8390505689157416e-05, + "loss": 0.8742, + "step": 6997 + }, + { + "epoch": 3.172257479601088, + "grad_norm": 0.27375909349028565, + "learning_rate": 2.83783923921295e-05, + "loss": 0.8725, + "step": 6998 + }, + { + "epoch": 3.172710788757933, + "grad_norm": 0.2935445723706622, + "learning_rate": 2.8366280259039736e-05, + "loss": 0.8811, + "step": 6999 + }, + { + "epoch": 3.173164097914778, + "grad_norm": 0.36118956440822103, + "learning_rate": 2.835416929110119e-05, + "loss": 0.8442, + "step": 7000 + }, + { + "epoch": 3.1736174070716228, + "grad_norm": 0.4061849292837669, + "learning_rate": 2.8342059489526824e-05, + "loss": 0.8632, + "step": 7001 + }, + { + "epoch": 3.174070716228468, + "grad_norm": 0.29801711546010046, + "learning_rate": 2.8329950855529458e-05, + "loss": 0.8884, + "step": 7002 + }, + { + "epoch": 3.174524025385313, + "grad_norm": 0.24528213775262087, + "learning_rate": 2.831784339032181e-05, + "loss": 0.8787, + "step": 7003 + }, + { + "epoch": 3.1749773345421577, + "grad_norm": 0.3336704111641633, + "learning_rate": 2.8305737095116484e-05, + "loss": 0.8572, + "step": 7004 + }, + { + "epoch": 3.1754306436990025, + "grad_norm": 0.2999662323022854, + "learning_rate": 2.8293631971125955e-05, + "loss": 0.8601, + "step": 7005 + }, + { + "epoch": 3.175883952855848, + "grad_norm": 0.24679009819821005, + "learning_rate": 2.8281528019562592e-05, + "loss": 0.8697, + "step": 7006 + }, + { + "epoch": 3.1763372620126926, + "grad_norm": 0.259289536775999, + "learning_rate": 2.8269425241638634e-05, + "loss": 0.8508, + "step": 7007 + }, + { + "epoch": 3.1767905711695374, + "grad_norm": 0.21066782706498954, + "learning_rate": 2.8257323638566227e-05, + "loss": 0.8643, + "step": 7008 + }, + { + "epoch": 3.1772438803263827, + "grad_norm": 0.19536342416825248, + "learning_rate": 2.824522321155736e-05, + "loss": 0.8634, + "step": 7009 + }, + { + "epoch": 3.1776971894832275, + "grad_norm": 0.23538785196555306, + "learning_rate": 2.823312396182393e-05, + "loss": 0.8806, + "step": 7010 + }, + { + "epoch": 3.1781504986400724, + "grad_norm": 0.2107859114316996, + "learning_rate": 2.8221025890577716e-05, + "loss": 0.8495, + "step": 7011 + }, + { + "epoch": 3.1786038077969176, + "grad_norm": 0.20962995785188374, + "learning_rate": 2.8208928999030387e-05, + "loss": 0.8811, + "step": 7012 + }, + { + "epoch": 3.1790571169537625, + "grad_norm": 0.18752409904204942, + "learning_rate": 2.8196833288393466e-05, + "loss": 0.8656, + "step": 7013 + }, + { + "epoch": 3.1795104261106073, + "grad_norm": 0.22172138333328842, + "learning_rate": 2.8184738759878375e-05, + "loss": 0.8546, + "step": 7014 + }, + { + "epoch": 3.1799637352674526, + "grad_norm": 0.25532128966778467, + "learning_rate": 2.8172645414696423e-05, + "loss": 0.8616, + "step": 7015 + }, + { + "epoch": 3.1804170444242974, + "grad_norm": 0.3210163340925334, + "learning_rate": 2.81605532540588e-05, + "loss": 0.8826, + "step": 7016 + }, + { + "epoch": 3.1808703535811422, + "grad_norm": 0.207948701120001, + "learning_rate": 2.814846227917656e-05, + "loss": 0.8593, + "step": 7017 + }, + { + "epoch": 3.1813236627379875, + "grad_norm": 0.27138805537596256, + "learning_rate": 2.8136372491260648e-05, + "loss": 0.8767, + "step": 7018 + }, + { + "epoch": 3.1817769718948323, + "grad_norm": 0.29438984433656523, + "learning_rate": 2.8124283891521904e-05, + "loss": 0.8784, + "step": 7019 + }, + { + "epoch": 3.182230281051677, + "grad_norm": 0.20813375758286856, + "learning_rate": 2.811219648117103e-05, + "loss": 0.8679, + "step": 7020 + }, + { + "epoch": 3.182683590208522, + "grad_norm": 0.3227121904132127, + "learning_rate": 2.8100110261418606e-05, + "loss": 0.8637, + "step": 7021 + }, + { + "epoch": 3.1831368993653673, + "grad_norm": 0.2580579981013693, + "learning_rate": 2.8088025233475123e-05, + "loss": 0.882, + "step": 7022 + }, + { + "epoch": 3.183590208522212, + "grad_norm": 0.2778074580435852, + "learning_rate": 2.807594139855093e-05, + "loss": 0.8486, + "step": 7023 + }, + { + "epoch": 3.184043517679057, + "grad_norm": 0.2851299918406578, + "learning_rate": 2.8063858757856234e-05, + "loss": 0.8724, + "step": 7024 + }, + { + "epoch": 3.184496826835902, + "grad_norm": 0.21771659031868298, + "learning_rate": 2.805177731260117e-05, + "loss": 0.8584, + "step": 7025 + }, + { + "epoch": 3.184950135992747, + "grad_norm": 0.2859556541102524, + "learning_rate": 2.8039697063995725e-05, + "loss": 0.8733, + "step": 7026 + }, + { + "epoch": 3.185403445149592, + "grad_norm": 0.2567365619659707, + "learning_rate": 2.802761801324978e-05, + "loss": 0.8596, + "step": 7027 + }, + { + "epoch": 3.185856754306437, + "grad_norm": 0.1919800119592503, + "learning_rate": 2.8015540161573062e-05, + "loss": 0.8498, + "step": 7028 + }, + { + "epoch": 3.186310063463282, + "grad_norm": 0.28941898409851513, + "learning_rate": 2.800346351017522e-05, + "loss": 0.8561, + "step": 7029 + }, + { + "epoch": 3.1867633726201268, + "grad_norm": 0.22949077125713613, + "learning_rate": 2.7991388060265766e-05, + "loss": 0.8917, + "step": 7030 + }, + { + "epoch": 3.187216681776972, + "grad_norm": 0.23988505864439696, + "learning_rate": 2.797931381305409e-05, + "loss": 0.861, + "step": 7031 + }, + { + "epoch": 3.187669990933817, + "grad_norm": 0.22613308710109137, + "learning_rate": 2.796724076974946e-05, + "loss": 0.8715, + "step": 7032 + }, + { + "epoch": 3.1881233000906617, + "grad_norm": 0.22539984513030858, + "learning_rate": 2.7955168931561034e-05, + "loss": 0.8598, + "step": 7033 + }, + { + "epoch": 3.188576609247507, + "grad_norm": 0.2151448111852147, + "learning_rate": 2.794309829969784e-05, + "loss": 0.8736, + "step": 7034 + }, + { + "epoch": 3.189029918404352, + "grad_norm": 0.22673186729104428, + "learning_rate": 2.7931028875368782e-05, + "loss": 0.8707, + "step": 7035 + }, + { + "epoch": 3.1894832275611966, + "grad_norm": 0.2250134767305316, + "learning_rate": 2.7918960659782652e-05, + "loss": 0.8777, + "step": 7036 + }, + { + "epoch": 3.189936536718042, + "grad_norm": 0.24361982382225164, + "learning_rate": 2.790689365414812e-05, + "loss": 0.8814, + "step": 7037 + }, + { + "epoch": 3.1903898458748867, + "grad_norm": 0.26427002660125076, + "learning_rate": 2.7894827859673738e-05, + "loss": 0.8862, + "step": 7038 + }, + { + "epoch": 3.1908431550317315, + "grad_norm": 0.2570473797302161, + "learning_rate": 2.7882763277567915e-05, + "loss": 0.8695, + "step": 7039 + }, + { + "epoch": 3.191296464188577, + "grad_norm": 0.25718185074528666, + "learning_rate": 2.787069990903896e-05, + "loss": 0.8848, + "step": 7040 + }, + { + "epoch": 3.1917497733454216, + "grad_norm": 0.25468066256810284, + "learning_rate": 2.7858637755295057e-05, + "loss": 0.8619, + "step": 7041 + }, + { + "epoch": 3.1922030825022665, + "grad_norm": 0.20185331950228208, + "learning_rate": 2.784657681754426e-05, + "loss": 0.8606, + "step": 7042 + }, + { + "epoch": 3.1926563916591117, + "grad_norm": 0.22031907612490312, + "learning_rate": 2.7834517096994518e-05, + "loss": 0.8873, + "step": 7043 + }, + { + "epoch": 3.1931097008159566, + "grad_norm": 0.2545507893105532, + "learning_rate": 2.7822458594853646e-05, + "loss": 0.8886, + "step": 7044 + }, + { + "epoch": 3.1935630099728014, + "grad_norm": 0.2388702367441169, + "learning_rate": 2.7810401312329332e-05, + "loss": 0.8609, + "step": 7045 + }, + { + "epoch": 3.1940163191296462, + "grad_norm": 0.20793644433831554, + "learning_rate": 2.7798345250629162e-05, + "loss": 0.8677, + "step": 7046 + }, + { + "epoch": 3.1944696282864915, + "grad_norm": 0.20971409853771064, + "learning_rate": 2.7786290410960567e-05, + "loss": 0.8898, + "step": 7047 + }, + { + "epoch": 3.1949229374433363, + "grad_norm": 0.23601012708070127, + "learning_rate": 2.7774236794530875e-05, + "loss": 0.8869, + "step": 7048 + }, + { + "epoch": 3.195376246600181, + "grad_norm": 0.22272085315299947, + "learning_rate": 2.776218440254731e-05, + "loss": 0.8773, + "step": 7049 + }, + { + "epoch": 3.1958295557570264, + "grad_norm": 0.2385071964154399, + "learning_rate": 2.775013323621693e-05, + "loss": 0.8607, + "step": 7050 + }, + { + "epoch": 3.1962828649138713, + "grad_norm": 0.22617826734757215, + "learning_rate": 2.7738083296746715e-05, + "loss": 0.8768, + "step": 7051 + }, + { + "epoch": 3.196736174070716, + "grad_norm": 0.24289829225010073, + "learning_rate": 2.7726034585343495e-05, + "loss": 0.8628, + "step": 7052 + }, + { + "epoch": 3.1971894832275614, + "grad_norm": 0.22609194035250577, + "learning_rate": 2.7713987103213994e-05, + "loss": 0.877, + "step": 7053 + }, + { + "epoch": 3.197642792384406, + "grad_norm": 0.2069374358548237, + "learning_rate": 2.7701940851564776e-05, + "loss": 0.885, + "step": 7054 + }, + { + "epoch": 3.198096101541251, + "grad_norm": 0.20223546282834698, + "learning_rate": 2.7689895831602335e-05, + "loss": 0.8567, + "step": 7055 + }, + { + "epoch": 3.1985494106980963, + "grad_norm": 0.20975401501733387, + "learning_rate": 2.7677852044532998e-05, + "loss": 0.8736, + "step": 7056 + }, + { + "epoch": 3.199002719854941, + "grad_norm": 0.17877963739675004, + "learning_rate": 2.7665809491562998e-05, + "loss": 0.8683, + "step": 7057 + }, + { + "epoch": 3.199456029011786, + "grad_norm": 0.1879526883847707, + "learning_rate": 2.7653768173898424e-05, + "loss": 0.8583, + "step": 7058 + }, + { + "epoch": 3.199909338168631, + "grad_norm": 0.19414071052424592, + "learning_rate": 2.7641728092745252e-05, + "loss": 0.8472, + "step": 7059 + }, + { + "epoch": 3.200362647325476, + "grad_norm": 0.19544481430732857, + "learning_rate": 2.7629689249309336e-05, + "loss": 0.8743, + "step": 7060 + }, + { + "epoch": 3.200815956482321, + "grad_norm": 0.23621332892840305, + "learning_rate": 2.7617651644796395e-05, + "loss": 0.8827, + "step": 7061 + }, + { + "epoch": 3.2012692656391657, + "grad_norm": 0.19546370977419714, + "learning_rate": 2.760561528041203e-05, + "loss": 0.8653, + "step": 7062 + }, + { + "epoch": 3.201722574796011, + "grad_norm": 0.18579385596291215, + "learning_rate": 2.7593580157361725e-05, + "loss": 0.8549, + "step": 7063 + }, + { + "epoch": 3.202175883952856, + "grad_norm": 0.26885441272519545, + "learning_rate": 2.758154627685083e-05, + "loss": 0.8573, + "step": 7064 + }, + { + "epoch": 3.2026291931097006, + "grad_norm": 0.26091255619884696, + "learning_rate": 2.7569513640084583e-05, + "loss": 0.8367, + "step": 7065 + }, + { + "epoch": 3.203082502266546, + "grad_norm": 0.21172365081578495, + "learning_rate": 2.755748224826806e-05, + "loss": 0.8627, + "step": 7066 + }, + { + "epoch": 3.2035358114233907, + "grad_norm": 0.1751487372063716, + "learning_rate": 2.7545452102606262e-05, + "loss": 0.8732, + "step": 7067 + }, + { + "epoch": 3.2039891205802356, + "grad_norm": 0.22509314866384822, + "learning_rate": 2.753342320430404e-05, + "loss": 0.8728, + "step": 7068 + }, + { + "epoch": 3.204442429737081, + "grad_norm": 0.30118513214935766, + "learning_rate": 2.752139555456613e-05, + "loss": 0.8686, + "step": 7069 + }, + { + "epoch": 3.2048957388939256, + "grad_norm": 0.28159331106267915, + "learning_rate": 2.750936915459712e-05, + "loss": 0.855, + "step": 7070 + }, + { + "epoch": 3.2053490480507705, + "grad_norm": 0.2597931143419873, + "learning_rate": 2.74973440056015e-05, + "loss": 0.8514, + "step": 7071 + }, + { + "epoch": 3.2058023572076157, + "grad_norm": 0.21359942295248302, + "learning_rate": 2.7485320108783626e-05, + "loss": 0.8553, + "step": 7072 + }, + { + "epoch": 3.2062556663644606, + "grad_norm": 0.23610742141066066, + "learning_rate": 2.7473297465347716e-05, + "loss": 0.8543, + "step": 7073 + }, + { + "epoch": 3.2067089755213054, + "grad_norm": 0.2893811985593424, + "learning_rate": 2.7461276076497882e-05, + "loss": 0.8515, + "step": 7074 + }, + { + "epoch": 3.2071622846781507, + "grad_norm": 0.3248646277597937, + "learning_rate": 2.7449255943438102e-05, + "loss": 0.8894, + "step": 7075 + }, + { + "epoch": 3.2076155938349955, + "grad_norm": 0.24170282263243223, + "learning_rate": 2.7437237067372225e-05, + "loss": 0.8576, + "step": 7076 + }, + { + "epoch": 3.2080689029918403, + "grad_norm": 0.18407331015871886, + "learning_rate": 2.742521944950397e-05, + "loss": 0.8671, + "step": 7077 + }, + { + "epoch": 3.2085222121486856, + "grad_norm": 0.19414469844510868, + "learning_rate": 2.741320309103695e-05, + "loss": 0.8746, + "step": 7078 + }, + { + "epoch": 3.2089755213055304, + "grad_norm": 0.2473811281580896, + "learning_rate": 2.7401187993174644e-05, + "loss": 0.8976, + "step": 7079 + }, + { + "epoch": 3.2094288304623753, + "grad_norm": 0.2488037555027017, + "learning_rate": 2.7389174157120374e-05, + "loss": 0.874, + "step": 7080 + }, + { + "epoch": 3.2098821396192205, + "grad_norm": 0.24246962686473514, + "learning_rate": 2.7377161584077375e-05, + "loss": 0.8809, + "step": 7081 + }, + { + "epoch": 3.2103354487760654, + "grad_norm": 0.2120381916257018, + "learning_rate": 2.7365150275248743e-05, + "loss": 0.8472, + "step": 7082 + }, + { + "epoch": 3.21078875793291, + "grad_norm": 0.24752629721607464, + "learning_rate": 2.7353140231837448e-05, + "loss": 0.8674, + "step": 7083 + }, + { + "epoch": 3.211242067089755, + "grad_norm": 0.2988834764464828, + "learning_rate": 2.734113145504632e-05, + "loss": 0.8649, + "step": 7084 + }, + { + "epoch": 3.2116953762466003, + "grad_norm": 0.34368497014264876, + "learning_rate": 2.73291239460781e-05, + "loss": 0.848, + "step": 7085 + }, + { + "epoch": 3.212148685403445, + "grad_norm": 0.20917645152831016, + "learning_rate": 2.7317117706135338e-05, + "loss": 0.8706, + "step": 7086 + }, + { + "epoch": 3.21260199456029, + "grad_norm": 0.2320938759092581, + "learning_rate": 2.7305112736420514e-05, + "loss": 0.8619, + "step": 7087 + }, + { + "epoch": 3.213055303717135, + "grad_norm": 0.28479379728805876, + "learning_rate": 2.7293109038135957e-05, + "loss": 0.8738, + "step": 7088 + }, + { + "epoch": 3.21350861287398, + "grad_norm": 0.2332242106505845, + "learning_rate": 2.7281106612483878e-05, + "loss": 0.8611, + "step": 7089 + }, + { + "epoch": 3.213961922030825, + "grad_norm": 0.2566764233881585, + "learning_rate": 2.726910546066635e-05, + "loss": 0.875, + "step": 7090 + }, + { + "epoch": 3.21441523118767, + "grad_norm": 0.2179824305479911, + "learning_rate": 2.7257105583885324e-05, + "loss": 0.8637, + "step": 7091 + }, + { + "epoch": 3.214868540344515, + "grad_norm": 0.29221856109367905, + "learning_rate": 2.7245106983342626e-05, + "loss": 0.8638, + "step": 7092 + }, + { + "epoch": 3.21532184950136, + "grad_norm": 0.35866560272387454, + "learning_rate": 2.723310966023995e-05, + "loss": 0.8841, + "step": 7093 + }, + { + "epoch": 3.215775158658205, + "grad_norm": 0.3235438664676268, + "learning_rate": 2.722111361577886e-05, + "loss": 0.8604, + "step": 7094 + }, + { + "epoch": 3.21622846781505, + "grad_norm": 0.270938992676611, + "learning_rate": 2.7209118851160813e-05, + "loss": 0.8748, + "step": 7095 + }, + { + "epoch": 3.2166817769718947, + "grad_norm": 0.19919365792742125, + "learning_rate": 2.7197125367587087e-05, + "loss": 0.8821, + "step": 7096 + }, + { + "epoch": 3.21713508612874, + "grad_norm": 0.30125264238415195, + "learning_rate": 2.718513316625889e-05, + "loss": 0.8737, + "step": 7097 + }, + { + "epoch": 3.217588395285585, + "grad_norm": 0.3764213625784563, + "learning_rate": 2.7173142248377264e-05, + "loss": 0.869, + "step": 7098 + }, + { + "epoch": 3.2180417044424297, + "grad_norm": 0.29590058378547485, + "learning_rate": 2.7161152615143138e-05, + "loss": 0.8771, + "step": 7099 + }, + { + "epoch": 3.2184950135992745, + "grad_norm": 0.20479617623226587, + "learning_rate": 2.7149164267757315e-05, + "loss": 0.8539, + "step": 7100 + }, + { + "epoch": 3.2189483227561198, + "grad_norm": 0.3535006208266525, + "learning_rate": 2.7137177207420458e-05, + "loss": 0.8911, + "step": 7101 + }, + { + "epoch": 3.2194016319129646, + "grad_norm": 0.4255060555756457, + "learning_rate": 2.7125191435333103e-05, + "loss": 0.8741, + "step": 7102 + }, + { + "epoch": 3.2198549410698094, + "grad_norm": 0.36524203361390983, + "learning_rate": 2.711320695269567e-05, + "loss": 0.8643, + "step": 7103 + }, + { + "epoch": 3.2203082502266547, + "grad_norm": 0.26046864492159105, + "learning_rate": 2.7101223760708442e-05, + "loss": 0.871, + "step": 7104 + }, + { + "epoch": 3.2207615593834995, + "grad_norm": 0.21330646450341276, + "learning_rate": 2.7089241860571553e-05, + "loss": 0.8702, + "step": 7105 + }, + { + "epoch": 3.2212148685403443, + "grad_norm": 0.3220936461782329, + "learning_rate": 2.7077261253485034e-05, + "loss": 0.8654, + "step": 7106 + }, + { + "epoch": 3.2216681776971896, + "grad_norm": 0.3677154370281295, + "learning_rate": 2.706528194064878e-05, + "loss": 0.8594, + "step": 7107 + }, + { + "epoch": 3.2221214868540344, + "grad_norm": 0.28743196616696554, + "learning_rate": 2.7053303923262557e-05, + "loss": 0.8598, + "step": 7108 + }, + { + "epoch": 3.2225747960108793, + "grad_norm": 0.23276900236703654, + "learning_rate": 2.7041327202525996e-05, + "loss": 0.8532, + "step": 7109 + }, + { + "epoch": 3.2230281051677245, + "grad_norm": 0.2869965693119986, + "learning_rate": 2.70293517796386e-05, + "loss": 0.8693, + "step": 7110 + }, + { + "epoch": 3.2234814143245694, + "grad_norm": 0.33850937157955635, + "learning_rate": 2.7017377655799742e-05, + "loss": 0.8671, + "step": 7111 + }, + { + "epoch": 3.223934723481414, + "grad_norm": 0.46722737811015325, + "learning_rate": 2.7005404832208663e-05, + "loss": 0.8664, + "step": 7112 + }, + { + "epoch": 3.2243880326382595, + "grad_norm": 0.39580335601589706, + "learning_rate": 2.699343331006449e-05, + "loss": 0.8858, + "step": 7113 + }, + { + "epoch": 3.2248413417951043, + "grad_norm": 0.21555827417178836, + "learning_rate": 2.6981463090566187e-05, + "loss": 0.8859, + "step": 7114 + }, + { + "epoch": 3.225294650951949, + "grad_norm": 0.2569025978584129, + "learning_rate": 2.696949417491262e-05, + "loss": 0.8834, + "step": 7115 + }, + { + "epoch": 3.2257479601087944, + "grad_norm": 0.3401529856480801, + "learning_rate": 2.6957526564302506e-05, + "loss": 0.8673, + "step": 7116 + }, + { + "epoch": 3.226201269265639, + "grad_norm": 0.35108204529770104, + "learning_rate": 2.6945560259934437e-05, + "loss": 0.8761, + "step": 7117 + }, + { + "epoch": 3.226654578422484, + "grad_norm": 0.30423180616152007, + "learning_rate": 2.6933595263006876e-05, + "loss": 0.8677, + "step": 7118 + }, + { + "epoch": 3.2271078875793293, + "grad_norm": 0.24179504866158522, + "learning_rate": 2.692163157471815e-05, + "loss": 0.8651, + "step": 7119 + }, + { + "epoch": 3.227561196736174, + "grad_norm": 0.18829583177857484, + "learning_rate": 2.6909669196266458e-05, + "loss": 0.8741, + "step": 7120 + }, + { + "epoch": 3.228014505893019, + "grad_norm": 0.271423702315048, + "learning_rate": 2.6897708128849876e-05, + "loss": 0.8603, + "step": 7121 + }, + { + "epoch": 3.2284678150498642, + "grad_norm": 0.25424080401716254, + "learning_rate": 2.688574837366633e-05, + "loss": 0.8817, + "step": 7122 + }, + { + "epoch": 3.228921124206709, + "grad_norm": 0.23272947749440964, + "learning_rate": 2.6873789931913624e-05, + "loss": 0.8606, + "step": 7123 + }, + { + "epoch": 3.229374433363554, + "grad_norm": 0.18650860545021472, + "learning_rate": 2.686183280478943e-05, + "loss": 0.8471, + "step": 7124 + }, + { + "epoch": 3.2298277425203987, + "grad_norm": 0.216730076695262, + "learning_rate": 2.6849876993491303e-05, + "loss": 0.861, + "step": 7125 + }, + { + "epoch": 3.230281051677244, + "grad_norm": 0.22202526299806077, + "learning_rate": 2.6837922499216634e-05, + "loss": 0.8698, + "step": 7126 + }, + { + "epoch": 3.230734360834089, + "grad_norm": 0.21030005513615974, + "learning_rate": 2.682596932316271e-05, + "loss": 0.8875, + "step": 7127 + }, + { + "epoch": 3.2311876699909337, + "grad_norm": 0.21525328437695784, + "learning_rate": 2.6814017466526673e-05, + "loss": 0.8688, + "step": 7128 + }, + { + "epoch": 3.231640979147779, + "grad_norm": 0.21869620490153482, + "learning_rate": 2.6802066930505543e-05, + "loss": 0.8834, + "step": 7129 + }, + { + "epoch": 3.2320942883046238, + "grad_norm": 0.1715589308237483, + "learning_rate": 2.6790117716296193e-05, + "loss": 0.8543, + "step": 7130 + }, + { + "epoch": 3.2325475974614686, + "grad_norm": 0.23488599975971247, + "learning_rate": 2.677816982509538e-05, + "loss": 0.8758, + "step": 7131 + }, + { + "epoch": 3.233000906618314, + "grad_norm": 0.2842257436237308, + "learning_rate": 2.6766223258099712e-05, + "loss": 0.8702, + "step": 7132 + }, + { + "epoch": 3.2334542157751587, + "grad_norm": 0.2886678403557759, + "learning_rate": 2.6754278016505683e-05, + "loss": 0.8658, + "step": 7133 + }, + { + "epoch": 3.2339075249320035, + "grad_norm": 0.18978194652109148, + "learning_rate": 2.674233410150964e-05, + "loss": 0.8615, + "step": 7134 + }, + { + "epoch": 3.234360834088849, + "grad_norm": 0.3171838617314516, + "learning_rate": 2.6730391514307804e-05, + "loss": 0.854, + "step": 7135 + }, + { + "epoch": 3.2348141432456936, + "grad_norm": 0.2246730078655768, + "learning_rate": 2.671845025609626e-05, + "loss": 0.8517, + "step": 7136 + }, + { + "epoch": 3.2352674524025384, + "grad_norm": 0.2979983212519794, + "learning_rate": 2.6706510328070944e-05, + "loss": 0.8869, + "step": 7137 + }, + { + "epoch": 3.2357207615593837, + "grad_norm": 0.2538337882814677, + "learning_rate": 2.6694571731427693e-05, + "loss": 0.8769, + "step": 7138 + }, + { + "epoch": 3.2361740707162285, + "grad_norm": 0.21841288757666819, + "learning_rate": 2.6682634467362186e-05, + "loss": 0.875, + "step": 7139 + }, + { + "epoch": 3.2366273798730734, + "grad_norm": 0.2664456084303657, + "learning_rate": 2.6670698537069975e-05, + "loss": 0.87, + "step": 7140 + }, + { + "epoch": 3.237080689029918, + "grad_norm": 0.2157659243014918, + "learning_rate": 2.6658763941746486e-05, + "loss": 0.8742, + "step": 7141 + }, + { + "epoch": 3.2375339981867635, + "grad_norm": 0.2775512166239588, + "learning_rate": 2.6646830682587005e-05, + "loss": 0.8566, + "step": 7142 + }, + { + "epoch": 3.2379873073436083, + "grad_norm": 0.28042003102580204, + "learning_rate": 2.663489876078666e-05, + "loss": 0.8865, + "step": 7143 + }, + { + "epoch": 3.238440616500453, + "grad_norm": 0.21903260680340034, + "learning_rate": 2.6622968177540493e-05, + "loss": 0.8585, + "step": 7144 + }, + { + "epoch": 3.2388939256572984, + "grad_norm": 0.2153557837704783, + "learning_rate": 2.6611038934043367e-05, + "loss": 0.8513, + "step": 7145 + }, + { + "epoch": 3.239347234814143, + "grad_norm": 0.2464068369116426, + "learning_rate": 2.6599111031490043e-05, + "loss": 0.8858, + "step": 7146 + }, + { + "epoch": 3.239800543970988, + "grad_norm": 0.22527666927502402, + "learning_rate": 2.6587184471075132e-05, + "loss": 0.8672, + "step": 7147 + }, + { + "epoch": 3.2402538531278333, + "grad_norm": 0.23518422738049147, + "learning_rate": 2.6575259253993123e-05, + "loss": 0.8731, + "step": 7148 + }, + { + "epoch": 3.240707162284678, + "grad_norm": 0.2917012366323951, + "learning_rate": 2.656333538143835e-05, + "loss": 0.8731, + "step": 7149 + }, + { + "epoch": 3.241160471441523, + "grad_norm": 0.3110784126839292, + "learning_rate": 2.655141285460503e-05, + "loss": 0.8569, + "step": 7150 + }, + { + "epoch": 3.2416137805983682, + "grad_norm": 0.21033588192070124, + "learning_rate": 2.6539491674687243e-05, + "loss": 0.8627, + "step": 7151 + }, + { + "epoch": 3.242067089755213, + "grad_norm": 0.2746509516116878, + "learning_rate": 2.6527571842878915e-05, + "loss": 0.8782, + "step": 7152 + }, + { + "epoch": 3.242520398912058, + "grad_norm": 0.28512839289229863, + "learning_rate": 2.6515653360373863e-05, + "loss": 0.8742, + "step": 7153 + }, + { + "epoch": 3.242973708068903, + "grad_norm": 0.1669809324254879, + "learning_rate": 2.6503736228365758e-05, + "loss": 0.8707, + "step": 7154 + }, + { + "epoch": 3.243427017225748, + "grad_norm": 0.2679716899524469, + "learning_rate": 2.6491820448048134e-05, + "loss": 0.8496, + "step": 7155 + }, + { + "epoch": 3.243880326382593, + "grad_norm": 0.23287084214962503, + "learning_rate": 2.6479906020614393e-05, + "loss": 0.8728, + "step": 7156 + }, + { + "epoch": 3.244333635539438, + "grad_norm": 0.2341775333754215, + "learning_rate": 2.64679929472578e-05, + "loss": 0.8731, + "step": 7157 + }, + { + "epoch": 3.244786944696283, + "grad_norm": 0.2471120434394026, + "learning_rate": 2.645608122917148e-05, + "loss": 0.872, + "step": 7158 + }, + { + "epoch": 3.2452402538531278, + "grad_norm": 0.19723065475617033, + "learning_rate": 2.6444170867548432e-05, + "loss": 0.8802, + "step": 7159 + }, + { + "epoch": 3.245693563009973, + "grad_norm": 0.2701897105253945, + "learning_rate": 2.643226186358151e-05, + "loss": 0.859, + "step": 7160 + }, + { + "epoch": 3.246146872166818, + "grad_norm": 0.2489064847815917, + "learning_rate": 2.6420354218463454e-05, + "loss": 0.8557, + "step": 7161 + }, + { + "epoch": 3.2466001813236627, + "grad_norm": 0.22401903809847865, + "learning_rate": 2.6408447933386816e-05, + "loss": 0.8638, + "step": 7162 + }, + { + "epoch": 3.2470534904805075, + "grad_norm": 0.2629862731824095, + "learning_rate": 2.6396543009544057e-05, + "loss": 0.8717, + "step": 7163 + }, + { + "epoch": 3.247506799637353, + "grad_norm": 0.20764463310118195, + "learning_rate": 2.638463944812751e-05, + "loss": 0.8591, + "step": 7164 + }, + { + "epoch": 3.2479601087941976, + "grad_norm": 0.2336174555147234, + "learning_rate": 2.6372737250329322e-05, + "loss": 0.8582, + "step": 7165 + }, + { + "epoch": 3.2484134179510424, + "grad_norm": 0.24144168804807636, + "learning_rate": 2.6360836417341555e-05, + "loss": 0.8955, + "step": 7166 + }, + { + "epoch": 3.2488667271078877, + "grad_norm": 0.2532804038784487, + "learning_rate": 2.6348936950356113e-05, + "loss": 0.8795, + "step": 7167 + }, + { + "epoch": 3.2493200362647325, + "grad_norm": 0.24524069419826103, + "learning_rate": 2.6337038850564745e-05, + "loss": 0.864, + "step": 7168 + }, + { + "epoch": 3.2497733454215774, + "grad_norm": 0.24328112106480598, + "learning_rate": 2.6325142119159095e-05, + "loss": 0.866, + "step": 7169 + }, + { + "epoch": 3.2502266545784226, + "grad_norm": 0.24265653351342886, + "learning_rate": 2.6313246757330647e-05, + "loss": 0.8624, + "step": 7170 + }, + { + "epoch": 3.2506799637352675, + "grad_norm": 0.1905022866887128, + "learning_rate": 2.6301352766270762e-05, + "loss": 0.8549, + "step": 7171 + }, + { + "epoch": 3.2511332728921123, + "grad_norm": 0.26395040765620753, + "learning_rate": 2.628946014717066e-05, + "loss": 0.864, + "step": 7172 + }, + { + "epoch": 3.2515865820489576, + "grad_norm": 0.22660274638416356, + "learning_rate": 2.627756890122142e-05, + "loss": 0.8798, + "step": 7173 + }, + { + "epoch": 3.2520398912058024, + "grad_norm": 0.2696303704265977, + "learning_rate": 2.626567902961398e-05, + "loss": 0.8532, + "step": 7174 + }, + { + "epoch": 3.252493200362647, + "grad_norm": 0.2446973180356918, + "learning_rate": 2.625379053353915e-05, + "loss": 0.8484, + "step": 7175 + }, + { + "epoch": 3.252946509519492, + "grad_norm": 0.2320072338316388, + "learning_rate": 2.6241903414187597e-05, + "loss": 0.8696, + "step": 7176 + }, + { + "epoch": 3.2533998186763373, + "grad_norm": 0.3160514928618922, + "learning_rate": 2.6230017672749865e-05, + "loss": 0.8709, + "step": 7177 + }, + { + "epoch": 3.253853127833182, + "grad_norm": 0.22820551602272077, + "learning_rate": 2.6218133310416317e-05, + "loss": 0.8691, + "step": 7178 + }, + { + "epoch": 3.254306436990027, + "grad_norm": 0.2536475280344736, + "learning_rate": 2.6206250328377225e-05, + "loss": 0.8463, + "step": 7179 + }, + { + "epoch": 3.2547597461468722, + "grad_norm": 0.29053207278883414, + "learning_rate": 2.61943687278227e-05, + "loss": 0.8626, + "step": 7180 + }, + { + "epoch": 3.255213055303717, + "grad_norm": 0.25339554608094167, + "learning_rate": 2.618248850994273e-05, + "loss": 0.8728, + "step": 7181 + }, + { + "epoch": 3.255666364460562, + "grad_norm": 0.23716308528287977, + "learning_rate": 2.6170609675927146e-05, + "loss": 0.8768, + "step": 7182 + }, + { + "epoch": 3.256119673617407, + "grad_norm": 0.21996537702763674, + "learning_rate": 2.6158732226965638e-05, + "loss": 0.8583, + "step": 7183 + }, + { + "epoch": 3.256572982774252, + "grad_norm": 0.2298211011603112, + "learning_rate": 2.6146856164247778e-05, + "loss": 0.8677, + "step": 7184 + }, + { + "epoch": 3.257026291931097, + "grad_norm": 0.21336209059841588, + "learning_rate": 2.613498148896298e-05, + "loss": 0.8446, + "step": 7185 + }, + { + "epoch": 3.257479601087942, + "grad_norm": 0.24929038618495863, + "learning_rate": 2.6123108202300537e-05, + "loss": 0.8592, + "step": 7186 + }, + { + "epoch": 3.257932910244787, + "grad_norm": 0.20481647712834392, + "learning_rate": 2.6111236305449598e-05, + "loss": 0.8567, + "step": 7187 + }, + { + "epoch": 3.2583862194016318, + "grad_norm": 0.26356378901967087, + "learning_rate": 2.6099365799599153e-05, + "loss": 0.868, + "step": 7188 + }, + { + "epoch": 3.258839528558477, + "grad_norm": 0.21171343151573332, + "learning_rate": 2.6087496685938085e-05, + "loss": 0.8646, + "step": 7189 + }, + { + "epoch": 3.259292837715322, + "grad_norm": 0.2195410419152678, + "learning_rate": 2.6075628965655105e-05, + "loss": 0.8598, + "step": 7190 + }, + { + "epoch": 3.2597461468721667, + "grad_norm": 0.23217392804223735, + "learning_rate": 2.6063762639938808e-05, + "loss": 0.8694, + "step": 7191 + }, + { + "epoch": 3.260199456029012, + "grad_norm": 0.21836930876455946, + "learning_rate": 2.6051897709977638e-05, + "loss": 0.8669, + "step": 7192 + }, + { + "epoch": 3.260652765185857, + "grad_norm": 0.1881409107165801, + "learning_rate": 2.604003417695992e-05, + "loss": 0.8594, + "step": 7193 + }, + { + "epoch": 3.2611060743427016, + "grad_norm": 0.23980519659286378, + "learning_rate": 2.6028172042073793e-05, + "loss": 0.8745, + "step": 7194 + }, + { + "epoch": 3.261559383499547, + "grad_norm": 0.17937022824968818, + "learning_rate": 2.6016311306507302e-05, + "loss": 0.8556, + "step": 7195 + }, + { + "epoch": 3.2620126926563917, + "grad_norm": 0.25196923471507576, + "learning_rate": 2.600445197144833e-05, + "loss": 0.8634, + "step": 7196 + }, + { + "epoch": 3.2624660018132365, + "grad_norm": 0.2129680476270928, + "learning_rate": 2.5992594038084623e-05, + "loss": 0.8596, + "step": 7197 + }, + { + "epoch": 3.262919310970082, + "grad_norm": 0.23806699989488433, + "learning_rate": 2.5980737507603794e-05, + "loss": 0.8548, + "step": 7198 + }, + { + "epoch": 3.2633726201269266, + "grad_norm": 0.25569837523023115, + "learning_rate": 2.5968882381193306e-05, + "loss": 0.8601, + "step": 7199 + }, + { + "epoch": 3.2638259292837715, + "grad_norm": 0.2481012383341175, + "learning_rate": 2.5957028660040492e-05, + "loss": 0.8685, + "step": 7200 + }, + { + "epoch": 3.2642792384406167, + "grad_norm": 0.20935467377760117, + "learning_rate": 2.594517634533252e-05, + "loss": 0.8545, + "step": 7201 + }, + { + "epoch": 3.2647325475974616, + "grad_norm": 0.28044167080839527, + "learning_rate": 2.5933325438256446e-05, + "loss": 0.8636, + "step": 7202 + }, + { + "epoch": 3.2651858567543064, + "grad_norm": 0.600172375743633, + "learning_rate": 2.5921475939999164e-05, + "loss": 0.9047, + "step": 7203 + }, + { + "epoch": 3.265639165911151, + "grad_norm": 0.21991784957649657, + "learning_rate": 2.5909627851747442e-05, + "loss": 0.8937, + "step": 7204 + }, + { + "epoch": 3.2660924750679965, + "grad_norm": 0.2578323361758706, + "learning_rate": 2.5897781174687907e-05, + "loss": 0.8726, + "step": 7205 + }, + { + "epoch": 3.2665457842248413, + "grad_norm": 0.24146855932270045, + "learning_rate": 2.5885935910007033e-05, + "loss": 0.8737, + "step": 7206 + }, + { + "epoch": 3.266999093381686, + "grad_norm": 0.21069116564541093, + "learning_rate": 2.5874092058891157e-05, + "loss": 0.8606, + "step": 7207 + }, + { + "epoch": 3.2674524025385314, + "grad_norm": 0.20846931999176452, + "learning_rate": 2.586224962252648e-05, + "loss": 0.8974, + "step": 7208 + }, + { + "epoch": 3.2679057116953762, + "grad_norm": 0.2222198350288991, + "learning_rate": 2.585040860209905e-05, + "loss": 0.8624, + "step": 7209 + }, + { + "epoch": 3.268359020852221, + "grad_norm": 0.2145151588098671, + "learning_rate": 2.5838568998794783e-05, + "loss": 0.862, + "step": 7210 + }, + { + "epoch": 3.2688123300090663, + "grad_norm": 0.18771233216790345, + "learning_rate": 2.582673081379945e-05, + "loss": 0.8685, + "step": 7211 + }, + { + "epoch": 3.269265639165911, + "grad_norm": 0.22766542890238503, + "learning_rate": 2.581489404829868e-05, + "loss": 0.8796, + "step": 7212 + }, + { + "epoch": 3.269718948322756, + "grad_norm": 0.22297189975469184, + "learning_rate": 2.5803058703477958e-05, + "loss": 0.872, + "step": 7213 + }, + { + "epoch": 3.2701722574796013, + "grad_norm": 0.2016709156657167, + "learning_rate": 2.5791224780522636e-05, + "loss": 0.8659, + "step": 7214 + }, + { + "epoch": 3.270625566636446, + "grad_norm": 0.33994716063905617, + "learning_rate": 2.5779392280617906e-05, + "loss": 0.8711, + "step": 7215 + }, + { + "epoch": 3.271078875793291, + "grad_norm": 0.27385821937610466, + "learning_rate": 2.5767561204948838e-05, + "loss": 0.8688, + "step": 7216 + }, + { + "epoch": 3.2715321849501358, + "grad_norm": 0.22521483392498173, + "learning_rate": 2.575573155470034e-05, + "loss": 0.8596, + "step": 7217 + }, + { + "epoch": 3.271985494106981, + "grad_norm": 0.18021793682404055, + "learning_rate": 2.574390333105719e-05, + "loss": 0.869, + "step": 7218 + }, + { + "epoch": 3.272438803263826, + "grad_norm": 0.19525720550399556, + "learning_rate": 2.5732076535204037e-05, + "loss": 0.8676, + "step": 7219 + }, + { + "epoch": 3.2728921124206707, + "grad_norm": 0.21707520082208814, + "learning_rate": 2.5720251168325333e-05, + "loss": 0.8986, + "step": 7220 + }, + { + "epoch": 3.273345421577516, + "grad_norm": 0.22890842351880158, + "learning_rate": 2.570842723160545e-05, + "loss": 0.856, + "step": 7221 + }, + { + "epoch": 3.273798730734361, + "grad_norm": 0.21221176263638394, + "learning_rate": 2.5696604726228575e-05, + "loss": 0.8578, + "step": 7222 + }, + { + "epoch": 3.2742520398912056, + "grad_norm": 0.205906581263666, + "learning_rate": 2.5684783653378778e-05, + "loss": 0.8527, + "step": 7223 + }, + { + "epoch": 3.274705349048051, + "grad_norm": 0.24643121228032178, + "learning_rate": 2.5672964014239963e-05, + "loss": 0.8491, + "step": 7224 + }, + { + "epoch": 3.2751586582048957, + "grad_norm": 0.2924968080481721, + "learning_rate": 2.5661145809995917e-05, + "loss": 0.8727, + "step": 7225 + }, + { + "epoch": 3.2756119673617405, + "grad_norm": 0.3402381960579194, + "learning_rate": 2.5649329041830252e-05, + "loss": 0.863, + "step": 7226 + }, + { + "epoch": 3.276065276518586, + "grad_norm": 0.2370025060683127, + "learning_rate": 2.5637513710926457e-05, + "loss": 0.8548, + "step": 7227 + }, + { + "epoch": 3.2765185856754306, + "grad_norm": 0.1844643390339872, + "learning_rate": 2.562569981846788e-05, + "loss": 0.8592, + "step": 7228 + }, + { + "epoch": 3.2769718948322755, + "grad_norm": 0.2933355931103168, + "learning_rate": 2.561388736563771e-05, + "loss": 0.875, + "step": 7229 + }, + { + "epoch": 3.2774252039891207, + "grad_norm": 0.3355513759858816, + "learning_rate": 2.5602076353618997e-05, + "loss": 0.8654, + "step": 7230 + }, + { + "epoch": 3.2778785131459656, + "grad_norm": 0.261797833975673, + "learning_rate": 2.559026678359466e-05, + "loss": 0.8648, + "step": 7231 + }, + { + "epoch": 3.2783318223028104, + "grad_norm": 0.2512901180005046, + "learning_rate": 2.557845865674745e-05, + "loss": 0.8732, + "step": 7232 + }, + { + "epoch": 3.2787851314596557, + "grad_norm": 0.36449086470398495, + "learning_rate": 2.5566651974259982e-05, + "loss": 0.8741, + "step": 7233 + }, + { + "epoch": 3.2792384406165005, + "grad_norm": 0.38859708033891127, + "learning_rate": 2.5554846737314754e-05, + "loss": 0.8751, + "step": 7234 + }, + { + "epoch": 3.2796917497733453, + "grad_norm": 0.30003897110653677, + "learning_rate": 2.554304294709407e-05, + "loss": 0.8613, + "step": 7235 + }, + { + "epoch": 3.2801450589301906, + "grad_norm": 0.2432607407965832, + "learning_rate": 2.553124060478012e-05, + "loss": 0.8486, + "step": 7236 + }, + { + "epoch": 3.2805983680870354, + "grad_norm": 0.3121281440126788, + "learning_rate": 2.551943971155495e-05, + "loss": 0.8555, + "step": 7237 + }, + { + "epoch": 3.2810516772438802, + "grad_norm": 0.4166510979605544, + "learning_rate": 2.550764026860046e-05, + "loss": 0.8768, + "step": 7238 + }, + { + "epoch": 3.2815049864007255, + "grad_norm": 0.29355013564666377, + "learning_rate": 2.549584227709838e-05, + "loss": 0.8608, + "step": 7239 + }, + { + "epoch": 3.2819582955575703, + "grad_norm": 0.2463535147767191, + "learning_rate": 2.5484045738230327e-05, + "loss": 0.8518, + "step": 7240 + }, + { + "epoch": 3.282411604714415, + "grad_norm": 0.31758183873785556, + "learning_rate": 2.5472250653177744e-05, + "loss": 0.8679, + "step": 7241 + }, + { + "epoch": 3.2828649138712604, + "grad_norm": 0.327140414028994, + "learning_rate": 2.546045702312196e-05, + "loss": 0.8608, + "step": 7242 + }, + { + "epoch": 3.2833182230281053, + "grad_norm": 0.21536827210763362, + "learning_rate": 2.5448664849244134e-05, + "loss": 0.8562, + "step": 7243 + }, + { + "epoch": 3.28377153218495, + "grad_norm": 0.2793684182335213, + "learning_rate": 2.5436874132725295e-05, + "loss": 0.8739, + "step": 7244 + }, + { + "epoch": 3.284224841341795, + "grad_norm": 0.24302934516477576, + "learning_rate": 2.5425084874746305e-05, + "loss": 0.8827, + "step": 7245 + }, + { + "epoch": 3.28467815049864, + "grad_norm": 0.28997269325863956, + "learning_rate": 2.5413297076487903e-05, + "loss": 0.8665, + "step": 7246 + }, + { + "epoch": 3.285131459655485, + "grad_norm": 0.24848505258169346, + "learning_rate": 2.5401510739130668e-05, + "loss": 0.8896, + "step": 7247 + }, + { + "epoch": 3.28558476881233, + "grad_norm": 0.22258822136290637, + "learning_rate": 2.5389725863855032e-05, + "loss": 0.864, + "step": 7248 + }, + { + "epoch": 3.286038077969175, + "grad_norm": 0.22595528151524127, + "learning_rate": 2.53779424518413e-05, + "loss": 0.8627, + "step": 7249 + }, + { + "epoch": 3.28649138712602, + "grad_norm": 0.2721200093799047, + "learning_rate": 2.53661605042696e-05, + "loss": 0.8473, + "step": 7250 + }, + { + "epoch": 3.286944696282865, + "grad_norm": 0.18546756174947535, + "learning_rate": 2.5354380022319936e-05, + "loss": 0.8545, + "step": 7251 + }, + { + "epoch": 3.28739800543971, + "grad_norm": 0.24921288731354105, + "learning_rate": 2.534260100717215e-05, + "loss": 0.8617, + "step": 7252 + }, + { + "epoch": 3.287851314596555, + "grad_norm": 0.2606124915070228, + "learning_rate": 2.533082346000595e-05, + "loss": 0.8679, + "step": 7253 + }, + { + "epoch": 3.2883046237533997, + "grad_norm": 0.22189280964315736, + "learning_rate": 2.5319047382000893e-05, + "loss": 0.8626, + "step": 7254 + }, + { + "epoch": 3.2887579329102445, + "grad_norm": 0.27493741387362824, + "learning_rate": 2.5307272774336386e-05, + "loss": 0.863, + "step": 7255 + }, + { + "epoch": 3.28921124206709, + "grad_norm": 0.1905396804426044, + "learning_rate": 2.5295499638191698e-05, + "loss": 0.8593, + "step": 7256 + }, + { + "epoch": 3.2896645512239346, + "grad_norm": 0.2635559493406132, + "learning_rate": 2.528372797474595e-05, + "loss": 0.8881, + "step": 7257 + }, + { + "epoch": 3.2901178603807795, + "grad_norm": 0.26346238150119716, + "learning_rate": 2.5271957785178075e-05, + "loss": 0.8569, + "step": 7258 + }, + { + "epoch": 3.2905711695376247, + "grad_norm": 0.20083983580494802, + "learning_rate": 2.5260189070666918e-05, + "loss": 0.8891, + "step": 7259 + }, + { + "epoch": 3.2910244786944696, + "grad_norm": 0.2292377610820346, + "learning_rate": 2.5248421832391144e-05, + "loss": 0.8854, + "step": 7260 + }, + { + "epoch": 3.2914777878513144, + "grad_norm": 0.20237991796976126, + "learning_rate": 2.5236656071529274e-05, + "loss": 0.866, + "step": 7261 + }, + { + "epoch": 3.2919310970081597, + "grad_norm": 0.20507197050001277, + "learning_rate": 2.522489178925969e-05, + "loss": 0.8813, + "step": 7262 + }, + { + "epoch": 3.2923844061650045, + "grad_norm": 0.265520387977556, + "learning_rate": 2.5213128986760618e-05, + "loss": 0.8667, + "step": 7263 + }, + { + "epoch": 3.2928377153218493, + "grad_norm": 0.2717322762172908, + "learning_rate": 2.5201367665210134e-05, + "loss": 0.8537, + "step": 7264 + }, + { + "epoch": 3.2932910244786946, + "grad_norm": 0.21465976784970978, + "learning_rate": 2.518960782578618e-05, + "loss": 0.8598, + "step": 7265 + }, + { + "epoch": 3.2937443336355394, + "grad_norm": 0.2636118284072555, + "learning_rate": 2.5177849469666517e-05, + "loss": 0.8727, + "step": 7266 + }, + { + "epoch": 3.2941976427923843, + "grad_norm": 0.3108380905519306, + "learning_rate": 2.5166092598028796e-05, + "loss": 0.8578, + "step": 7267 + }, + { + "epoch": 3.2946509519492295, + "grad_norm": 0.2714714666092737, + "learning_rate": 2.5154337212050496e-05, + "loss": 0.8542, + "step": 7268 + }, + { + "epoch": 3.2951042611060744, + "grad_norm": 0.25082476687018124, + "learning_rate": 2.5142583312908952e-05, + "loss": 0.8523, + "step": 7269 + }, + { + "epoch": 3.295557570262919, + "grad_norm": 0.2272134057052852, + "learning_rate": 2.513083090178136e-05, + "loss": 0.8603, + "step": 7270 + }, + { + "epoch": 3.2960108794197644, + "grad_norm": 0.301578628426096, + "learning_rate": 2.5119079979844746e-05, + "loss": 0.88, + "step": 7271 + }, + { + "epoch": 3.2964641885766093, + "grad_norm": 0.2847572128122353, + "learning_rate": 2.5107330548276015e-05, + "loss": 0.8667, + "step": 7272 + }, + { + "epoch": 3.296917497733454, + "grad_norm": 0.2590092404066702, + "learning_rate": 2.5095582608251894e-05, + "loss": 0.8946, + "step": 7273 + }, + { + "epoch": 3.2973708068902994, + "grad_norm": 0.20103470323713682, + "learning_rate": 2.5083836160948973e-05, + "loss": 0.8633, + "step": 7274 + }, + { + "epoch": 3.297824116047144, + "grad_norm": 0.21321814257612687, + "learning_rate": 2.5072091207543715e-05, + "loss": 0.8812, + "step": 7275 + }, + { + "epoch": 3.298277425203989, + "grad_norm": 0.25076068659672346, + "learning_rate": 2.506034774921238e-05, + "loss": 0.8603, + "step": 7276 + }, + { + "epoch": 3.2987307343608343, + "grad_norm": 0.18444601042255962, + "learning_rate": 2.504860578713114e-05, + "loss": 0.8556, + "step": 7277 + }, + { + "epoch": 3.299184043517679, + "grad_norm": 0.22297657833293305, + "learning_rate": 2.5036865322475955e-05, + "loss": 0.8823, + "step": 7278 + }, + { + "epoch": 3.299637352674524, + "grad_norm": 0.19050893461929927, + "learning_rate": 2.5025126356422692e-05, + "loss": 0.8749, + "step": 7279 + }, + { + "epoch": 3.3000906618313692, + "grad_norm": 0.16719433119134694, + "learning_rate": 2.5013388890147033e-05, + "loss": 0.8782, + "step": 7280 + }, + { + "epoch": 3.300543970988214, + "grad_norm": 0.1848411822833879, + "learning_rate": 2.500165292482452e-05, + "loss": 0.8499, + "step": 7281 + }, + { + "epoch": 3.300997280145059, + "grad_norm": 0.18356294524521377, + "learning_rate": 2.4989918461630546e-05, + "loss": 0.8785, + "step": 7282 + }, + { + "epoch": 3.3014505893019037, + "grad_norm": 0.17451290636808678, + "learning_rate": 2.4978185501740352e-05, + "loss": 0.8676, + "step": 7283 + }, + { + "epoch": 3.301903898458749, + "grad_norm": 0.22122964906203751, + "learning_rate": 2.4966454046329026e-05, + "loss": 0.8626, + "step": 7284 + }, + { + "epoch": 3.302357207615594, + "grad_norm": 0.19709977064913595, + "learning_rate": 2.4954724096571513e-05, + "loss": 0.8607, + "step": 7285 + }, + { + "epoch": 3.3028105167724386, + "grad_norm": 0.190293720782103, + "learning_rate": 2.4942995653642594e-05, + "loss": 0.8957, + "step": 7286 + }, + { + "epoch": 3.303263825929284, + "grad_norm": 0.24024737597894244, + "learning_rate": 2.4931268718716918e-05, + "loss": 0.8654, + "step": 7287 + }, + { + "epoch": 3.3037171350861287, + "grad_norm": 0.207121067492197, + "learning_rate": 2.4919543292968962e-05, + "loss": 0.8624, + "step": 7288 + }, + { + "epoch": 3.3041704442429736, + "grad_norm": 0.22508452056684725, + "learning_rate": 2.490781937757307e-05, + "loss": 0.8625, + "step": 7289 + }, + { + "epoch": 3.304623753399819, + "grad_norm": 0.1998057100064862, + "learning_rate": 2.4896096973703417e-05, + "loss": 0.8754, + "step": 7290 + }, + { + "epoch": 3.3050770625566637, + "grad_norm": 0.24820569114486504, + "learning_rate": 2.4884376082534053e-05, + "loss": 0.8586, + "step": 7291 + }, + { + "epoch": 3.3055303717135085, + "grad_norm": 0.19968215980035242, + "learning_rate": 2.4872656705238835e-05, + "loss": 0.872, + "step": 7292 + }, + { + "epoch": 3.3059836808703538, + "grad_norm": 0.19214071303869337, + "learning_rate": 2.4860938842991514e-05, + "loss": 0.8793, + "step": 7293 + }, + { + "epoch": 3.3064369900271986, + "grad_norm": 0.2323806203165372, + "learning_rate": 2.4849222496965658e-05, + "loss": 0.8567, + "step": 7294 + }, + { + "epoch": 3.3068902991840434, + "grad_norm": 0.19161132451226673, + "learning_rate": 2.4837507668334694e-05, + "loss": 0.8707, + "step": 7295 + }, + { + "epoch": 3.3073436083408883, + "grad_norm": 0.23322998237349055, + "learning_rate": 2.4825794358271917e-05, + "loss": 0.8787, + "step": 7296 + }, + { + "epoch": 3.3077969174977335, + "grad_norm": 0.21036339068500157, + "learning_rate": 2.4814082567950415e-05, + "loss": 0.8702, + "step": 7297 + }, + { + "epoch": 3.3082502266545784, + "grad_norm": 0.20403518421764388, + "learning_rate": 2.4802372298543178e-05, + "loss": 0.8748, + "step": 7298 + }, + { + "epoch": 3.308703535811423, + "grad_norm": 0.23622886929569048, + "learning_rate": 2.479066355122301e-05, + "loss": 0.8512, + "step": 7299 + }, + { + "epoch": 3.3091568449682685, + "grad_norm": 0.1864450431543695, + "learning_rate": 2.4778956327162597e-05, + "loss": 0.8552, + "step": 7300 + }, + { + "epoch": 3.3096101541251133, + "grad_norm": 0.18520853517174934, + "learning_rate": 2.4767250627534434e-05, + "loss": 0.8751, + "step": 7301 + }, + { + "epoch": 3.310063463281958, + "grad_norm": 0.18841583044131854, + "learning_rate": 2.47555464535109e-05, + "loss": 0.8642, + "step": 7302 + }, + { + "epoch": 3.3105167724388034, + "grad_norm": 0.204737919244479, + "learning_rate": 2.4743843806264184e-05, + "loss": 0.8652, + "step": 7303 + }, + { + "epoch": 3.310970081595648, + "grad_norm": 0.1910250736817801, + "learning_rate": 2.4732142686966344e-05, + "loss": 0.8704, + "step": 7304 + }, + { + "epoch": 3.311423390752493, + "grad_norm": 0.18115235878352715, + "learning_rate": 2.4720443096789292e-05, + "loss": 0.8985, + "step": 7305 + }, + { + "epoch": 3.3118766999093383, + "grad_norm": 0.21460429465542083, + "learning_rate": 2.4708745036904774e-05, + "loss": 0.8795, + "step": 7306 + }, + { + "epoch": 3.312330009066183, + "grad_norm": 0.19909666867892925, + "learning_rate": 2.4697048508484373e-05, + "loss": 0.8583, + "step": 7307 + }, + { + "epoch": 3.312783318223028, + "grad_norm": 0.24230289134540084, + "learning_rate": 2.468535351269954e-05, + "loss": 0.8725, + "step": 7308 + }, + { + "epoch": 3.3132366273798732, + "grad_norm": 0.21925259548345294, + "learning_rate": 2.467366005072156e-05, + "loss": 0.8573, + "step": 7309 + }, + { + "epoch": 3.313689936536718, + "grad_norm": 0.1997972022707895, + "learning_rate": 2.466196812372157e-05, + "loss": 0.8661, + "step": 7310 + }, + { + "epoch": 3.314143245693563, + "grad_norm": 0.29248434643319643, + "learning_rate": 2.4650277732870552e-05, + "loss": 0.8644, + "step": 7311 + }, + { + "epoch": 3.314596554850408, + "grad_norm": 0.24332391654326607, + "learning_rate": 2.4638588879339325e-05, + "loss": 0.8629, + "step": 7312 + }, + { + "epoch": 3.315049864007253, + "grad_norm": 0.2235305324364914, + "learning_rate": 2.462690156429857e-05, + "loss": 0.8587, + "step": 7313 + }, + { + "epoch": 3.315503173164098, + "grad_norm": 0.21868481450165356, + "learning_rate": 2.461521578891881e-05, + "loss": 0.8712, + "step": 7314 + }, + { + "epoch": 3.315956482320943, + "grad_norm": 0.23378131550377254, + "learning_rate": 2.4603531554370417e-05, + "loss": 0.8653, + "step": 7315 + }, + { + "epoch": 3.316409791477788, + "grad_norm": 0.24250726082483456, + "learning_rate": 2.4591848861823567e-05, + "loss": 0.8476, + "step": 7316 + }, + { + "epoch": 3.3168631006346327, + "grad_norm": 0.2441629900851792, + "learning_rate": 2.458016771244834e-05, + "loss": 0.8664, + "step": 7317 + }, + { + "epoch": 3.317316409791478, + "grad_norm": 0.23165313845685076, + "learning_rate": 2.4568488107414638e-05, + "loss": 0.8725, + "step": 7318 + }, + { + "epoch": 3.317769718948323, + "grad_norm": 0.23936281132233062, + "learning_rate": 2.45568100478922e-05, + "loss": 0.8637, + "step": 7319 + }, + { + "epoch": 3.3182230281051677, + "grad_norm": 0.2210992912502654, + "learning_rate": 2.4545133535050628e-05, + "loss": 0.8799, + "step": 7320 + }, + { + "epoch": 3.3186763372620125, + "grad_norm": 0.22893085916322153, + "learning_rate": 2.453345857005936e-05, + "loss": 0.8516, + "step": 7321 + }, + { + "epoch": 3.3191296464188578, + "grad_norm": 0.21229239740435452, + "learning_rate": 2.4521785154087662e-05, + "loss": 0.8673, + "step": 7322 + }, + { + "epoch": 3.3195829555757026, + "grad_norm": 0.17762506438784695, + "learning_rate": 2.451011328830467e-05, + "loss": 0.885, + "step": 7323 + }, + { + "epoch": 3.3200362647325474, + "grad_norm": 0.24127032497896142, + "learning_rate": 2.4498442973879354e-05, + "loss": 0.8704, + "step": 7324 + }, + { + "epoch": 3.3204895738893927, + "grad_norm": 0.2461485331126551, + "learning_rate": 2.4486774211980535e-05, + "loss": 0.8862, + "step": 7325 + }, + { + "epoch": 3.3209428830462375, + "grad_norm": 0.28523635224798305, + "learning_rate": 2.447510700377687e-05, + "loss": 0.8601, + "step": 7326 + }, + { + "epoch": 3.3213961922030824, + "grad_norm": 0.23174445123997792, + "learning_rate": 2.4463441350436865e-05, + "loss": 0.8651, + "step": 7327 + }, + { + "epoch": 3.3218495013599276, + "grad_norm": 0.2113625478619112, + "learning_rate": 2.4451777253128873e-05, + "loss": 0.8771, + "step": 7328 + }, + { + "epoch": 3.3223028105167725, + "grad_norm": 0.24158191084383612, + "learning_rate": 2.4440114713021085e-05, + "loss": 0.8607, + "step": 7329 + }, + { + "epoch": 3.3227561196736173, + "grad_norm": 0.26175817614005087, + "learning_rate": 2.4428453731281538e-05, + "loss": 0.878, + "step": 7330 + }, + { + "epoch": 3.3232094288304626, + "grad_norm": 0.20607619610403752, + "learning_rate": 2.4416794309078108e-05, + "loss": 0.8633, + "step": 7331 + }, + { + "epoch": 3.3236627379873074, + "grad_norm": 0.2622948139145269, + "learning_rate": 2.4405136447578538e-05, + "loss": 0.8707, + "step": 7332 + }, + { + "epoch": 3.324116047144152, + "grad_norm": 0.3376855775675858, + "learning_rate": 2.4393480147950372e-05, + "loss": 0.8559, + "step": 7333 + }, + { + "epoch": 3.324569356300997, + "grad_norm": 0.31050753514646545, + "learning_rate": 2.4381825411361052e-05, + "loss": 0.8715, + "step": 7334 + }, + { + "epoch": 3.3250226654578423, + "grad_norm": 0.21963012220211064, + "learning_rate": 2.4370172238977806e-05, + "loss": 0.8758, + "step": 7335 + }, + { + "epoch": 3.325475974614687, + "grad_norm": 0.21140457357324552, + "learning_rate": 2.4358520631967747e-05, + "loss": 0.8665, + "step": 7336 + }, + { + "epoch": 3.325929283771532, + "grad_norm": 0.19903680639872237, + "learning_rate": 2.4346870591497825e-05, + "loss": 0.8756, + "step": 7337 + }, + { + "epoch": 3.3263825929283772, + "grad_norm": 0.22298637238334781, + "learning_rate": 2.4335222118734806e-05, + "loss": 0.8755, + "step": 7338 + }, + { + "epoch": 3.326835902085222, + "grad_norm": 0.19708223178574724, + "learning_rate": 2.4323575214845333e-05, + "loss": 0.8731, + "step": 7339 + }, + { + "epoch": 3.327289211242067, + "grad_norm": 0.18795208243878114, + "learning_rate": 2.431192988099587e-05, + "loss": 0.8787, + "step": 7340 + }, + { + "epoch": 3.327742520398912, + "grad_norm": 0.20518206568948014, + "learning_rate": 2.430028611835274e-05, + "loss": 0.8812, + "step": 7341 + }, + { + "epoch": 3.328195829555757, + "grad_norm": 0.2027304199896448, + "learning_rate": 2.4288643928082092e-05, + "loss": 0.8738, + "step": 7342 + }, + { + "epoch": 3.328649138712602, + "grad_norm": 0.1613764826880516, + "learning_rate": 2.427700331134993e-05, + "loss": 0.8691, + "step": 7343 + }, + { + "epoch": 3.329102447869447, + "grad_norm": 0.18225686863459367, + "learning_rate": 2.4265364269322103e-05, + "loss": 0.877, + "step": 7344 + }, + { + "epoch": 3.329555757026292, + "grad_norm": 0.17658127210368954, + "learning_rate": 2.4253726803164276e-05, + "loss": 0.8818, + "step": 7345 + }, + { + "epoch": 3.3300090661831367, + "grad_norm": 0.19599154665588794, + "learning_rate": 2.4242090914041994e-05, + "loss": 0.8694, + "step": 7346 + }, + { + "epoch": 3.330462375339982, + "grad_norm": 0.27954880004017446, + "learning_rate": 2.4230456603120626e-05, + "loss": 0.8887, + "step": 7347 + }, + { + "epoch": 3.330915684496827, + "grad_norm": 0.21478286203263205, + "learning_rate": 2.4218823871565365e-05, + "loss": 0.8618, + "step": 7348 + }, + { + "epoch": 3.3313689936536717, + "grad_norm": 0.19853589863993507, + "learning_rate": 2.4207192720541278e-05, + "loss": 0.8549, + "step": 7349 + }, + { + "epoch": 3.331822302810517, + "grad_norm": 0.21430624242880425, + "learning_rate": 2.4195563151213248e-05, + "loss": 0.8567, + "step": 7350 + }, + { + "epoch": 3.3322756119673618, + "grad_norm": 0.1930305847825246, + "learning_rate": 2.418393516474602e-05, + "loss": 0.856, + "step": 7351 + }, + { + "epoch": 3.3327289211242066, + "grad_norm": 0.20759351201767337, + "learning_rate": 2.4172308762304168e-05, + "loss": 0.8591, + "step": 7352 + }, + { + "epoch": 3.333182230281052, + "grad_norm": 0.24712659785174884, + "learning_rate": 2.416068394505213e-05, + "loss": 0.8805, + "step": 7353 + }, + { + "epoch": 3.3336355394378967, + "grad_norm": 0.18672882140340935, + "learning_rate": 2.414906071415412e-05, + "loss": 0.8643, + "step": 7354 + }, + { + "epoch": 3.3340888485947415, + "grad_norm": 0.21021783900534544, + "learning_rate": 2.413743907077427e-05, + "loss": 0.8698, + "step": 7355 + }, + { + "epoch": 3.334542157751587, + "grad_norm": 0.19718203498016199, + "learning_rate": 2.412581901607652e-05, + "loss": 0.8582, + "step": 7356 + }, + { + "epoch": 3.3349954669084316, + "grad_norm": 0.18433164811936728, + "learning_rate": 2.4114200551224648e-05, + "loss": 0.8409, + "step": 7357 + }, + { + "epoch": 3.3354487760652765, + "grad_norm": 0.16545551273115316, + "learning_rate": 2.4102583677382276e-05, + "loss": 0.8735, + "step": 7358 + }, + { + "epoch": 3.3359020852221217, + "grad_norm": 0.21617974878378216, + "learning_rate": 2.4090968395712864e-05, + "loss": 0.8807, + "step": 7359 + }, + { + "epoch": 3.3363553943789666, + "grad_norm": 0.19954472816202595, + "learning_rate": 2.407935470737973e-05, + "loss": 0.8698, + "step": 7360 + }, + { + "epoch": 3.3368087035358114, + "grad_norm": 0.18047461840520118, + "learning_rate": 2.406774261354601e-05, + "loss": 0.8799, + "step": 7361 + }, + { + "epoch": 3.337262012692656, + "grad_norm": 0.18050448595585988, + "learning_rate": 2.4056132115374692e-05, + "loss": 0.8834, + "step": 7362 + }, + { + "epoch": 3.3377153218495015, + "grad_norm": 0.16112310142392453, + "learning_rate": 2.404452321402861e-05, + "loss": 0.8667, + "step": 7363 + }, + { + "epoch": 3.3381686310063463, + "grad_norm": 0.1843941879408786, + "learning_rate": 2.403291591067041e-05, + "loss": 0.8465, + "step": 7364 + }, + { + "epoch": 3.338621940163191, + "grad_norm": 0.177188021794928, + "learning_rate": 2.4021310206462608e-05, + "loss": 0.8626, + "step": 7365 + }, + { + "epoch": 3.3390752493200364, + "grad_norm": 0.18652845947659233, + "learning_rate": 2.4009706102567553e-05, + "loss": 0.879, + "step": 7366 + }, + { + "epoch": 3.3395285584768812, + "grad_norm": 0.20489878396214775, + "learning_rate": 2.3998103600147422e-05, + "loss": 0.8773, + "step": 7367 + }, + { + "epoch": 3.339981867633726, + "grad_norm": 0.22083212453605092, + "learning_rate": 2.3986502700364246e-05, + "loss": 0.8524, + "step": 7368 + }, + { + "epoch": 3.3404351767905713, + "grad_norm": 0.21995459952993993, + "learning_rate": 2.3974903404379887e-05, + "loss": 0.8643, + "step": 7369 + }, + { + "epoch": 3.340888485947416, + "grad_norm": 0.15949731449055143, + "learning_rate": 2.3963305713356052e-05, + "loss": 0.8679, + "step": 7370 + }, + { + "epoch": 3.341341795104261, + "grad_norm": 0.18708736957314825, + "learning_rate": 2.3951709628454277e-05, + "loss": 0.8707, + "step": 7371 + }, + { + "epoch": 3.3417951042611063, + "grad_norm": 0.19903749205162952, + "learning_rate": 2.394011515083595e-05, + "loss": 0.8679, + "step": 7372 + }, + { + "epoch": 3.342248413417951, + "grad_norm": 0.2198274561265706, + "learning_rate": 2.3928522281662304e-05, + "loss": 0.8606, + "step": 7373 + }, + { + "epoch": 3.342701722574796, + "grad_norm": 0.1867468846513203, + "learning_rate": 2.3916931022094366e-05, + "loss": 0.8639, + "step": 7374 + }, + { + "epoch": 3.3431550317316407, + "grad_norm": 0.22919054544778897, + "learning_rate": 2.390534137329306e-05, + "loss": 0.8953, + "step": 7375 + }, + { + "epoch": 3.343608340888486, + "grad_norm": 0.19387773856821036, + "learning_rate": 2.3893753336419114e-05, + "loss": 0.8555, + "step": 7376 + }, + { + "epoch": 3.344061650045331, + "grad_norm": 0.24287987829428162, + "learning_rate": 2.3882166912633108e-05, + "loss": 0.8634, + "step": 7377 + }, + { + "epoch": 3.3445149592021757, + "grad_norm": 0.25249120461952407, + "learning_rate": 2.3870582103095455e-05, + "loss": 0.8649, + "step": 7378 + }, + { + "epoch": 3.344968268359021, + "grad_norm": 0.2341102607994931, + "learning_rate": 2.385899890896641e-05, + "loss": 0.8602, + "step": 7379 + }, + { + "epoch": 3.3454215775158658, + "grad_norm": 0.24116884045267623, + "learning_rate": 2.3847417331406057e-05, + "loss": 0.8652, + "step": 7380 + }, + { + "epoch": 3.3458748866727106, + "grad_norm": 0.20507410340346333, + "learning_rate": 2.3835837371574334e-05, + "loss": 0.8671, + "step": 7381 + }, + { + "epoch": 3.346328195829556, + "grad_norm": 0.22946811562919794, + "learning_rate": 2.3824259030630998e-05, + "loss": 0.8711, + "step": 7382 + }, + { + "epoch": 3.3467815049864007, + "grad_norm": 0.18128752899838677, + "learning_rate": 2.3812682309735663e-05, + "loss": 0.8596, + "step": 7383 + }, + { + "epoch": 3.3472348141432455, + "grad_norm": 0.2003297669859454, + "learning_rate": 2.3801107210047773e-05, + "loss": 0.8693, + "step": 7384 + }, + { + "epoch": 3.347688123300091, + "grad_norm": 0.18215673558485823, + "learning_rate": 2.3789533732726593e-05, + "loss": 0.8719, + "step": 7385 + }, + { + "epoch": 3.3481414324569356, + "grad_norm": 0.2026371081927297, + "learning_rate": 2.3777961878931258e-05, + "loss": 0.87, + "step": 7386 + }, + { + "epoch": 3.3485947416137805, + "grad_norm": 0.2395166034871391, + "learning_rate": 2.3766391649820718e-05, + "loss": 0.8643, + "step": 7387 + }, + { + "epoch": 3.3490480507706257, + "grad_norm": 0.2341482814739495, + "learning_rate": 2.3754823046553755e-05, + "loss": 0.8615, + "step": 7388 + }, + { + "epoch": 3.3495013599274706, + "grad_norm": 0.1886844424067505, + "learning_rate": 2.3743256070289027e-05, + "loss": 0.8652, + "step": 7389 + }, + { + "epoch": 3.3499546690843154, + "grad_norm": 0.2709012875314153, + "learning_rate": 2.3731690722184964e-05, + "loss": 0.8632, + "step": 7390 + }, + { + "epoch": 3.3504079782411607, + "grad_norm": 0.25649832351948726, + "learning_rate": 2.3720127003399896e-05, + "loss": 0.8608, + "step": 7391 + }, + { + "epoch": 3.3508612873980055, + "grad_norm": 0.2124591997166793, + "learning_rate": 2.3708564915091953e-05, + "loss": 0.8647, + "step": 7392 + }, + { + "epoch": 3.3513145965548503, + "grad_norm": 0.24983567369021387, + "learning_rate": 2.3697004458419118e-05, + "loss": 0.8522, + "step": 7393 + }, + { + "epoch": 3.3517679057116956, + "grad_norm": 0.17268122068106462, + "learning_rate": 2.3685445634539193e-05, + "loss": 0.8737, + "step": 7394 + }, + { + "epoch": 3.3522212148685404, + "grad_norm": 0.20172581727455213, + "learning_rate": 2.367388844460983e-05, + "loss": 0.8763, + "step": 7395 + }, + { + "epoch": 3.3526745240253852, + "grad_norm": 0.21872353203318345, + "learning_rate": 2.3662332889788527e-05, + "loss": 0.8731, + "step": 7396 + }, + { + "epoch": 3.3531278331822305, + "grad_norm": 0.2037231689179369, + "learning_rate": 2.3650778971232596e-05, + "loss": 0.8574, + "step": 7397 + }, + { + "epoch": 3.3535811423390753, + "grad_norm": 0.21459331830167439, + "learning_rate": 2.3639226690099206e-05, + "loss": 0.8731, + "step": 7398 + }, + { + "epoch": 3.35403445149592, + "grad_norm": 0.2558054385179132, + "learning_rate": 2.3627676047545344e-05, + "loss": 0.8618, + "step": 7399 + }, + { + "epoch": 3.354487760652765, + "grad_norm": 0.21232182859249366, + "learning_rate": 2.3616127044727842e-05, + "loss": 0.8771, + "step": 7400 + }, + { + "epoch": 3.3549410698096103, + "grad_norm": 0.204634373559339, + "learning_rate": 2.360457968280337e-05, + "loss": 0.8585, + "step": 7401 + }, + { + "epoch": 3.355394378966455, + "grad_norm": 0.2501078674095112, + "learning_rate": 2.3593033962928423e-05, + "loss": 0.8612, + "step": 7402 + }, + { + "epoch": 3.3558476881233, + "grad_norm": 0.1806040038598417, + "learning_rate": 2.358148988625935e-05, + "loss": 0.8708, + "step": 7403 + }, + { + "epoch": 3.356300997280145, + "grad_norm": 0.23622545208827514, + "learning_rate": 2.3569947453952324e-05, + "loss": 0.8759, + "step": 7404 + }, + { + "epoch": 3.35675430643699, + "grad_norm": 0.21687662497839733, + "learning_rate": 2.3558406667163337e-05, + "loss": 0.8778, + "step": 7405 + }, + { + "epoch": 3.357207615593835, + "grad_norm": 0.2774970759241026, + "learning_rate": 2.354686752704824e-05, + "loss": 0.8658, + "step": 7406 + }, + { + "epoch": 3.35766092475068, + "grad_norm": 0.26984403554044495, + "learning_rate": 2.3535330034762724e-05, + "loss": 0.8732, + "step": 7407 + }, + { + "epoch": 3.358114233907525, + "grad_norm": 0.18547867862586442, + "learning_rate": 2.352379419146229e-05, + "loss": 0.8576, + "step": 7408 + }, + { + "epoch": 3.3585675430643698, + "grad_norm": 0.24821102663475797, + "learning_rate": 2.3512259998302288e-05, + "loss": 0.8631, + "step": 7409 + }, + { + "epoch": 3.359020852221215, + "grad_norm": 0.19678993661593217, + "learning_rate": 2.3500727456437904e-05, + "loss": 0.877, + "step": 7410 + }, + { + "epoch": 3.35947416137806, + "grad_norm": 0.22606500707321076, + "learning_rate": 2.3489196567024175e-05, + "loss": 0.8695, + "step": 7411 + }, + { + "epoch": 3.3599274705349047, + "grad_norm": 0.18882672352257138, + "learning_rate": 2.347766733121592e-05, + "loss": 0.853, + "step": 7412 + }, + { + "epoch": 3.3603807796917495, + "grad_norm": 0.1940982161568819, + "learning_rate": 2.3466139750167837e-05, + "loss": 0.8616, + "step": 7413 + }, + { + "epoch": 3.360834088848595, + "grad_norm": 0.19009118497056912, + "learning_rate": 2.3454613825034444e-05, + "loss": 0.8852, + "step": 7414 + }, + { + "epoch": 3.3612873980054396, + "grad_norm": 0.19438061184982935, + "learning_rate": 2.3443089556970112e-05, + "loss": 0.8607, + "step": 7415 + }, + { + "epoch": 3.3617407071622845, + "grad_norm": 0.2011566913408708, + "learning_rate": 2.3431566947129017e-05, + "loss": 0.8699, + "step": 7416 + }, + { + "epoch": 3.3621940163191297, + "grad_norm": 0.18734744372917567, + "learning_rate": 2.3420045996665186e-05, + "loss": 0.8751, + "step": 7417 + }, + { + "epoch": 3.3626473254759746, + "grad_norm": 0.1808277257946062, + "learning_rate": 2.3408526706732476e-05, + "loss": 0.8462, + "step": 7418 + }, + { + "epoch": 3.3631006346328194, + "grad_norm": 0.1989359609309082, + "learning_rate": 2.3397009078484587e-05, + "loss": 0.8696, + "step": 7419 + }, + { + "epoch": 3.3635539437896647, + "grad_norm": 0.21067931947981686, + "learning_rate": 2.338549311307502e-05, + "loss": 0.8603, + "step": 7420 + }, + { + "epoch": 3.3640072529465095, + "grad_norm": 0.1759165379699468, + "learning_rate": 2.3373978811657155e-05, + "loss": 0.8727, + "step": 7421 + }, + { + "epoch": 3.3644605621033543, + "grad_norm": 0.21974659424340098, + "learning_rate": 2.336246617538417e-05, + "loss": 0.8612, + "step": 7422 + }, + { + "epoch": 3.3649138712601996, + "grad_norm": 0.19209284509376004, + "learning_rate": 2.3350955205409092e-05, + "loss": 0.8685, + "step": 7423 + }, + { + "epoch": 3.3653671804170444, + "grad_norm": 0.18115003896541887, + "learning_rate": 2.333944590288479e-05, + "loss": 0.8766, + "step": 7424 + }, + { + "epoch": 3.3658204895738892, + "grad_norm": 0.2020460377167516, + "learning_rate": 2.3327938268963947e-05, + "loss": 0.8413, + "step": 7425 + }, + { + "epoch": 3.3662737987307345, + "grad_norm": 0.19734650860748784, + "learning_rate": 2.331643230479907e-05, + "loss": 0.8786, + "step": 7426 + }, + { + "epoch": 3.3667271078875793, + "grad_norm": 0.2323050860076111, + "learning_rate": 2.3304928011542546e-05, + "loss": 0.8671, + "step": 7427 + }, + { + "epoch": 3.367180417044424, + "grad_norm": 0.2630722282305213, + "learning_rate": 2.3293425390346533e-05, + "loss": 0.8583, + "step": 7428 + }, + { + "epoch": 3.3676337262012694, + "grad_norm": 0.20565599413172445, + "learning_rate": 2.328192444236308e-05, + "loss": 0.8695, + "step": 7429 + }, + { + "epoch": 3.3680870353581143, + "grad_norm": 0.303351467252471, + "learning_rate": 2.3270425168744025e-05, + "loss": 0.8477, + "step": 7430 + }, + { + "epoch": 3.368540344514959, + "grad_norm": 0.26461492030266315, + "learning_rate": 2.3258927570641064e-05, + "loss": 0.879, + "step": 7431 + }, + { + "epoch": 3.3689936536718044, + "grad_norm": 0.2298735742580249, + "learning_rate": 2.3247431649205687e-05, + "loss": 0.886, + "step": 7432 + }, + { + "epoch": 3.369446962828649, + "grad_norm": 0.26033882821781745, + "learning_rate": 2.3235937405589282e-05, + "loss": 0.8689, + "step": 7433 + }, + { + "epoch": 3.369900271985494, + "grad_norm": 0.16336328375863335, + "learning_rate": 2.3224444840943e-05, + "loss": 0.8473, + "step": 7434 + }, + { + "epoch": 3.3703535811423393, + "grad_norm": 0.27477713307106594, + "learning_rate": 2.3212953956417885e-05, + "loss": 0.8624, + "step": 7435 + }, + { + "epoch": 3.370806890299184, + "grad_norm": 0.2181918083460668, + "learning_rate": 2.3201464753164752e-05, + "loss": 0.8811, + "step": 7436 + }, + { + "epoch": 3.371260199456029, + "grad_norm": 0.27588547352358184, + "learning_rate": 2.3189977232334307e-05, + "loss": 0.8743, + "step": 7437 + }, + { + "epoch": 3.371713508612874, + "grad_norm": 0.23573734801351376, + "learning_rate": 2.317849139507703e-05, + "loss": 0.8778, + "step": 7438 + }, + { + "epoch": 3.372166817769719, + "grad_norm": 0.2472013573139642, + "learning_rate": 2.3167007242543296e-05, + "loss": 0.8732, + "step": 7439 + }, + { + "epoch": 3.372620126926564, + "grad_norm": 0.36694398535801076, + "learning_rate": 2.3155524775883258e-05, + "loss": 0.8615, + "step": 7440 + }, + { + "epoch": 3.3730734360834087, + "grad_norm": 0.28697346873489027, + "learning_rate": 2.31440439962469e-05, + "loss": 0.8724, + "step": 7441 + }, + { + "epoch": 3.373526745240254, + "grad_norm": 0.19676341700838945, + "learning_rate": 2.313256490478409e-05, + "loss": 0.877, + "step": 7442 + }, + { + "epoch": 3.373980054397099, + "grad_norm": 0.23999952474616656, + "learning_rate": 2.3121087502644464e-05, + "loss": 0.863, + "step": 7443 + }, + { + "epoch": 3.3744333635539436, + "grad_norm": 0.2725024423702691, + "learning_rate": 2.310961179097755e-05, + "loss": 0.8557, + "step": 7444 + }, + { + "epoch": 3.374886672710789, + "grad_norm": 0.23494989240474498, + "learning_rate": 2.3098137770932634e-05, + "loss": 0.853, + "step": 7445 + }, + { + "epoch": 3.3753399818676337, + "grad_norm": 0.2637085146031462, + "learning_rate": 2.3086665443658908e-05, + "loss": 0.8819, + "step": 7446 + }, + { + "epoch": 3.3757932910244786, + "grad_norm": 0.26118478706564235, + "learning_rate": 2.307519481030534e-05, + "loss": 0.8836, + "step": 7447 + }, + { + "epoch": 3.376246600181324, + "grad_norm": 0.2873131136009189, + "learning_rate": 2.3063725872020757e-05, + "loss": 0.853, + "step": 7448 + }, + { + "epoch": 3.3766999093381687, + "grad_norm": 0.27211193513788245, + "learning_rate": 2.3052258629953807e-05, + "loss": 0.877, + "step": 7449 + }, + { + "epoch": 3.3771532184950135, + "grad_norm": 0.1959094392025743, + "learning_rate": 2.3040793085252964e-05, + "loss": 0.8742, + "step": 7450 + }, + { + "epoch": 3.3776065276518588, + "grad_norm": 0.3094167304418335, + "learning_rate": 2.302932923906652e-05, + "loss": 0.8563, + "step": 7451 + }, + { + "epoch": 3.3780598368087036, + "grad_norm": 0.3244158517640174, + "learning_rate": 2.3017867092542644e-05, + "loss": 0.8757, + "step": 7452 + }, + { + "epoch": 3.3785131459655484, + "grad_norm": 0.24659264148629498, + "learning_rate": 2.300640664682927e-05, + "loss": 0.8613, + "step": 7453 + }, + { + "epoch": 3.3789664551223932, + "grad_norm": 0.23067583437740566, + "learning_rate": 2.2994947903074234e-05, + "loss": 0.8745, + "step": 7454 + }, + { + "epoch": 3.3794197642792385, + "grad_norm": 0.2986831280045326, + "learning_rate": 2.298349086242514e-05, + "loss": 0.8646, + "step": 7455 + }, + { + "epoch": 3.3798730734360833, + "grad_norm": 0.27812657936852364, + "learning_rate": 2.2972035526029428e-05, + "loss": 0.8466, + "step": 7456 + }, + { + "epoch": 3.380326382592928, + "grad_norm": 0.1970299267220464, + "learning_rate": 2.2960581895034423e-05, + "loss": 0.8636, + "step": 7457 + }, + { + "epoch": 3.3807796917497734, + "grad_norm": 0.210424312270876, + "learning_rate": 2.2949129970587203e-05, + "loss": 0.8474, + "step": 7458 + }, + { + "epoch": 3.3812330009066183, + "grad_norm": 0.2585840369714823, + "learning_rate": 2.293767975383474e-05, + "loss": 0.8805, + "step": 7459 + }, + { + "epoch": 3.381686310063463, + "grad_norm": 0.2176418430021887, + "learning_rate": 2.2926231245923784e-05, + "loss": 0.8661, + "step": 7460 + }, + { + "epoch": 3.3821396192203084, + "grad_norm": 0.20230576807926184, + "learning_rate": 2.2914784448000963e-05, + "loss": 0.8617, + "step": 7461 + }, + { + "epoch": 3.382592928377153, + "grad_norm": 0.2569783425545793, + "learning_rate": 2.2903339361212672e-05, + "loss": 0.8708, + "step": 7462 + }, + { + "epoch": 3.383046237533998, + "grad_norm": 0.2186063666332499, + "learning_rate": 2.2891895986705207e-05, + "loss": 0.866, + "step": 7463 + }, + { + "epoch": 3.3834995466908433, + "grad_norm": 0.27078511199807215, + "learning_rate": 2.2880454325624625e-05, + "loss": 0.8842, + "step": 7464 + }, + { + "epoch": 3.383952855847688, + "grad_norm": 0.2497872654968895, + "learning_rate": 2.286901437911687e-05, + "loss": 0.8856, + "step": 7465 + }, + { + "epoch": 3.384406165004533, + "grad_norm": 0.20986508749779573, + "learning_rate": 2.2857576148327674e-05, + "loss": 0.8515, + "step": 7466 + }, + { + "epoch": 3.3848594741613782, + "grad_norm": 0.21837244220695784, + "learning_rate": 2.2846139634402586e-05, + "loss": 0.8721, + "step": 7467 + }, + { + "epoch": 3.385312783318223, + "grad_norm": 0.19057350836318765, + "learning_rate": 2.2834704838487047e-05, + "loss": 0.8682, + "step": 7468 + }, + { + "epoch": 3.385766092475068, + "grad_norm": 0.21104312827764415, + "learning_rate": 2.282327176172627e-05, + "loss": 0.8557, + "step": 7469 + }, + { + "epoch": 3.386219401631913, + "grad_norm": 0.22793510555738497, + "learning_rate": 2.2811840405265306e-05, + "loss": 0.8589, + "step": 7470 + }, + { + "epoch": 3.386672710788758, + "grad_norm": 0.1946928804485043, + "learning_rate": 2.280041077024902e-05, + "loss": 0.8651, + "step": 7471 + }, + { + "epoch": 3.387126019945603, + "grad_norm": 0.19920453007978534, + "learning_rate": 2.2788982857822167e-05, + "loss": 0.8948, + "step": 7472 + }, + { + "epoch": 3.387579329102448, + "grad_norm": 0.22700903089996052, + "learning_rate": 2.277755666912924e-05, + "loss": 0.852, + "step": 7473 + }, + { + "epoch": 3.388032638259293, + "grad_norm": 0.169270608016195, + "learning_rate": 2.2766132205314647e-05, + "loss": 0.8808, + "step": 7474 + }, + { + "epoch": 3.3884859474161377, + "grad_norm": 0.20182156293689776, + "learning_rate": 2.2754709467522548e-05, + "loss": 0.8564, + "step": 7475 + }, + { + "epoch": 3.388939256572983, + "grad_norm": 1.5602415528511289, + "learning_rate": 2.2743288456896995e-05, + "loss": 0.8755, + "step": 7476 + }, + { + "epoch": 3.389392565729828, + "grad_norm": 0.23273044034996854, + "learning_rate": 2.2731869174581802e-05, + "loss": 0.8642, + "step": 7477 + }, + { + "epoch": 3.3898458748866727, + "grad_norm": 0.2531043586666865, + "learning_rate": 2.272045162172068e-05, + "loss": 0.8561, + "step": 7478 + }, + { + "epoch": 3.3902991840435175, + "grad_norm": 0.21897949994681676, + "learning_rate": 2.270903579945709e-05, + "loss": 0.9056, + "step": 7479 + }, + { + "epoch": 3.3907524932003628, + "grad_norm": 0.2863465444316875, + "learning_rate": 2.2697621708934405e-05, + "loss": 0.8756, + "step": 7480 + }, + { + "epoch": 3.3912058023572076, + "grad_norm": 0.20299795733572118, + "learning_rate": 2.2686209351295752e-05, + "loss": 0.8722, + "step": 7481 + }, + { + "epoch": 3.3916591115140524, + "grad_norm": 0.25150488179257974, + "learning_rate": 2.2674798727684104e-05, + "loss": 0.8685, + "step": 7482 + }, + { + "epoch": 3.3921124206708977, + "grad_norm": 0.22187266441207362, + "learning_rate": 2.2663389839242294e-05, + "loss": 0.8597, + "step": 7483 + }, + { + "epoch": 3.3925657298277425, + "grad_norm": 0.28991145299765436, + "learning_rate": 2.2651982687112934e-05, + "loss": 0.859, + "step": 7484 + }, + { + "epoch": 3.3930190389845873, + "grad_norm": 0.2836074973650536, + "learning_rate": 2.2640577272438497e-05, + "loss": 0.8526, + "step": 7485 + }, + { + "epoch": 3.3934723481414326, + "grad_norm": 0.2768097583377648, + "learning_rate": 2.2629173596361254e-05, + "loss": 0.8829, + "step": 7486 + }, + { + "epoch": 3.3939256572982774, + "grad_norm": 0.21753831598239579, + "learning_rate": 2.2617771660023346e-05, + "loss": 0.866, + "step": 7487 + }, + { + "epoch": 3.3943789664551223, + "grad_norm": 0.18712235084581705, + "learning_rate": 2.2606371464566694e-05, + "loss": 0.8714, + "step": 7488 + }, + { + "epoch": 3.3948322756119675, + "grad_norm": 0.2331169066466924, + "learning_rate": 2.2594973011133048e-05, + "loss": 0.8455, + "step": 7489 + }, + { + "epoch": 3.3952855847688124, + "grad_norm": 0.1813034668852031, + "learning_rate": 2.2583576300863998e-05, + "loss": 0.8601, + "step": 7490 + }, + { + "epoch": 3.395738893925657, + "grad_norm": 0.2099933340929198, + "learning_rate": 2.2572181334900983e-05, + "loss": 0.8593, + "step": 7491 + }, + { + "epoch": 3.396192203082502, + "grad_norm": 0.24652328918889627, + "learning_rate": 2.2560788114385216e-05, + "loss": 0.8591, + "step": 7492 + }, + { + "epoch": 3.3966455122393473, + "grad_norm": 0.24434066949868113, + "learning_rate": 2.254939664045778e-05, + "loss": 0.8727, + "step": 7493 + }, + { + "epoch": 3.397098821396192, + "grad_norm": 0.2516837246828894, + "learning_rate": 2.253800691425954e-05, + "loss": 0.8645, + "step": 7494 + }, + { + "epoch": 3.397552130553037, + "grad_norm": 0.19671606612033465, + "learning_rate": 2.252661893693125e-05, + "loss": 0.8765, + "step": 7495 + }, + { + "epoch": 3.3980054397098822, + "grad_norm": 0.24083650934037068, + "learning_rate": 2.2515232709613407e-05, + "loss": 0.871, + "step": 7496 + }, + { + "epoch": 3.398458748866727, + "grad_norm": 0.2661652833631995, + "learning_rate": 2.2503848233446415e-05, + "loss": 0.8717, + "step": 7497 + }, + { + "epoch": 3.398912058023572, + "grad_norm": 0.19327705224451974, + "learning_rate": 2.249246550957044e-05, + "loss": 0.8638, + "step": 7498 + }, + { + "epoch": 3.399365367180417, + "grad_norm": 0.18628949831900254, + "learning_rate": 2.248108453912548e-05, + "loss": 0.8732, + "step": 7499 + }, + { + "epoch": 3.399818676337262, + "grad_norm": 0.18369692488757752, + "learning_rate": 2.2469705323251408e-05, + "loss": 0.8787, + "step": 7500 + }, + { + "epoch": 3.400271985494107, + "grad_norm": 0.20167547279819462, + "learning_rate": 2.2458327863087856e-05, + "loss": 0.8531, + "step": 7501 + }, + { + "epoch": 3.400725294650952, + "grad_norm": 0.17934823211631115, + "learning_rate": 2.244695215977433e-05, + "loss": 0.8779, + "step": 7502 + }, + { + "epoch": 3.401178603807797, + "grad_norm": 0.18757977220987573, + "learning_rate": 2.2435578214450127e-05, + "loss": 0.8694, + "step": 7503 + }, + { + "epoch": 3.4016319129646417, + "grad_norm": 0.20371400599829687, + "learning_rate": 2.2424206028254398e-05, + "loss": 0.8497, + "step": 7504 + }, + { + "epoch": 3.402085222121487, + "grad_norm": 0.21166887826962263, + "learning_rate": 2.241283560232607e-05, + "loss": 0.857, + "step": 7505 + }, + { + "epoch": 3.402538531278332, + "grad_norm": 0.22557137424565774, + "learning_rate": 2.2401466937803962e-05, + "loss": 0.8712, + "step": 7506 + }, + { + "epoch": 3.4029918404351767, + "grad_norm": 0.19265723685370417, + "learning_rate": 2.2390100035826662e-05, + "loss": 0.8923, + "step": 7507 + }, + { + "epoch": 3.403445149592022, + "grad_norm": 0.18675757750604674, + "learning_rate": 2.23787348975326e-05, + "loss": 0.8441, + "step": 7508 + }, + { + "epoch": 3.4038984587488668, + "grad_norm": 0.16339257272655064, + "learning_rate": 2.2367371524060005e-05, + "loss": 0.8661, + "step": 7509 + }, + { + "epoch": 3.4043517679057116, + "grad_norm": 0.20310807004788642, + "learning_rate": 2.2356009916546995e-05, + "loss": 0.8449, + "step": 7510 + }, + { + "epoch": 3.404805077062557, + "grad_norm": 0.23596182687487793, + "learning_rate": 2.234465007613143e-05, + "loss": 0.8765, + "step": 7511 + }, + { + "epoch": 3.4052583862194017, + "grad_norm": 0.18430061660226768, + "learning_rate": 2.2333292003951063e-05, + "loss": 0.8618, + "step": 7512 + }, + { + "epoch": 3.4057116953762465, + "grad_norm": 0.23571233820793622, + "learning_rate": 2.232193570114342e-05, + "loss": 0.8782, + "step": 7513 + }, + { + "epoch": 3.406165004533092, + "grad_norm": 0.19154506635183807, + "learning_rate": 2.2310581168845858e-05, + "loss": 0.8813, + "step": 7514 + }, + { + "epoch": 3.4066183136899366, + "grad_norm": 0.19881056403893754, + "learning_rate": 2.2299228408195597e-05, + "loss": 0.8594, + "step": 7515 + }, + { + "epoch": 3.4070716228467814, + "grad_norm": 0.21438882332799422, + "learning_rate": 2.2287877420329615e-05, + "loss": 0.8632, + "step": 7516 + }, + { + "epoch": 3.4075249320036267, + "grad_norm": 0.1864135660103843, + "learning_rate": 2.227652820638478e-05, + "loss": 0.8625, + "step": 7517 + }, + { + "epoch": 3.4079782411604715, + "grad_norm": 0.2041717485446374, + "learning_rate": 2.226518076749772e-05, + "loss": 0.8574, + "step": 7518 + }, + { + "epoch": 3.4084315503173164, + "grad_norm": 0.18349647778942216, + "learning_rate": 2.2253835104804944e-05, + "loss": 0.8591, + "step": 7519 + }, + { + "epoch": 3.408884859474161, + "grad_norm": 0.19687826154870955, + "learning_rate": 2.2242491219442718e-05, + "loss": 0.8593, + "step": 7520 + }, + { + "epoch": 3.4093381686310065, + "grad_norm": 0.1703881281800158, + "learning_rate": 2.22311491125472e-05, + "loss": 0.8688, + "step": 7521 + }, + { + "epoch": 3.4097914777878513, + "grad_norm": 0.22531627541330432, + "learning_rate": 2.2219808785254307e-05, + "loss": 0.8759, + "step": 7522 + }, + { + "epoch": 3.410244786944696, + "grad_norm": 0.15709312436216172, + "learning_rate": 2.2208470238699835e-05, + "loss": 0.8911, + "step": 7523 + }, + { + "epoch": 3.4106980961015414, + "grad_norm": 0.20946311012975138, + "learning_rate": 2.2197133474019354e-05, + "loss": 0.8632, + "step": 7524 + }, + { + "epoch": 3.4111514052583862, + "grad_norm": 0.20054093347366536, + "learning_rate": 2.2185798492348266e-05, + "loss": 0.8587, + "step": 7525 + }, + { + "epoch": 3.411604714415231, + "grad_norm": 0.13993384464595507, + "learning_rate": 2.2174465294821825e-05, + "loss": 0.8598, + "step": 7526 + }, + { + "epoch": 3.4120580235720763, + "grad_norm": 0.2127651584506655, + "learning_rate": 2.2163133882575075e-05, + "loss": 0.8856, + "step": 7527 + }, + { + "epoch": 3.412511332728921, + "grad_norm": 0.1716287329587766, + "learning_rate": 2.215180425674289e-05, + "loss": 0.8714, + "step": 7528 + }, + { + "epoch": 3.412964641885766, + "grad_norm": 0.19515263313282585, + "learning_rate": 2.214047641845995e-05, + "loss": 0.8693, + "step": 7529 + }, + { + "epoch": 3.4134179510426113, + "grad_norm": 0.22533691811542883, + "learning_rate": 2.2129150368860793e-05, + "loss": 0.8421, + "step": 7530 + }, + { + "epoch": 3.413871260199456, + "grad_norm": 0.18575002686018655, + "learning_rate": 2.2117826109079735e-05, + "loss": 0.8695, + "step": 7531 + }, + { + "epoch": 3.414324569356301, + "grad_norm": 0.24348247998780753, + "learning_rate": 2.2106503640250965e-05, + "loss": 0.8559, + "step": 7532 + }, + { + "epoch": 3.4147778785131457, + "grad_norm": 0.19591714037489133, + "learning_rate": 2.2095182963508426e-05, + "loss": 0.8683, + "step": 7533 + }, + { + "epoch": 3.415231187669991, + "grad_norm": 0.245795393989745, + "learning_rate": 2.2083864079985953e-05, + "loss": 0.8831, + "step": 7534 + }, + { + "epoch": 3.415684496826836, + "grad_norm": 0.22895538026283682, + "learning_rate": 2.207254699081713e-05, + "loss": 0.8481, + "step": 7535 + }, + { + "epoch": 3.4161378059836807, + "grad_norm": 0.20481855338518917, + "learning_rate": 2.2061231697135427e-05, + "loss": 0.8584, + "step": 7536 + }, + { + "epoch": 3.416591115140526, + "grad_norm": 0.25919787448378867, + "learning_rate": 2.2049918200074078e-05, + "loss": 0.8516, + "step": 7537 + }, + { + "epoch": 3.4170444242973708, + "grad_norm": 0.22723751521500674, + "learning_rate": 2.203860650076619e-05, + "loss": 0.8695, + "step": 7538 + }, + { + "epoch": 3.4174977334542156, + "grad_norm": 0.23517981350326186, + "learning_rate": 2.2027296600344648e-05, + "loss": 0.8619, + "step": 7539 + }, + { + "epoch": 3.417951042611061, + "grad_norm": 0.2801979712493573, + "learning_rate": 2.2015988499942157e-05, + "loss": 0.8582, + "step": 7540 + }, + { + "epoch": 3.4184043517679057, + "grad_norm": 0.18765723111513163, + "learning_rate": 2.2004682200691288e-05, + "loss": 0.872, + "step": 7541 + }, + { + "epoch": 3.4188576609247505, + "grad_norm": 0.2027335933817281, + "learning_rate": 2.199337770372437e-05, + "loss": 0.8637, + "step": 7542 + }, + { + "epoch": 3.419310970081596, + "grad_norm": 0.21466016396310136, + "learning_rate": 2.1982075010173603e-05, + "loss": 0.862, + "step": 7543 + }, + { + "epoch": 3.4197642792384406, + "grad_norm": 0.23814779528285293, + "learning_rate": 2.1970774121170972e-05, + "loss": 0.8523, + "step": 7544 + }, + { + "epoch": 3.4202175883952854, + "grad_norm": 0.2814140169330891, + "learning_rate": 2.1959475037848323e-05, + "loss": 0.8609, + "step": 7545 + }, + { + "epoch": 3.4206708975521307, + "grad_norm": 0.17861801970021451, + "learning_rate": 2.1948177761337243e-05, + "loss": 0.8506, + "step": 7546 + }, + { + "epoch": 3.4211242067089755, + "grad_norm": 0.2463636506993777, + "learning_rate": 2.1936882292769228e-05, + "loss": 0.8616, + "step": 7547 + }, + { + "epoch": 3.4215775158658204, + "grad_norm": 0.28555354178531417, + "learning_rate": 2.192558863327552e-05, + "loss": 0.8655, + "step": 7548 + }, + { + "epoch": 3.4220308250226656, + "grad_norm": 0.19673859402943472, + "learning_rate": 2.191429678398725e-05, + "loss": 0.8756, + "step": 7549 + }, + { + "epoch": 3.4224841341795105, + "grad_norm": 0.2689886194895122, + "learning_rate": 2.190300674603529e-05, + "loss": 0.868, + "step": 7550 + }, + { + "epoch": 3.4229374433363553, + "grad_norm": 0.18658391675758662, + "learning_rate": 2.1891718520550408e-05, + "loss": 0.8633, + "step": 7551 + }, + { + "epoch": 3.4233907524932006, + "grad_norm": 0.24874595360097349, + "learning_rate": 2.1880432108663118e-05, + "loss": 0.8846, + "step": 7552 + }, + { + "epoch": 3.4238440616500454, + "grad_norm": 0.2852602618979654, + "learning_rate": 2.1869147511503822e-05, + "loss": 0.8625, + "step": 7553 + }, + { + "epoch": 3.4242973708068902, + "grad_norm": 0.20977086814288023, + "learning_rate": 2.1857864730202687e-05, + "loss": 0.8532, + "step": 7554 + }, + { + "epoch": 3.4247506799637355, + "grad_norm": 0.19643557157288696, + "learning_rate": 2.1846583765889707e-05, + "loss": 0.8806, + "step": 7555 + }, + { + "epoch": 3.4252039891205803, + "grad_norm": 0.251353539268238, + "learning_rate": 2.1835304619694727e-05, + "loss": 0.8778, + "step": 7556 + }, + { + "epoch": 3.425657298277425, + "grad_norm": 0.1977923776887601, + "learning_rate": 2.1824027292747357e-05, + "loss": 0.8597, + "step": 7557 + }, + { + "epoch": 3.42611060743427, + "grad_norm": 0.17527931671152153, + "learning_rate": 2.1812751786177087e-05, + "loss": 0.8474, + "step": 7558 + }, + { + "epoch": 3.4265639165911153, + "grad_norm": 0.20850206948582908, + "learning_rate": 2.1801478101113167e-05, + "loss": 0.8553, + "step": 7559 + }, + { + "epoch": 3.42701722574796, + "grad_norm": 0.2417711465447516, + "learning_rate": 2.1790206238684713e-05, + "loss": 0.8757, + "step": 7560 + }, + { + "epoch": 3.427470534904805, + "grad_norm": 0.18479468316740705, + "learning_rate": 2.17789362000206e-05, + "loss": 0.8751, + "step": 7561 + }, + { + "epoch": 3.42792384406165, + "grad_norm": 0.19686289218722064, + "learning_rate": 2.17676679862496e-05, + "loss": 0.8676, + "step": 7562 + }, + { + "epoch": 3.428377153218495, + "grad_norm": 0.20399424265108831, + "learning_rate": 2.175640159850022e-05, + "loss": 0.8642, + "step": 7563 + }, + { + "epoch": 3.42883046237534, + "grad_norm": 0.2077835441553632, + "learning_rate": 2.174513703790085e-05, + "loss": 0.8852, + "step": 7564 + }, + { + "epoch": 3.429283771532185, + "grad_norm": 0.19970637607706057, + "learning_rate": 2.1733874305579656e-05, + "loss": 0.8774, + "step": 7565 + }, + { + "epoch": 3.42973708068903, + "grad_norm": 0.1864189108252847, + "learning_rate": 2.172261340266463e-05, + "loss": 0.854, + "step": 7566 + }, + { + "epoch": 3.4301903898458748, + "grad_norm": 0.19536602490885868, + "learning_rate": 2.1711354330283575e-05, + "loss": 0.8771, + "step": 7567 + }, + { + "epoch": 3.43064369900272, + "grad_norm": 0.17974477861946822, + "learning_rate": 2.170009708956415e-05, + "loss": 0.8591, + "step": 7568 + }, + { + "epoch": 3.431097008159565, + "grad_norm": 0.2132146282140445, + "learning_rate": 2.168884168163378e-05, + "loss": 0.8704, + "step": 7569 + }, + { + "epoch": 3.4315503173164097, + "grad_norm": 0.24420145135499968, + "learning_rate": 2.167758810761972e-05, + "loss": 0.8589, + "step": 7570 + }, + { + "epoch": 3.4320036264732545, + "grad_norm": 0.2194910923764329, + "learning_rate": 2.1666336368649072e-05, + "loss": 0.88, + "step": 7571 + }, + { + "epoch": 3.4324569356301, + "grad_norm": 0.21536208302771778, + "learning_rate": 2.1655086465848704e-05, + "loss": 0.8575, + "step": 7572 + }, + { + "epoch": 3.4329102447869446, + "grad_norm": 0.24690314108794084, + "learning_rate": 2.1643838400345357e-05, + "loss": 0.8625, + "step": 7573 + }, + { + "epoch": 3.4333635539437894, + "grad_norm": 0.25309506472751636, + "learning_rate": 2.1632592173265524e-05, + "loss": 0.8563, + "step": 7574 + }, + { + "epoch": 3.4338168631006347, + "grad_norm": 0.21628365836005467, + "learning_rate": 2.1621347785735586e-05, + "loss": 0.8575, + "step": 7575 + }, + { + "epoch": 3.4342701722574795, + "grad_norm": 0.1882119746422609, + "learning_rate": 2.1610105238881664e-05, + "loss": 0.8633, + "step": 7576 + }, + { + "epoch": 3.4347234814143244, + "grad_norm": 0.23618757158784798, + "learning_rate": 2.1598864533829764e-05, + "loss": 0.8606, + "step": 7577 + }, + { + "epoch": 3.4351767905711696, + "grad_norm": 0.22718519409027466, + "learning_rate": 2.1587625671705643e-05, + "loss": 0.877, + "step": 7578 + }, + { + "epoch": 3.4356300997280145, + "grad_norm": 0.20953803706078505, + "learning_rate": 2.1576388653634942e-05, + "loss": 0.892, + "step": 7579 + }, + { + "epoch": 3.4360834088848593, + "grad_norm": 0.23188512620471546, + "learning_rate": 2.1565153480743066e-05, + "loss": 0.8448, + "step": 7580 + }, + { + "epoch": 3.4365367180417046, + "grad_norm": 0.24001957367344218, + "learning_rate": 2.155392015415523e-05, + "loss": 0.8693, + "step": 7581 + }, + { + "epoch": 3.4369900271985494, + "grad_norm": 0.20454236868014564, + "learning_rate": 2.154268867499652e-05, + "loss": 0.8766, + "step": 7582 + }, + { + "epoch": 3.4374433363553942, + "grad_norm": 0.22499784349363375, + "learning_rate": 2.1531459044391763e-05, + "loss": 0.8639, + "step": 7583 + }, + { + "epoch": 3.4378966455122395, + "grad_norm": 0.24397326055813243, + "learning_rate": 2.1520231263465698e-05, + "loss": 0.8644, + "step": 7584 + }, + { + "epoch": 3.4383499546690843, + "grad_norm": 0.23651929045697453, + "learning_rate": 2.1509005333342746e-05, + "loss": 0.8646, + "step": 7585 + }, + { + "epoch": 3.438803263825929, + "grad_norm": 0.19615226003754233, + "learning_rate": 2.1497781255147273e-05, + "loss": 0.8757, + "step": 7586 + }, + { + "epoch": 3.4392565729827744, + "grad_norm": 0.21529551086885773, + "learning_rate": 2.1486559030003367e-05, + "loss": 0.8838, + "step": 7587 + }, + { + "epoch": 3.4397098821396193, + "grad_norm": 0.17212709423897182, + "learning_rate": 2.1475338659035003e-05, + "loss": 0.8646, + "step": 7588 + }, + { + "epoch": 3.440163191296464, + "grad_norm": 0.21919293684501773, + "learning_rate": 2.1464120143365898e-05, + "loss": 0.8708, + "step": 7589 + }, + { + "epoch": 3.4406165004533094, + "grad_norm": 0.2177308132052484, + "learning_rate": 2.1452903484119652e-05, + "loss": 0.8524, + "step": 7590 + }, + { + "epoch": 3.441069809610154, + "grad_norm": 0.20385299031901108, + "learning_rate": 2.1441688682419618e-05, + "loss": 0.8613, + "step": 7591 + }, + { + "epoch": 3.441523118766999, + "grad_norm": 0.1894404916095677, + "learning_rate": 2.143047573938901e-05, + "loss": 0.8873, + "step": 7592 + }, + { + "epoch": 3.4419764279238443, + "grad_norm": 0.20096816013034421, + "learning_rate": 2.1419264656150825e-05, + "loss": 0.8705, + "step": 7593 + }, + { + "epoch": 3.442429737080689, + "grad_norm": 0.1908786101861783, + "learning_rate": 2.1408055433827907e-05, + "loss": 0.8682, + "step": 7594 + }, + { + "epoch": 3.442883046237534, + "grad_norm": 0.19092044118879808, + "learning_rate": 2.1396848073542876e-05, + "loss": 0.86, + "step": 7595 + }, + { + "epoch": 3.443336355394379, + "grad_norm": 0.2004089965277209, + "learning_rate": 2.138564257641817e-05, + "loss": 0.8836, + "step": 7596 + }, + { + "epoch": 3.443789664551224, + "grad_norm": 0.1849452121177509, + "learning_rate": 2.1374438943576083e-05, + "loss": 0.8648, + "step": 7597 + }, + { + "epoch": 3.444242973708069, + "grad_norm": 0.2015526949587346, + "learning_rate": 2.136323717613866e-05, + "loss": 0.8722, + "step": 7598 + }, + { + "epoch": 3.4446962828649137, + "grad_norm": 0.226744023198571, + "learning_rate": 2.1352037275227818e-05, + "loss": 0.8716, + "step": 7599 + }, + { + "epoch": 3.445149592021759, + "grad_norm": 0.16586865190793604, + "learning_rate": 2.1340839241965235e-05, + "loss": 0.8572, + "step": 7600 + }, + { + "epoch": 3.445602901178604, + "grad_norm": 0.18513491204992716, + "learning_rate": 2.132964307747246e-05, + "loss": 0.8701, + "step": 7601 + }, + { + "epoch": 3.4460562103354486, + "grad_norm": 0.17691645026042493, + "learning_rate": 2.131844878287078e-05, + "loss": 0.8797, + "step": 7602 + }, + { + "epoch": 3.446509519492294, + "grad_norm": 0.17872784287217755, + "learning_rate": 2.1307256359281393e-05, + "loss": 0.8732, + "step": 7603 + }, + { + "epoch": 3.4469628286491387, + "grad_norm": 0.20346857071319144, + "learning_rate": 2.129606580782518e-05, + "loss": 0.8774, + "step": 7604 + }, + { + "epoch": 3.4474161378059835, + "grad_norm": 0.1867491081860655, + "learning_rate": 2.128487712962297e-05, + "loss": 0.8637, + "step": 7605 + }, + { + "epoch": 3.447869446962829, + "grad_norm": 0.19744468719049116, + "learning_rate": 2.12736903257953e-05, + "loss": 0.866, + "step": 7606 + }, + { + "epoch": 3.4483227561196736, + "grad_norm": 0.2111420151612983, + "learning_rate": 2.12625053974626e-05, + "loss": 0.8555, + "step": 7607 + }, + { + "epoch": 3.4487760652765185, + "grad_norm": 0.2130873340637586, + "learning_rate": 2.125132234574503e-05, + "loss": 0.8584, + "step": 7608 + }, + { + "epoch": 3.4492293744333633, + "grad_norm": 0.20000628925481034, + "learning_rate": 2.1240141171762648e-05, + "loss": 0.8694, + "step": 7609 + }, + { + "epoch": 3.4496826835902086, + "grad_norm": 0.20100066364891267, + "learning_rate": 2.122896187663526e-05, + "loss": 0.8765, + "step": 7610 + }, + { + "epoch": 3.4501359927470534, + "grad_norm": 0.23301753025674113, + "learning_rate": 2.121778446148249e-05, + "loss": 0.8574, + "step": 7611 + }, + { + "epoch": 3.4505893019038982, + "grad_norm": 0.1934312961232302, + "learning_rate": 2.1206608927423824e-05, + "loss": 0.8754, + "step": 7612 + }, + { + "epoch": 3.4510426110607435, + "grad_norm": 0.192699365739108, + "learning_rate": 2.1195435275578493e-05, + "loss": 0.8727, + "step": 7613 + }, + { + "epoch": 3.4514959202175883, + "grad_norm": 0.20191425889851564, + "learning_rate": 2.11842635070656e-05, + "loss": 0.8627, + "step": 7614 + }, + { + "epoch": 3.451949229374433, + "grad_norm": 0.2102993173441929, + "learning_rate": 2.1173093623003998e-05, + "loss": 0.8564, + "step": 7615 + }, + { + "epoch": 3.4524025385312784, + "grad_norm": 0.16397675453735583, + "learning_rate": 2.1161925624512415e-05, + "loss": 0.8521, + "step": 7616 + }, + { + "epoch": 3.4528558476881233, + "grad_norm": 0.18829632170960223, + "learning_rate": 2.1150759512709333e-05, + "loss": 0.869, + "step": 7617 + }, + { + "epoch": 3.453309156844968, + "grad_norm": 0.24594892185693426, + "learning_rate": 2.1139595288713094e-05, + "loss": 0.8823, + "step": 7618 + }, + { + "epoch": 3.4537624660018134, + "grad_norm": 0.18230156225850674, + "learning_rate": 2.11284329536418e-05, + "loss": 0.8806, + "step": 7619 + }, + { + "epoch": 3.454215775158658, + "grad_norm": 0.19946390038727027, + "learning_rate": 2.111727250861343e-05, + "loss": 0.8596, + "step": 7620 + }, + { + "epoch": 3.454669084315503, + "grad_norm": 0.1963452669462669, + "learning_rate": 2.110611395474571e-05, + "loss": 0.8698, + "step": 7621 + }, + { + "epoch": 3.4551223934723483, + "grad_norm": 0.2251844356857301, + "learning_rate": 2.1094957293156205e-05, + "loss": 0.8558, + "step": 7622 + }, + { + "epoch": 3.455575702629193, + "grad_norm": 0.22513774593872196, + "learning_rate": 2.1083802524962274e-05, + "loss": 0.8742, + "step": 7623 + }, + { + "epoch": 3.456029011786038, + "grad_norm": 0.22492362137370414, + "learning_rate": 2.107264965128113e-05, + "loss": 0.8676, + "step": 7624 + }, + { + "epoch": 3.456482320942883, + "grad_norm": 0.20827246121507506, + "learning_rate": 2.1061498673229748e-05, + "loss": 0.8742, + "step": 7625 + }, + { + "epoch": 3.456935630099728, + "grad_norm": 0.2844944789584176, + "learning_rate": 2.1050349591924917e-05, + "loss": 0.8622, + "step": 7626 + }, + { + "epoch": 3.457388939256573, + "grad_norm": 0.1958514370340798, + "learning_rate": 2.1039202408483284e-05, + "loss": 0.8709, + "step": 7627 + }, + { + "epoch": 3.457842248413418, + "grad_norm": 0.21079531995528386, + "learning_rate": 2.102805712402124e-05, + "loss": 0.8589, + "step": 7628 + }, + { + "epoch": 3.458295557570263, + "grad_norm": 0.22251511217691333, + "learning_rate": 2.1016913739655046e-05, + "loss": 0.8629, + "step": 7629 + }, + { + "epoch": 3.458748866727108, + "grad_norm": 0.16297282312999897, + "learning_rate": 2.1005772256500717e-05, + "loss": 0.8731, + "step": 7630 + }, + { + "epoch": 3.459202175883953, + "grad_norm": 0.18473549537496742, + "learning_rate": 2.0994632675674128e-05, + "loss": 0.8586, + "step": 7631 + }, + { + "epoch": 3.459655485040798, + "grad_norm": 0.15417386051239354, + "learning_rate": 2.0983494998290928e-05, + "loss": 0.8798, + "step": 7632 + }, + { + "epoch": 3.4601087941976427, + "grad_norm": 0.18685802412134586, + "learning_rate": 2.09723592254666e-05, + "loss": 0.8537, + "step": 7633 + }, + { + "epoch": 3.460562103354488, + "grad_norm": 0.2065173952596365, + "learning_rate": 2.0961225358316404e-05, + "loss": 0.8663, + "step": 7634 + }, + { + "epoch": 3.461015412511333, + "grad_norm": 0.16975549913374968, + "learning_rate": 2.0950093397955457e-05, + "loss": 0.8703, + "step": 7635 + }, + { + "epoch": 3.4614687216681777, + "grad_norm": 0.1924963111585964, + "learning_rate": 2.0938963345498643e-05, + "loss": 0.8665, + "step": 7636 + }, + { + "epoch": 3.4619220308250225, + "grad_norm": 0.17525623636890478, + "learning_rate": 2.0927835202060657e-05, + "loss": 0.8408, + "step": 7637 + }, + { + "epoch": 3.4623753399818678, + "grad_norm": 0.18521270420475852, + "learning_rate": 2.0916708968756038e-05, + "loss": 0.8545, + "step": 7638 + }, + { + "epoch": 3.4628286491387126, + "grad_norm": 0.21098259421195278, + "learning_rate": 2.0905584646699086e-05, + "loss": 0.8627, + "step": 7639 + }, + { + "epoch": 3.4632819582955574, + "grad_norm": 0.164355635484537, + "learning_rate": 2.0894462237003966e-05, + "loss": 0.8552, + "step": 7640 + }, + { + "epoch": 3.4637352674524027, + "grad_norm": 0.20864377568471143, + "learning_rate": 2.0883341740784603e-05, + "loss": 0.8744, + "step": 7641 + }, + { + "epoch": 3.4641885766092475, + "grad_norm": 0.1682048884768107, + "learning_rate": 2.087222315915475e-05, + "loss": 0.8674, + "step": 7642 + }, + { + "epoch": 3.4646418857660923, + "grad_norm": 0.17918097012342793, + "learning_rate": 2.0861106493227945e-05, + "loss": 0.8556, + "step": 7643 + }, + { + "epoch": 3.4650951949229376, + "grad_norm": 0.1819500782627258, + "learning_rate": 2.0849991744117588e-05, + "loss": 0.8775, + "step": 7644 + }, + { + "epoch": 3.4655485040797824, + "grad_norm": 0.19593823648711486, + "learning_rate": 2.0838878912936825e-05, + "loss": 0.8712, + "step": 7645 + }, + { + "epoch": 3.4660018132366273, + "grad_norm": 0.17193266903499133, + "learning_rate": 2.0827768000798664e-05, + "loss": 0.8798, + "step": 7646 + }, + { + "epoch": 3.4664551223934725, + "grad_norm": 0.220190142773648, + "learning_rate": 2.0816659008815873e-05, + "loss": 0.8649, + "step": 7647 + }, + { + "epoch": 3.4669084315503174, + "grad_norm": 0.19771142350662974, + "learning_rate": 2.0805551938101073e-05, + "loss": 0.8953, + "step": 7648 + }, + { + "epoch": 3.467361740707162, + "grad_norm": 0.2604753007149304, + "learning_rate": 2.079444678976664e-05, + "loss": 0.8802, + "step": 7649 + }, + { + "epoch": 3.467815049864007, + "grad_norm": 0.1866907308360328, + "learning_rate": 2.078334356492483e-05, + "loss": 0.8559, + "step": 7650 + }, + { + "epoch": 3.4682683590208523, + "grad_norm": 0.19964368682543487, + "learning_rate": 2.077224226468763e-05, + "loss": 0.8717, + "step": 7651 + }, + { + "epoch": 3.468721668177697, + "grad_norm": 0.22416542213145355, + "learning_rate": 2.0761142890166862e-05, + "loss": 0.8636, + "step": 7652 + }, + { + "epoch": 3.469174977334542, + "grad_norm": 0.2534044277570479, + "learning_rate": 2.0750045442474195e-05, + "loss": 0.8737, + "step": 7653 + }, + { + "epoch": 3.469628286491387, + "grad_norm": 0.2038718119654201, + "learning_rate": 2.0738949922721033e-05, + "loss": 0.8558, + "step": 7654 + }, + { + "epoch": 3.470081595648232, + "grad_norm": 0.19841809957880996, + "learning_rate": 2.0727856332018658e-05, + "loss": 0.8821, + "step": 7655 + }, + { + "epoch": 3.470534904805077, + "grad_norm": 0.2422795248354855, + "learning_rate": 2.07167646714781e-05, + "loss": 0.8595, + "step": 7656 + }, + { + "epoch": 3.470988213961922, + "grad_norm": 0.5223018126219096, + "learning_rate": 2.0705674942210248e-05, + "loss": 0.8624, + "step": 7657 + }, + { + "epoch": 3.471441523118767, + "grad_norm": 0.20100346863251986, + "learning_rate": 2.069458714532574e-05, + "loss": 0.867, + "step": 7658 + }, + { + "epoch": 3.471894832275612, + "grad_norm": 0.20990446009196798, + "learning_rate": 2.068350128193507e-05, + "loss": 0.9009, + "step": 7659 + }, + { + "epoch": 3.472348141432457, + "grad_norm": 0.1854946285399625, + "learning_rate": 2.067241735314852e-05, + "loss": 0.858, + "step": 7660 + }, + { + "epoch": 3.472801450589302, + "grad_norm": 0.18562136128592752, + "learning_rate": 2.0661335360076197e-05, + "loss": 0.8569, + "step": 7661 + }, + { + "epoch": 3.4732547597461467, + "grad_norm": 0.26784831890054833, + "learning_rate": 2.065025530382794e-05, + "loss": 0.8559, + "step": 7662 + }, + { + "epoch": 3.473708068902992, + "grad_norm": 0.21387068974887402, + "learning_rate": 2.06391771855135e-05, + "loss": 0.8795, + "step": 7663 + }, + { + "epoch": 3.474161378059837, + "grad_norm": 0.17802186974381054, + "learning_rate": 2.062810100624235e-05, + "loss": 0.8714, + "step": 7664 + }, + { + "epoch": 3.4746146872166817, + "grad_norm": 0.17966408336121362, + "learning_rate": 2.0617026767123832e-05, + "loss": 0.8702, + "step": 7665 + }, + { + "epoch": 3.475067996373527, + "grad_norm": 0.23216036070697377, + "learning_rate": 2.0605954469267047e-05, + "loss": 0.8469, + "step": 7666 + }, + { + "epoch": 3.4755213055303718, + "grad_norm": 0.38280835367036026, + "learning_rate": 2.0594884113780907e-05, + "loss": 0.8623, + "step": 7667 + }, + { + "epoch": 3.4759746146872166, + "grad_norm": 0.18155779845887676, + "learning_rate": 2.0583815701774168e-05, + "loss": 0.8778, + "step": 7668 + }, + { + "epoch": 3.476427923844062, + "grad_norm": 0.22701807279382943, + "learning_rate": 2.0572749234355335e-05, + "loss": 0.8521, + "step": 7669 + }, + { + "epoch": 3.4768812330009067, + "grad_norm": 0.21094558420661513, + "learning_rate": 2.056168471263278e-05, + "loss": 0.8846, + "step": 7670 + }, + { + "epoch": 3.4773345421577515, + "grad_norm": 0.2699038305627486, + "learning_rate": 2.0550622137714607e-05, + "loss": 0.8836, + "step": 7671 + }, + { + "epoch": 3.477787851314597, + "grad_norm": 0.21864118916725048, + "learning_rate": 2.0539561510708808e-05, + "loss": 0.8774, + "step": 7672 + }, + { + "epoch": 3.4782411604714416, + "grad_norm": 0.20487768401295758, + "learning_rate": 2.0528502832723102e-05, + "loss": 0.8578, + "step": 7673 + }, + { + "epoch": 3.4786944696282864, + "grad_norm": 0.2220692361889258, + "learning_rate": 2.051744610486507e-05, + "loss": 0.8553, + "step": 7674 + }, + { + "epoch": 3.4791477787851317, + "grad_norm": 0.18708166319192882, + "learning_rate": 2.0506391328242065e-05, + "loss": 0.8568, + "step": 7675 + }, + { + "epoch": 3.4796010879419765, + "grad_norm": 0.192717898214449, + "learning_rate": 2.0495338503961267e-05, + "loss": 0.8513, + "step": 7676 + }, + { + "epoch": 3.4800543970988214, + "grad_norm": 0.2165819042160191, + "learning_rate": 2.0484287633129644e-05, + "loss": 0.8615, + "step": 7677 + }, + { + "epoch": 3.480507706255666, + "grad_norm": 0.1930451844297358, + "learning_rate": 2.0473238716853954e-05, + "loss": 0.8703, + "step": 7678 + }, + { + "epoch": 3.4809610154125115, + "grad_norm": 0.2021967165232774, + "learning_rate": 2.046219175624081e-05, + "loss": 0.8761, + "step": 7679 + }, + { + "epoch": 3.4814143245693563, + "grad_norm": 0.24771853711782207, + "learning_rate": 2.045114675239658e-05, + "loss": 0.881, + "step": 7680 + }, + { + "epoch": 3.481867633726201, + "grad_norm": 0.1840040315665036, + "learning_rate": 2.0440103706427442e-05, + "loss": 0.858, + "step": 7681 + }, + { + "epoch": 3.4823209428830464, + "grad_norm": 0.22689109742196983, + "learning_rate": 2.0429062619439423e-05, + "loss": 0.8701, + "step": 7682 + }, + { + "epoch": 3.482774252039891, + "grad_norm": 0.2605702933835505, + "learning_rate": 2.0418023492538298e-05, + "loss": 0.8688, + "step": 7683 + }, + { + "epoch": 3.483227561196736, + "grad_norm": 0.18542049802935504, + "learning_rate": 2.0406986326829653e-05, + "loss": 0.8699, + "step": 7684 + }, + { + "epoch": 3.4836808703535813, + "grad_norm": 0.24536845947311223, + "learning_rate": 2.0395951123418926e-05, + "loss": 0.8903, + "step": 7685 + }, + { + "epoch": 3.484134179510426, + "grad_norm": 0.20794050567928019, + "learning_rate": 2.0384917883411296e-05, + "loss": 0.8611, + "step": 7686 + }, + { + "epoch": 3.484587488667271, + "grad_norm": 0.22884884680131679, + "learning_rate": 2.0373886607911802e-05, + "loss": 0.8405, + "step": 7687 + }, + { + "epoch": 3.485040797824116, + "grad_norm": 0.18729481778663487, + "learning_rate": 2.036285729802523e-05, + "loss": 0.8554, + "step": 7688 + }, + { + "epoch": 3.485494106980961, + "grad_norm": 0.17315122122982485, + "learning_rate": 2.035182995485622e-05, + "loss": 0.8679, + "step": 7689 + }, + { + "epoch": 3.485947416137806, + "grad_norm": 0.18745776032047848, + "learning_rate": 2.0340804579509176e-05, + "loss": 0.8764, + "step": 7690 + }, + { + "epoch": 3.4864007252946507, + "grad_norm": 0.18333588994348235, + "learning_rate": 2.0329781173088348e-05, + "loss": 0.8567, + "step": 7691 + }, + { + "epoch": 3.486854034451496, + "grad_norm": 0.2088465090586286, + "learning_rate": 2.031875973669774e-05, + "loss": 0.8708, + "step": 7692 + }, + { + "epoch": 3.487307343608341, + "grad_norm": 0.19346630841914003, + "learning_rate": 2.0307740271441176e-05, + "loss": 0.862, + "step": 7693 + }, + { + "epoch": 3.4877606527651857, + "grad_norm": 0.20976736667634938, + "learning_rate": 2.0296722778422308e-05, + "loss": 0.8708, + "step": 7694 + }, + { + "epoch": 3.488213961922031, + "grad_norm": 0.1760987066982333, + "learning_rate": 2.028570725874455e-05, + "loss": 0.8668, + "step": 7695 + }, + { + "epoch": 3.4886672710788758, + "grad_norm": 0.22921949500158015, + "learning_rate": 2.027469371351116e-05, + "loss": 0.8881, + "step": 7696 + }, + { + "epoch": 3.4891205802357206, + "grad_norm": 0.20885399961301437, + "learning_rate": 2.0263682143825162e-05, + "loss": 0.8791, + "step": 7697 + }, + { + "epoch": 3.489573889392566, + "grad_norm": 0.14494078852377418, + "learning_rate": 2.0252672550789408e-05, + "loss": 0.8526, + "step": 7698 + }, + { + "epoch": 3.4900271985494107, + "grad_norm": 0.23477963061950505, + "learning_rate": 2.0241664935506533e-05, + "loss": 0.8725, + "step": 7699 + }, + { + "epoch": 3.4904805077062555, + "grad_norm": 0.16810096607247907, + "learning_rate": 2.0230659299078982e-05, + "loss": 0.8506, + "step": 7700 + }, + { + "epoch": 3.490933816863101, + "grad_norm": 0.21074136034045, + "learning_rate": 2.021965564260899e-05, + "loss": 0.8936, + "step": 7701 + }, + { + "epoch": 3.4913871260199456, + "grad_norm": 0.24925124724234132, + "learning_rate": 2.0208653967198638e-05, + "loss": 0.8806, + "step": 7702 + }, + { + "epoch": 3.4918404351767904, + "grad_norm": 0.15585848117563003, + "learning_rate": 2.0197654273949743e-05, + "loss": 0.8579, + "step": 7703 + }, + { + "epoch": 3.4922937443336357, + "grad_norm": 0.2196325120263003, + "learning_rate": 2.0186656563963983e-05, + "loss": 0.8706, + "step": 7704 + }, + { + "epoch": 3.4927470534904805, + "grad_norm": 0.2150146999232804, + "learning_rate": 2.017566083834278e-05, + "loss": 0.8538, + "step": 7705 + }, + { + "epoch": 3.4932003626473254, + "grad_norm": 0.2425815215138058, + "learning_rate": 2.016466709818742e-05, + "loss": 0.8723, + "step": 7706 + }, + { + "epoch": 3.4936536718041706, + "grad_norm": 0.2209610104378104, + "learning_rate": 2.0153675344598937e-05, + "loss": 0.864, + "step": 7707 + }, + { + "epoch": 3.4941069809610155, + "grad_norm": 0.20821776001259257, + "learning_rate": 2.014268557867821e-05, + "loss": 0.8702, + "step": 7708 + }, + { + "epoch": 3.4945602901178603, + "grad_norm": 0.21958358926373048, + "learning_rate": 2.0131697801525876e-05, + "loss": 0.86, + "step": 7709 + }, + { + "epoch": 3.4950135992747056, + "grad_norm": 0.2564736179633967, + "learning_rate": 2.012071201424239e-05, + "loss": 0.8591, + "step": 7710 + }, + { + "epoch": 3.4954669084315504, + "grad_norm": 0.16891844624872382, + "learning_rate": 2.010972821792803e-05, + "loss": 0.8845, + "step": 7711 + }, + { + "epoch": 3.495920217588395, + "grad_norm": 0.21795103756454884, + "learning_rate": 2.009874641368283e-05, + "loss": 0.8747, + "step": 7712 + }, + { + "epoch": 3.4963735267452405, + "grad_norm": 0.18699541039491047, + "learning_rate": 2.0087766602606687e-05, + "loss": 0.8419, + "step": 7713 + }, + { + "epoch": 3.4968268359020853, + "grad_norm": 0.23282387915399091, + "learning_rate": 2.0076788785799222e-05, + "loss": 0.8637, + "step": 7714 + }, + { + "epoch": 3.49728014505893, + "grad_norm": 0.18050765384205564, + "learning_rate": 2.0065812964359926e-05, + "loss": 0.8445, + "step": 7715 + }, + { + "epoch": 3.497733454215775, + "grad_norm": 0.20214551104405679, + "learning_rate": 2.005483913938803e-05, + "loss": 0.8672, + "step": 7716 + }, + { + "epoch": 3.4981867633726202, + "grad_norm": 0.176653190160293, + "learning_rate": 2.0043867311982634e-05, + "loss": 0.8562, + "step": 7717 + }, + { + "epoch": 3.498640072529465, + "grad_norm": 0.22290984991319626, + "learning_rate": 2.003289748324257e-05, + "loss": 0.8575, + "step": 7718 + }, + { + "epoch": 3.49909338168631, + "grad_norm": 0.19670027143795368, + "learning_rate": 2.002192965426651e-05, + "loss": 0.8541, + "step": 7719 + }, + { + "epoch": 3.499546690843155, + "grad_norm": 0.19097007320263673, + "learning_rate": 2.0010963826152895e-05, + "loss": 0.8543, + "step": 7720 + }, + { + "epoch": 3.5, + "grad_norm": 0.21960957409842669, + "learning_rate": 2.0000000000000012e-05, + "loss": 0.8742, + "step": 7721 + }, + { + "epoch": 3.500453309156845, + "grad_norm": 0.21140242001560597, + "learning_rate": 1.998903817690589e-05, + "loss": 0.8728, + "step": 7722 + }, + { + "epoch": 3.50090661831369, + "grad_norm": 0.21439050630822257, + "learning_rate": 1.9978078357968423e-05, + "loss": 0.8716, + "step": 7723 + }, + { + "epoch": 3.501359927470535, + "grad_norm": 0.19503482318541962, + "learning_rate": 1.9967120544285254e-05, + "loss": 0.883, + "step": 7724 + }, + { + "epoch": 3.5018132366273798, + "grad_norm": 0.1949150999295767, + "learning_rate": 1.995616473695382e-05, + "loss": 0.8484, + "step": 7725 + }, + { + "epoch": 3.5022665457842246, + "grad_norm": 0.18321078717462555, + "learning_rate": 1.9945210937071415e-05, + "loss": 0.8659, + "step": 7726 + }, + { + "epoch": 3.50271985494107, + "grad_norm": 0.17364557248504836, + "learning_rate": 1.9934259145735052e-05, + "loss": 0.874, + "step": 7727 + }, + { + "epoch": 3.5031731640979147, + "grad_norm": 0.2225927780176339, + "learning_rate": 1.9923309364041633e-05, + "loss": 0.8727, + "step": 7728 + }, + { + "epoch": 3.5036264732547595, + "grad_norm": 0.17800535094646863, + "learning_rate": 1.9912361593087763e-05, + "loss": 0.8656, + "step": 7729 + }, + { + "epoch": 3.504079782411605, + "grad_norm": 0.18236294905558117, + "learning_rate": 1.990141583396993e-05, + "loss": 0.8852, + "step": 7730 + }, + { + "epoch": 3.5045330915684496, + "grad_norm": 0.21256820983924807, + "learning_rate": 1.989047208778436e-05, + "loss": 0.862, + "step": 7731 + }, + { + "epoch": 3.5049864007252944, + "grad_norm": 0.17020795096984412, + "learning_rate": 1.9879530355627122e-05, + "loss": 0.8498, + "step": 7732 + }, + { + "epoch": 3.5054397098821397, + "grad_norm": 0.2338800217885927, + "learning_rate": 1.986859063859404e-05, + "loss": 0.86, + "step": 7733 + }, + { + "epoch": 3.5058930190389845, + "grad_norm": 0.2001605152223514, + "learning_rate": 1.985765293778078e-05, + "loss": 0.8701, + "step": 7734 + }, + { + "epoch": 3.5063463281958294, + "grad_norm": 0.24860532279867714, + "learning_rate": 1.9846717254282785e-05, + "loss": 0.8656, + "step": 7735 + }, + { + "epoch": 3.5067996373526746, + "grad_norm": 0.23440373042689505, + "learning_rate": 1.9835783589195262e-05, + "loss": 0.881, + "step": 7736 + }, + { + "epoch": 3.5072529465095195, + "grad_norm": 0.19581435730398555, + "learning_rate": 1.9824851943613294e-05, + "loss": 0.8462, + "step": 7737 + }, + { + "epoch": 3.5077062556663643, + "grad_norm": 0.2503116310607518, + "learning_rate": 1.9813922318631698e-05, + "loss": 0.8697, + "step": 7738 + }, + { + "epoch": 3.5081595648232096, + "grad_norm": 0.2429389194128982, + "learning_rate": 1.9802994715345108e-05, + "loss": 0.8551, + "step": 7739 + }, + { + "epoch": 3.5086128739800544, + "grad_norm": 0.22923552197329647, + "learning_rate": 1.9792069134847938e-05, + "loss": 0.8735, + "step": 7740 + }, + { + "epoch": 3.509066183136899, + "grad_norm": 0.18657073568484103, + "learning_rate": 1.978114557823445e-05, + "loss": 0.8813, + "step": 7741 + }, + { + "epoch": 3.5095194922937445, + "grad_norm": 0.2354399904353071, + "learning_rate": 1.9770224046598638e-05, + "loss": 0.8775, + "step": 7742 + }, + { + "epoch": 3.5099728014505893, + "grad_norm": 0.18761494298209472, + "learning_rate": 1.975930454103436e-05, + "loss": 0.8769, + "step": 7743 + }, + { + "epoch": 3.510426110607434, + "grad_norm": 0.21846635053425428, + "learning_rate": 1.97483870626352e-05, + "loss": 0.8622, + "step": 7744 + }, + { + "epoch": 3.5108794197642794, + "grad_norm": 0.15931053811486845, + "learning_rate": 1.973747161249461e-05, + "loss": 0.8271, + "step": 7745 + }, + { + "epoch": 3.5113327289211242, + "grad_norm": 0.19729073970988853, + "learning_rate": 1.9726558191705772e-05, + "loss": 0.8483, + "step": 7746 + }, + { + "epoch": 3.511786038077969, + "grad_norm": 0.18917776133783226, + "learning_rate": 1.9715646801361738e-05, + "loss": 0.8538, + "step": 7747 + }, + { + "epoch": 3.5122393472348143, + "grad_norm": 0.15889289177894, + "learning_rate": 1.9704737442555263e-05, + "loss": 0.8619, + "step": 7748 + }, + { + "epoch": 3.512692656391659, + "grad_norm": 0.21388851187024527, + "learning_rate": 1.9693830116379003e-05, + "loss": 0.8603, + "step": 7749 + }, + { + "epoch": 3.513145965548504, + "grad_norm": 0.16387178744656675, + "learning_rate": 1.9682924823925336e-05, + "loss": 0.8588, + "step": 7750 + }, + { + "epoch": 3.5135992747053493, + "grad_norm": 0.18829656346998083, + "learning_rate": 1.967202156628644e-05, + "loss": 0.8934, + "step": 7751 + }, + { + "epoch": 3.514052583862194, + "grad_norm": 0.23661176249109672, + "learning_rate": 1.9661120344554346e-05, + "loss": 0.884, + "step": 7752 + }, + { + "epoch": 3.514505893019039, + "grad_norm": 0.26198910470950876, + "learning_rate": 1.9650221159820803e-05, + "loss": 0.8664, + "step": 7753 + }, + { + "epoch": 3.514959202175884, + "grad_norm": 0.21834162409988156, + "learning_rate": 1.963932401317744e-05, + "loss": 0.8725, + "step": 7754 + }, + { + "epoch": 3.515412511332729, + "grad_norm": 0.20946525797352314, + "learning_rate": 1.9628428905715596e-05, + "loss": 0.9051, + "step": 7755 + }, + { + "epoch": 3.515865820489574, + "grad_norm": 0.22812769353458962, + "learning_rate": 1.9617535838526486e-05, + "loss": 0.8778, + "step": 7756 + }, + { + "epoch": 3.516319129646419, + "grad_norm": 0.2136733505907888, + "learning_rate": 1.9606644812701062e-05, + "loss": 0.8686, + "step": 7757 + }, + { + "epoch": 3.516772438803264, + "grad_norm": 0.1963597371202185, + "learning_rate": 1.9595755829330097e-05, + "loss": 0.8727, + "step": 7758 + }, + { + "epoch": 3.517225747960109, + "grad_norm": 0.22306381378505907, + "learning_rate": 1.9584868889504133e-05, + "loss": 0.8789, + "step": 7759 + }, + { + "epoch": 3.5176790571169536, + "grad_norm": 0.1895896167459454, + "learning_rate": 1.957398399431357e-05, + "loss": 0.869, + "step": 7760 + }, + { + "epoch": 3.518132366273799, + "grad_norm": 0.1864463103063664, + "learning_rate": 1.956310114484852e-05, + "loss": 0.8924, + "step": 7761 + }, + { + "epoch": 3.5185856754306437, + "grad_norm": 0.16513617669702488, + "learning_rate": 1.9552220342198973e-05, + "loss": 0.8665, + "step": 7762 + }, + { + "epoch": 3.5190389845874885, + "grad_norm": 0.1702654442668788, + "learning_rate": 1.954134158745464e-05, + "loss": 0.8697, + "step": 7763 + }, + { + "epoch": 3.5194922937443334, + "grad_norm": 0.15894840772752147, + "learning_rate": 1.9530464881705083e-05, + "loss": 0.8597, + "step": 7764 + }, + { + "epoch": 3.5199456029011786, + "grad_norm": 0.17367993960643144, + "learning_rate": 1.9519590226039634e-05, + "loss": 0.8501, + "step": 7765 + }, + { + "epoch": 3.5203989120580235, + "grad_norm": 0.15952197555901607, + "learning_rate": 1.950871762154739e-05, + "loss": 0.8293, + "step": 7766 + }, + { + "epoch": 3.5208522212148683, + "grad_norm": 0.1799164603549324, + "learning_rate": 1.949784706931732e-05, + "loss": 0.8792, + "step": 7767 + }, + { + "epoch": 3.5213055303717136, + "grad_norm": 0.18546211328202575, + "learning_rate": 1.9486978570438107e-05, + "loss": 0.8614, + "step": 7768 + }, + { + "epoch": 3.5217588395285584, + "grad_norm": 0.15791583179027474, + "learning_rate": 1.9476112125998293e-05, + "loss": 0.8651, + "step": 7769 + }, + { + "epoch": 3.522212148685403, + "grad_norm": 0.22120419970192226, + "learning_rate": 1.946524773708615e-05, + "loss": 0.8677, + "step": 7770 + }, + { + "epoch": 3.5226654578422485, + "grad_norm": 0.18386555353664574, + "learning_rate": 1.9454385404789812e-05, + "loss": 0.8644, + "step": 7771 + }, + { + "epoch": 3.5231187669990933, + "grad_norm": 0.21248392333301366, + "learning_rate": 1.9443525130197142e-05, + "loss": 0.8461, + "step": 7772 + }, + { + "epoch": 3.523572076155938, + "grad_norm": 0.2122315606132163, + "learning_rate": 1.9432666914395863e-05, + "loss": 0.8751, + "step": 7773 + }, + { + "epoch": 3.5240253853127834, + "grad_norm": 0.20461617219544095, + "learning_rate": 1.942181075847342e-05, + "loss": 0.8891, + "step": 7774 + }, + { + "epoch": 3.5244786944696282, + "grad_norm": 0.20993829900276234, + "learning_rate": 1.9410956663517124e-05, + "loss": 0.8798, + "step": 7775 + }, + { + "epoch": 3.524932003626473, + "grad_norm": 0.2603708095551194, + "learning_rate": 1.940010463061403e-05, + "loss": 0.8747, + "step": 7776 + }, + { + "epoch": 3.5253853127833183, + "grad_norm": 0.24039665654533532, + "learning_rate": 1.9389254660850997e-05, + "loss": 0.8642, + "step": 7777 + }, + { + "epoch": 3.525838621940163, + "grad_norm": 0.19316737469148182, + "learning_rate": 1.937840675531467e-05, + "loss": 0.8614, + "step": 7778 + }, + { + "epoch": 3.526291931097008, + "grad_norm": 0.32295171770826, + "learning_rate": 1.9367560915091524e-05, + "loss": 0.8516, + "step": 7779 + }, + { + "epoch": 3.5267452402538533, + "grad_norm": 0.28417845837292727, + "learning_rate": 1.9356717141267793e-05, + "loss": 0.8522, + "step": 7780 + }, + { + "epoch": 3.527198549410698, + "grad_norm": 0.21214861824934142, + "learning_rate": 1.9345875434929496e-05, + "loss": 0.8844, + "step": 7781 + }, + { + "epoch": 3.527651858567543, + "grad_norm": 0.2871870392284906, + "learning_rate": 1.933503579716249e-05, + "loss": 0.8783, + "step": 7782 + }, + { + "epoch": 3.528105167724388, + "grad_norm": 0.2442518112237541, + "learning_rate": 1.9324198229052364e-05, + "loss": 0.8579, + "step": 7783 + }, + { + "epoch": 3.528558476881233, + "grad_norm": 0.2085532346847097, + "learning_rate": 1.931336273168457e-05, + "loss": 0.8545, + "step": 7784 + }, + { + "epoch": 3.529011786038078, + "grad_norm": 0.21449173476164193, + "learning_rate": 1.9302529306144273e-05, + "loss": 0.8676, + "step": 7785 + }, + { + "epoch": 3.529465095194923, + "grad_norm": 0.19475011283946755, + "learning_rate": 1.9291697953516517e-05, + "loss": 0.8611, + "step": 7786 + }, + { + "epoch": 3.529918404351768, + "grad_norm": 0.18013682324923835, + "learning_rate": 1.9280868674886057e-05, + "loss": 0.8733, + "step": 7787 + }, + { + "epoch": 3.530371713508613, + "grad_norm": 0.17125946926399013, + "learning_rate": 1.92700414713375e-05, + "loss": 0.864, + "step": 7788 + }, + { + "epoch": 3.530825022665458, + "grad_norm": 0.18304420837060537, + "learning_rate": 1.9259216343955205e-05, + "loss": 0.8758, + "step": 7789 + }, + { + "epoch": 3.531278331822303, + "grad_norm": 0.21072244871135523, + "learning_rate": 1.9248393293823357e-05, + "loss": 0.8623, + "step": 7790 + }, + { + "epoch": 3.5317316409791477, + "grad_norm": 0.22718102255236955, + "learning_rate": 1.9237572322025918e-05, + "loss": 0.8582, + "step": 7791 + }, + { + "epoch": 3.532184950135993, + "grad_norm": 0.17878851365887405, + "learning_rate": 1.922675342964661e-05, + "loss": 0.8656, + "step": 7792 + }, + { + "epoch": 3.532638259292838, + "grad_norm": 0.24287873874625307, + "learning_rate": 1.921593661776901e-05, + "loss": 0.8487, + "step": 7793 + }, + { + "epoch": 3.5330915684496826, + "grad_norm": 0.20421790605077098, + "learning_rate": 1.920512188747643e-05, + "loss": 0.862, + "step": 7794 + }, + { + "epoch": 3.533544877606528, + "grad_norm": 0.2232240114051508, + "learning_rate": 1.9194309239852038e-05, + "loss": 0.8603, + "step": 7795 + }, + { + "epoch": 3.5339981867633727, + "grad_norm": 0.21257523447200524, + "learning_rate": 1.9183498675978694e-05, + "loss": 0.8608, + "step": 7796 + }, + { + "epoch": 3.5344514959202176, + "grad_norm": 0.180895270510944, + "learning_rate": 1.9172690196939148e-05, + "loss": 0.8671, + "step": 7797 + }, + { + "epoch": 3.5349048050770624, + "grad_norm": 0.196662527630458, + "learning_rate": 1.9161883803815873e-05, + "loss": 0.8562, + "step": 7798 + }, + { + "epoch": 3.5353581142339077, + "grad_norm": 0.18317690097057104, + "learning_rate": 1.9151079497691192e-05, + "loss": 0.8573, + "step": 7799 + }, + { + "epoch": 3.5358114233907525, + "grad_norm": 0.22032055388759297, + "learning_rate": 1.9140277279647163e-05, + "loss": 0.8862, + "step": 7800 + }, + { + "epoch": 3.5362647325475973, + "grad_norm": 0.19980912907655526, + "learning_rate": 1.9129477150765682e-05, + "loss": 0.8666, + "step": 7801 + }, + { + "epoch": 3.5367180417044426, + "grad_norm": 0.1772357451724393, + "learning_rate": 1.9118679112128388e-05, + "loss": 0.8553, + "step": 7802 + }, + { + "epoch": 3.5371713508612874, + "grad_norm": 0.18527339126713013, + "learning_rate": 1.9107883164816762e-05, + "loss": 0.858, + "step": 7803 + }, + { + "epoch": 3.5376246600181322, + "grad_norm": 0.18742354929519614, + "learning_rate": 1.9097089309912026e-05, + "loss": 0.8653, + "step": 7804 + }, + { + "epoch": 3.538077969174977, + "grad_norm": 0.16614957055171103, + "learning_rate": 1.9086297548495242e-05, + "loss": 0.8498, + "step": 7805 + }, + { + "epoch": 3.5385312783318223, + "grad_norm": 0.16491015798900233, + "learning_rate": 1.9075507881647223e-05, + "loss": 0.8557, + "step": 7806 + }, + { + "epoch": 3.538984587488667, + "grad_norm": 0.17184914463058032, + "learning_rate": 1.906472031044857e-05, + "loss": 0.8632, + "step": 7807 + }, + { + "epoch": 3.539437896645512, + "grad_norm": 0.1913621709609223, + "learning_rate": 1.9053934835979723e-05, + "loss": 0.8908, + "step": 7808 + }, + { + "epoch": 3.5398912058023573, + "grad_norm": 0.1793351045588777, + "learning_rate": 1.9043151459320838e-05, + "loss": 0.8668, + "step": 7809 + }, + { + "epoch": 3.540344514959202, + "grad_norm": 0.20503664593257212, + "learning_rate": 1.903237018155195e-05, + "loss": 0.846, + "step": 7810 + }, + { + "epoch": 3.540797824116047, + "grad_norm": 0.18397241359524244, + "learning_rate": 1.902159100375279e-05, + "loss": 0.8548, + "step": 7811 + }, + { + "epoch": 3.541251133272892, + "grad_norm": 0.35481857699691377, + "learning_rate": 1.9010813927002964e-05, + "loss": 0.8619, + "step": 7812 + }, + { + "epoch": 3.541704442429737, + "grad_norm": 0.23277592665610947, + "learning_rate": 1.9000038952381798e-05, + "loss": 0.8829, + "step": 7813 + }, + { + "epoch": 3.542157751586582, + "grad_norm": 0.22595792699305425, + "learning_rate": 1.898926608096847e-05, + "loss": 0.8691, + "step": 7814 + }, + { + "epoch": 3.542611060743427, + "grad_norm": 0.19235288126998773, + "learning_rate": 1.897849531384187e-05, + "loss": 0.8519, + "step": 7815 + }, + { + "epoch": 3.543064369900272, + "grad_norm": 0.22056360746524736, + "learning_rate": 1.8967726652080758e-05, + "loss": 0.8649, + "step": 7816 + }, + { + "epoch": 3.543517679057117, + "grad_norm": 0.20352119434995755, + "learning_rate": 1.8956960096763616e-05, + "loss": 0.8607, + "step": 7817 + }, + { + "epoch": 3.543970988213962, + "grad_norm": 0.1813285344750851, + "learning_rate": 1.8946195648968785e-05, + "loss": 0.8494, + "step": 7818 + }, + { + "epoch": 3.544424297370807, + "grad_norm": 0.1881611284109176, + "learning_rate": 1.8935433309774316e-05, + "loss": 0.861, + "step": 7819 + }, + { + "epoch": 3.5448776065276517, + "grad_norm": 0.1710087740827164, + "learning_rate": 1.8924673080258128e-05, + "loss": 0.8588, + "step": 7820 + }, + { + "epoch": 3.545330915684497, + "grad_norm": 0.204670984127969, + "learning_rate": 1.8913914961497866e-05, + "loss": 0.8671, + "step": 7821 + }, + { + "epoch": 3.545784224841342, + "grad_norm": 0.16775259044427956, + "learning_rate": 1.890315895457098e-05, + "loss": 0.8739, + "step": 7822 + }, + { + "epoch": 3.5462375339981866, + "grad_norm": 0.19719445384059173, + "learning_rate": 1.889240506055474e-05, + "loss": 0.8813, + "step": 7823 + }, + { + "epoch": 3.546690843155032, + "grad_norm": 0.20016397556569282, + "learning_rate": 1.888165328052616e-05, + "loss": 0.8756, + "step": 7824 + }, + { + "epoch": 3.5471441523118767, + "grad_norm": 0.2186886041600053, + "learning_rate": 1.8870903615562076e-05, + "loss": 0.8708, + "step": 7825 + }, + { + "epoch": 3.5475974614687216, + "grad_norm": 0.13857443903065478, + "learning_rate": 1.8860156066739077e-05, + "loss": 0.8573, + "step": 7826 + }, + { + "epoch": 3.548050770625567, + "grad_norm": 0.22427527495128696, + "learning_rate": 1.8849410635133595e-05, + "loss": 0.8684, + "step": 7827 + }, + { + "epoch": 3.5485040797824117, + "grad_norm": 0.19900729309861628, + "learning_rate": 1.8838667321821786e-05, + "loss": 0.8919, + "step": 7828 + }, + { + "epoch": 3.5489573889392565, + "grad_norm": 0.1817579074357655, + "learning_rate": 1.8827926127879645e-05, + "loss": 0.8653, + "step": 7829 + }, + { + "epoch": 3.5494106980961018, + "grad_norm": 0.21279979844047647, + "learning_rate": 1.881718705438291e-05, + "loss": 0.8738, + "step": 7830 + }, + { + "epoch": 3.5498640072529466, + "grad_norm": 0.17565335582874478, + "learning_rate": 1.8806450102407168e-05, + "loss": 0.8516, + "step": 7831 + }, + { + "epoch": 3.5503173164097914, + "grad_norm": 0.1948419575864782, + "learning_rate": 1.8795715273027736e-05, + "loss": 0.8778, + "step": 7832 + }, + { + "epoch": 3.5507706255666367, + "grad_norm": 0.1718527906798051, + "learning_rate": 1.8784982567319718e-05, + "loss": 0.8706, + "step": 7833 + }, + { + "epoch": 3.5512239347234815, + "grad_norm": 0.1967726756045344, + "learning_rate": 1.877425198635806e-05, + "loss": 0.8546, + "step": 7834 + }, + { + "epoch": 3.5516772438803264, + "grad_norm": 0.16891512547988854, + "learning_rate": 1.8763523531217452e-05, + "loss": 0.8531, + "step": 7835 + }, + { + "epoch": 3.5521305530371716, + "grad_norm": 0.1978340800109562, + "learning_rate": 1.875279720297237e-05, + "loss": 0.8725, + "step": 7836 + }, + { + "epoch": 3.5525838621940165, + "grad_norm": 0.2260739469549671, + "learning_rate": 1.8742073002697076e-05, + "loss": 0.8836, + "step": 7837 + }, + { + "epoch": 3.5530371713508613, + "grad_norm": 0.15530606596903093, + "learning_rate": 1.8731350931465667e-05, + "loss": 0.8684, + "step": 7838 + }, + { + "epoch": 3.553490480507706, + "grad_norm": 0.2315872997034847, + "learning_rate": 1.8720630990351956e-05, + "loss": 0.8823, + "step": 7839 + }, + { + "epoch": 3.5539437896645514, + "grad_norm": 0.16882545257778944, + "learning_rate": 1.8709913180429596e-05, + "loss": 0.8713, + "step": 7840 + }, + { + "epoch": 3.554397098821396, + "grad_norm": 0.21738008063922284, + "learning_rate": 1.8699197502771995e-05, + "loss": 0.8875, + "step": 7841 + }, + { + "epoch": 3.554850407978241, + "grad_norm": 0.2502658402044779, + "learning_rate": 1.868848395845238e-05, + "loss": 0.8559, + "step": 7842 + }, + { + "epoch": 3.555303717135086, + "grad_norm": 0.15354621655936504, + "learning_rate": 1.8677772548543713e-05, + "loss": 0.886, + "step": 7843 + }, + { + "epoch": 3.555757026291931, + "grad_norm": 0.21497518689203812, + "learning_rate": 1.8667063274118806e-05, + "loss": 0.8646, + "step": 7844 + }, + { + "epoch": 3.556210335448776, + "grad_norm": 0.1900848494436193, + "learning_rate": 1.8656356136250192e-05, + "loss": 0.8697, + "step": 7845 + }, + { + "epoch": 3.556663644605621, + "grad_norm": 0.1625172078751201, + "learning_rate": 1.8645651136010255e-05, + "loss": 0.8651, + "step": 7846 + }, + { + "epoch": 3.557116953762466, + "grad_norm": 0.2237986096859722, + "learning_rate": 1.8634948274471118e-05, + "loss": 0.8848, + "step": 7847 + }, + { + "epoch": 3.557570262919311, + "grad_norm": 0.19351361642288545, + "learning_rate": 1.862424755270468e-05, + "loss": 0.8507, + "step": 7848 + }, + { + "epoch": 3.5580235720761557, + "grad_norm": 0.19082189282626652, + "learning_rate": 1.861354897178269e-05, + "loss": 0.8622, + "step": 7849 + }, + { + "epoch": 3.558476881233001, + "grad_norm": 0.21961190517318144, + "learning_rate": 1.8602852532776615e-05, + "loss": 0.8639, + "step": 7850 + }, + { + "epoch": 3.558930190389846, + "grad_norm": 0.1626989413174897, + "learning_rate": 1.859215823675775e-05, + "loss": 0.8654, + "step": 7851 + }, + { + "epoch": 3.5593834995466906, + "grad_norm": 0.20011460701008651, + "learning_rate": 1.8581466084797148e-05, + "loss": 0.871, + "step": 7852 + }, + { + "epoch": 3.559836808703536, + "grad_norm": 0.15676117689917068, + "learning_rate": 1.8570776077965693e-05, + "loss": 0.8741, + "step": 7853 + }, + { + "epoch": 3.5602901178603807, + "grad_norm": 0.18918601847364747, + "learning_rate": 1.856008821733396e-05, + "loss": 0.8801, + "step": 7854 + }, + { + "epoch": 3.5607434270172256, + "grad_norm": 0.19612350711599222, + "learning_rate": 1.8549402503972418e-05, + "loss": 0.8844, + "step": 7855 + }, + { + "epoch": 3.561196736174071, + "grad_norm": 0.19433391032674002, + "learning_rate": 1.853871893895124e-05, + "loss": 0.8726, + "step": 7856 + }, + { + "epoch": 3.5616500453309157, + "grad_norm": 0.17501047354988672, + "learning_rate": 1.8528037523340452e-05, + "loss": 0.8691, + "step": 7857 + }, + { + "epoch": 3.5621033544877605, + "grad_norm": 0.1904622005124782, + "learning_rate": 1.85173582582098e-05, + "loss": 0.867, + "step": 7858 + }, + { + "epoch": 3.5625566636446058, + "grad_norm": 0.19484877540049433, + "learning_rate": 1.850668114462886e-05, + "loss": 0.885, + "step": 7859 + }, + { + "epoch": 3.5630099728014506, + "grad_norm": 0.15934863604075908, + "learning_rate": 1.8496006183666962e-05, + "loss": 0.8943, + "step": 7860 + }, + { + "epoch": 3.5634632819582954, + "grad_norm": 0.18651001862345548, + "learning_rate": 1.848533337639326e-05, + "loss": 0.8788, + "step": 7861 + }, + { + "epoch": 3.5639165911151407, + "grad_norm": 0.2144312323771961, + "learning_rate": 1.8474662723876654e-05, + "loss": 0.8703, + "step": 7862 + }, + { + "epoch": 3.5643699002719855, + "grad_norm": 0.183576947861269, + "learning_rate": 1.8463994227185815e-05, + "loss": 0.8764, + "step": 7863 + }, + { + "epoch": 3.5648232094288304, + "grad_norm": 0.18940004774130778, + "learning_rate": 1.845332788738927e-05, + "loss": 0.8823, + "step": 7864 + }, + { + "epoch": 3.5652765185856756, + "grad_norm": 0.24358915965285427, + "learning_rate": 1.844266370555524e-05, + "loss": 0.8596, + "step": 7865 + }, + { + "epoch": 3.5657298277425205, + "grad_norm": 0.26859786567847255, + "learning_rate": 1.843200168275181e-05, + "loss": 0.8721, + "step": 7866 + }, + { + "epoch": 3.5661831368993653, + "grad_norm": 0.1598314051607467, + "learning_rate": 1.8421341820046787e-05, + "loss": 0.8661, + "step": 7867 + }, + { + "epoch": 3.5666364460562106, + "grad_norm": 0.19320039553185558, + "learning_rate": 1.8410684118507817e-05, + "loss": 0.8773, + "step": 7868 + }, + { + "epoch": 3.5670897552130554, + "grad_norm": 0.2273823776408868, + "learning_rate": 1.840002857920226e-05, + "loss": 0.8743, + "step": 7869 + }, + { + "epoch": 3.5675430643699, + "grad_norm": 0.1740356395338686, + "learning_rate": 1.8389375203197345e-05, + "loss": 0.8793, + "step": 7870 + }, + { + "epoch": 3.5679963735267455, + "grad_norm": 0.16895235905920655, + "learning_rate": 1.8378723991559994e-05, + "loss": 0.8673, + "step": 7871 + }, + { + "epoch": 3.5684496826835903, + "grad_norm": 0.23787733174686743, + "learning_rate": 1.8368074945357013e-05, + "loss": 0.8811, + "step": 7872 + }, + { + "epoch": 3.568902991840435, + "grad_norm": 0.16201609939485964, + "learning_rate": 1.8357428065654874e-05, + "loss": 0.8695, + "step": 7873 + }, + { + "epoch": 3.5693563009972804, + "grad_norm": 0.1704870283174212, + "learning_rate": 1.834678335351993e-05, + "loss": 0.8654, + "step": 7874 + }, + { + "epoch": 3.5698096101541252, + "grad_norm": 0.1994890398511594, + "learning_rate": 1.8336140810018256e-05, + "loss": 0.8589, + "step": 7875 + }, + { + "epoch": 3.57026291931097, + "grad_norm": 0.17637753759789845, + "learning_rate": 1.8325500436215767e-05, + "loss": 0.8642, + "step": 7876 + }, + { + "epoch": 3.570716228467815, + "grad_norm": 0.15163223820018826, + "learning_rate": 1.8314862233178093e-05, + "loss": 0.8603, + "step": 7877 + }, + { + "epoch": 3.57116953762466, + "grad_norm": 0.18104926922954168, + "learning_rate": 1.830422620197071e-05, + "loss": 0.8823, + "step": 7878 + }, + { + "epoch": 3.571622846781505, + "grad_norm": 0.15971587493097156, + "learning_rate": 1.829359234365884e-05, + "loss": 0.855, + "step": 7879 + }, + { + "epoch": 3.57207615593835, + "grad_norm": 0.18282674603847768, + "learning_rate": 1.8282960659307476e-05, + "loss": 0.8563, + "step": 7880 + }, + { + "epoch": 3.572529465095195, + "grad_norm": 0.18817726587611433, + "learning_rate": 1.8272331149981437e-05, + "loss": 0.8597, + "step": 7881 + }, + { + "epoch": 3.57298277425204, + "grad_norm": 0.14523669984763996, + "learning_rate": 1.826170381674528e-05, + "loss": 0.8684, + "step": 7882 + }, + { + "epoch": 3.5734360834088847, + "grad_norm": 0.17609776345106565, + "learning_rate": 1.8251078660663383e-05, + "loss": 0.8743, + "step": 7883 + }, + { + "epoch": 3.5738893925657296, + "grad_norm": 0.18048080085025123, + "learning_rate": 1.824045568279987e-05, + "loss": 0.8744, + "step": 7884 + }, + { + "epoch": 3.574342701722575, + "grad_norm": 0.17266207206803902, + "learning_rate": 1.822983488421868e-05, + "loss": 0.8596, + "step": 7885 + }, + { + "epoch": 3.5747960108794197, + "grad_norm": 0.1426709225459361, + "learning_rate": 1.8219216265983494e-05, + "loss": 0.8674, + "step": 7886 + }, + { + "epoch": 3.5752493200362645, + "grad_norm": 0.18030543477947478, + "learning_rate": 1.8208599829157826e-05, + "loss": 0.8761, + "step": 7887 + }, + { + "epoch": 3.5757026291931098, + "grad_norm": 0.15742200264915723, + "learning_rate": 1.8197985574804927e-05, + "loss": 0.8697, + "step": 7888 + }, + { + "epoch": 3.5761559383499546, + "grad_norm": 0.16684499397492691, + "learning_rate": 1.8187373503987837e-05, + "loss": 0.8644, + "step": 7889 + }, + { + "epoch": 3.5766092475067994, + "grad_norm": 0.18158353307330655, + "learning_rate": 1.8176763617769406e-05, + "loss": 0.8597, + "step": 7890 + }, + { + "epoch": 3.5770625566636447, + "grad_norm": 0.1836262296386174, + "learning_rate": 1.8166155917212242e-05, + "loss": 0.8697, + "step": 7891 + }, + { + "epoch": 3.5775158658204895, + "grad_norm": 0.17441594585558243, + "learning_rate": 1.8155550403378712e-05, + "loss": 0.8466, + "step": 7892 + }, + { + "epoch": 3.5779691749773344, + "grad_norm": 0.20936430190464006, + "learning_rate": 1.8144947077331027e-05, + "loss": 0.8557, + "step": 7893 + }, + { + "epoch": 3.5784224841341796, + "grad_norm": 0.1476812345752682, + "learning_rate": 1.813434594013112e-05, + "loss": 0.8453, + "step": 7894 + }, + { + "epoch": 3.5788757932910245, + "grad_norm": 0.19119905703819437, + "learning_rate": 1.8123746992840714e-05, + "loss": 0.8771, + "step": 7895 + }, + { + "epoch": 3.5793291024478693, + "grad_norm": 0.16677136589486224, + "learning_rate": 1.8113150236521358e-05, + "loss": 0.8646, + "step": 7896 + }, + { + "epoch": 3.5797824116047146, + "grad_norm": 0.16404769025653995, + "learning_rate": 1.810255567223431e-05, + "loss": 0.8506, + "step": 7897 + }, + { + "epoch": 3.5802357207615594, + "grad_norm": 0.16754799816391774, + "learning_rate": 1.8091963301040688e-05, + "loss": 0.876, + "step": 7898 + }, + { + "epoch": 3.580689029918404, + "grad_norm": 0.16348319533841688, + "learning_rate": 1.8081373124001313e-05, + "loss": 0.8651, + "step": 7899 + }, + { + "epoch": 3.5811423390752495, + "grad_norm": 0.13835896013158888, + "learning_rate": 1.8070785142176847e-05, + "loss": 0.8612, + "step": 7900 + }, + { + "epoch": 3.5815956482320943, + "grad_norm": 0.16905024767742888, + "learning_rate": 1.8060199356627682e-05, + "loss": 0.8863, + "step": 7901 + }, + { + "epoch": 3.582048957388939, + "grad_norm": 0.13502962639350827, + "learning_rate": 1.804961576841405e-05, + "loss": 0.8744, + "step": 7902 + }, + { + "epoch": 3.5825022665457844, + "grad_norm": 0.18221246582298622, + "learning_rate": 1.8039034378595898e-05, + "loss": 0.8699, + "step": 7903 + }, + { + "epoch": 3.5829555757026292, + "grad_norm": 0.17072729061530495, + "learning_rate": 1.8028455188233003e-05, + "loss": 0.8788, + "step": 7904 + }, + { + "epoch": 3.583408884859474, + "grad_norm": 0.17107369266365866, + "learning_rate": 1.8017878198384897e-05, + "loss": 0.8883, + "step": 7905 + }, + { + "epoch": 3.5838621940163193, + "grad_norm": 0.16033148077739845, + "learning_rate": 1.800730341011088e-05, + "loss": 0.8842, + "step": 7906 + }, + { + "epoch": 3.584315503173164, + "grad_norm": 0.16958901561495893, + "learning_rate": 1.799673082447008e-05, + "loss": 0.8519, + "step": 7907 + }, + { + "epoch": 3.584768812330009, + "grad_norm": 0.175229286106887, + "learning_rate": 1.7986160442521333e-05, + "loss": 0.8684, + "step": 7908 + }, + { + "epoch": 3.5852221214868543, + "grad_norm": 0.20163858939342325, + "learning_rate": 1.7975592265323332e-05, + "loss": 0.886, + "step": 7909 + }, + { + "epoch": 3.585675430643699, + "grad_norm": 0.15731842693487647, + "learning_rate": 1.7965026293934493e-05, + "loss": 0.8472, + "step": 7910 + }, + { + "epoch": 3.586128739800544, + "grad_norm": 0.21916975324628035, + "learning_rate": 1.7954462529413032e-05, + "loss": 0.8653, + "step": 7911 + }, + { + "epoch": 3.586582048957389, + "grad_norm": 0.15007466328742503, + "learning_rate": 1.7943900972816926e-05, + "loss": 0.8514, + "step": 7912 + }, + { + "epoch": 3.587035358114234, + "grad_norm": 0.1894149684184324, + "learning_rate": 1.7933341625203972e-05, + "loss": 0.8467, + "step": 7913 + }, + { + "epoch": 3.587488667271079, + "grad_norm": 0.15828874899348394, + "learning_rate": 1.792278448763169e-05, + "loss": 0.87, + "step": 7914 + }, + { + "epoch": 3.587941976427924, + "grad_norm": 0.18252985010726203, + "learning_rate": 1.7912229561157444e-05, + "loss": 0.8689, + "step": 7915 + }, + { + "epoch": 3.588395285584769, + "grad_norm": 0.19983547194998527, + "learning_rate": 1.7901676846838305e-05, + "loss": 0.8541, + "step": 7916 + }, + { + "epoch": 3.5888485947416138, + "grad_norm": 0.1901724872570542, + "learning_rate": 1.7891126345731195e-05, + "loss": 0.8551, + "step": 7917 + }, + { + "epoch": 3.5893019038984586, + "grad_norm": 0.17059496588677145, + "learning_rate": 1.788057805889274e-05, + "loss": 0.8413, + "step": 7918 + }, + { + "epoch": 3.589755213055304, + "grad_norm": 0.17159371790761535, + "learning_rate": 1.7870031987379413e-05, + "loss": 0.8508, + "step": 7919 + }, + { + "epoch": 3.5902085222121487, + "grad_norm": 0.18287314245267283, + "learning_rate": 1.785948813224742e-05, + "loss": 0.8562, + "step": 7920 + }, + { + "epoch": 3.5906618313689935, + "grad_norm": 0.1633411851528427, + "learning_rate": 1.784894649455275e-05, + "loss": 0.8579, + "step": 7921 + }, + { + "epoch": 3.5911151405258384, + "grad_norm": 0.19917288388300186, + "learning_rate": 1.7838407075351205e-05, + "loss": 0.8846, + "step": 7922 + }, + { + "epoch": 3.5915684496826836, + "grad_norm": 0.16749809134904045, + "learning_rate": 1.7827869875698302e-05, + "loss": 0.8696, + "step": 7923 + }, + { + "epoch": 3.5920217588395285, + "grad_norm": 0.20971190234556583, + "learning_rate": 1.7817334896649416e-05, + "loss": 0.8652, + "step": 7924 + }, + { + "epoch": 3.5924750679963733, + "grad_norm": 0.19184163827901518, + "learning_rate": 1.7806802139259612e-05, + "loss": 0.8643, + "step": 7925 + }, + { + "epoch": 3.5929283771532186, + "grad_norm": 0.16712209823309648, + "learning_rate": 1.779627160458382e-05, + "loss": 0.8927, + "step": 7926 + }, + { + "epoch": 3.5933816863100634, + "grad_norm": 0.1953734286388986, + "learning_rate": 1.778574329367666e-05, + "loss": 0.8593, + "step": 7927 + }, + { + "epoch": 3.593834995466908, + "grad_norm": 0.17195240517343016, + "learning_rate": 1.777521720759262e-05, + "loss": 0.8721, + "step": 7928 + }, + { + "epoch": 3.5942883046237535, + "grad_norm": 0.22102011859468626, + "learning_rate": 1.7764693347385868e-05, + "loss": 0.8671, + "step": 7929 + }, + { + "epoch": 3.5947416137805983, + "grad_norm": 0.18394907746250788, + "learning_rate": 1.7754171714110463e-05, + "loss": 0.8533, + "step": 7930 + }, + { + "epoch": 3.595194922937443, + "grad_norm": 0.1937066318444046, + "learning_rate": 1.774365230882011e-05, + "loss": 0.868, + "step": 7931 + }, + { + "epoch": 3.5956482320942884, + "grad_norm": 0.21559139040056582, + "learning_rate": 1.7733135132568402e-05, + "loss": 0.8474, + "step": 7932 + }, + { + "epoch": 3.5961015412511332, + "grad_norm": 0.17683996671416838, + "learning_rate": 1.7722620186408638e-05, + "loss": 0.8461, + "step": 7933 + }, + { + "epoch": 3.596554850407978, + "grad_norm": 0.2136419036899005, + "learning_rate": 1.7712107471393946e-05, + "loss": 0.8577, + "step": 7934 + }, + { + "epoch": 3.5970081595648233, + "grad_norm": 0.18396219674178255, + "learning_rate": 1.7701596988577193e-05, + "loss": 0.8662, + "step": 7935 + }, + { + "epoch": 3.597461468721668, + "grad_norm": 0.17749740369851175, + "learning_rate": 1.7691088739011023e-05, + "loss": 0.8824, + "step": 7936 + }, + { + "epoch": 3.597914777878513, + "grad_norm": 0.1875961549592077, + "learning_rate": 1.7680582723747894e-05, + "loss": 0.8499, + "step": 7937 + }, + { + "epoch": 3.5983680870353583, + "grad_norm": 0.19578353107418553, + "learning_rate": 1.767007894383999e-05, + "loss": 0.8792, + "step": 7938 + }, + { + "epoch": 3.598821396192203, + "grad_norm": 0.1593338664157306, + "learning_rate": 1.7659577400339318e-05, + "loss": 0.876, + "step": 7939 + }, + { + "epoch": 3.599274705349048, + "grad_norm": 0.20879796731032485, + "learning_rate": 1.764907809429761e-05, + "loss": 0.8597, + "step": 7940 + }, + { + "epoch": 3.599728014505893, + "grad_norm": 0.1993883476256691, + "learning_rate": 1.7638581026766436e-05, + "loss": 0.8996, + "step": 7941 + }, + { + "epoch": 3.600181323662738, + "grad_norm": 0.1944162508950195, + "learning_rate": 1.7628086198797077e-05, + "loss": 0.8567, + "step": 7942 + }, + { + "epoch": 3.600634632819583, + "grad_norm": 0.20424297759713939, + "learning_rate": 1.761759361144065e-05, + "loss": 0.8664, + "step": 7943 + }, + { + "epoch": 3.601087941976428, + "grad_norm": 0.18158389757262183, + "learning_rate": 1.7607103265747986e-05, + "loss": 0.8764, + "step": 7944 + }, + { + "epoch": 3.601541251133273, + "grad_norm": 0.17520798809260596, + "learning_rate": 1.759661516276976e-05, + "loss": 0.849, + "step": 7945 + }, + { + "epoch": 3.6019945602901178, + "grad_norm": 0.2196086294665604, + "learning_rate": 1.7586129303556364e-05, + "loss": 0.8857, + "step": 7946 + }, + { + "epoch": 3.602447869446963, + "grad_norm": 0.1677622141316975, + "learning_rate": 1.7575645689157978e-05, + "loss": 0.8883, + "step": 7947 + }, + { + "epoch": 3.602901178603808, + "grad_norm": 0.21179810707922275, + "learning_rate": 1.7565164320624597e-05, + "loss": 0.8674, + "step": 7948 + }, + { + "epoch": 3.6033544877606527, + "grad_norm": 0.19299393075470642, + "learning_rate": 1.7554685199005945e-05, + "loss": 0.8632, + "step": 7949 + }, + { + "epoch": 3.603807796917498, + "grad_norm": 0.19627017122572196, + "learning_rate": 1.754420832535153e-05, + "loss": 0.8696, + "step": 7950 + }, + { + "epoch": 3.604261106074343, + "grad_norm": 0.18566774460353244, + "learning_rate": 1.7533733700710636e-05, + "loss": 0.8649, + "step": 7951 + }, + { + "epoch": 3.6047144152311876, + "grad_norm": 0.18907024726033297, + "learning_rate": 1.7523261326132352e-05, + "loss": 0.8752, + "step": 7952 + }, + { + "epoch": 3.605167724388033, + "grad_norm": 0.19857504121188743, + "learning_rate": 1.7512791202665492e-05, + "loss": 0.8735, + "step": 7953 + }, + { + "epoch": 3.6056210335448777, + "grad_norm": 0.19615994676676593, + "learning_rate": 1.7502323331358696e-05, + "loss": 0.8888, + "step": 7954 + }, + { + "epoch": 3.6060743427017226, + "grad_norm": 0.2102305565008125, + "learning_rate": 1.7491857713260322e-05, + "loss": 0.8734, + "step": 7955 + }, + { + "epoch": 3.6065276518585674, + "grad_norm": 0.1813678975995554, + "learning_rate": 1.7481394349418565e-05, + "loss": 0.8785, + "step": 7956 + }, + { + "epoch": 3.6069809610154127, + "grad_norm": 0.1918944805072537, + "learning_rate": 1.747093324088133e-05, + "loss": 0.8612, + "step": 7957 + }, + { + "epoch": 3.6074342701722575, + "grad_norm": 0.23361003978383568, + "learning_rate": 1.7460474388696354e-05, + "loss": 0.8741, + "step": 7958 + }, + { + "epoch": 3.6078875793291023, + "grad_norm": 0.16262407696269768, + "learning_rate": 1.7450017793911097e-05, + "loss": 0.8609, + "step": 7959 + }, + { + "epoch": 3.608340888485947, + "grad_norm": 0.20190662123390982, + "learning_rate": 1.7439563457572846e-05, + "loss": 0.8708, + "step": 7960 + }, + { + "epoch": 3.6087941976427924, + "grad_norm": 0.1672739121136691, + "learning_rate": 1.742911138072862e-05, + "loss": 0.8764, + "step": 7961 + }, + { + "epoch": 3.6092475067996372, + "grad_norm": 0.19697890738035612, + "learning_rate": 1.741866156442521e-05, + "loss": 0.8743, + "step": 7962 + }, + { + "epoch": 3.609700815956482, + "grad_norm": 0.1640735435018243, + "learning_rate": 1.7408214009709222e-05, + "loss": 0.8643, + "step": 7963 + }, + { + "epoch": 3.6101541251133273, + "grad_norm": 0.1673464444974834, + "learning_rate": 1.739776871762698e-05, + "loss": 0.8623, + "step": 7964 + }, + { + "epoch": 3.610607434270172, + "grad_norm": 0.183649335449492, + "learning_rate": 1.7387325689224645e-05, + "loss": 0.8778, + "step": 7965 + }, + { + "epoch": 3.611060743427017, + "grad_norm": 0.19727896062094838, + "learning_rate": 1.737688492554809e-05, + "loss": 0.8815, + "step": 7966 + }, + { + "epoch": 3.6115140525838623, + "grad_norm": 0.1837464439044281, + "learning_rate": 1.7366446427643008e-05, + "loss": 0.8657, + "step": 7967 + }, + { + "epoch": 3.611967361740707, + "grad_norm": 0.22327064056289678, + "learning_rate": 1.735601019655483e-05, + "loss": 0.8671, + "step": 7968 + }, + { + "epoch": 3.612420670897552, + "grad_norm": 0.17289093229103084, + "learning_rate": 1.7345576233328788e-05, + "loss": 0.8654, + "step": 7969 + }, + { + "epoch": 3.612873980054397, + "grad_norm": 0.1586114628918905, + "learning_rate": 1.733514453900985e-05, + "loss": 0.8731, + "step": 7970 + }, + { + "epoch": 3.613327289211242, + "grad_norm": 0.19248983822980886, + "learning_rate": 1.7324715114642807e-05, + "loss": 0.8698, + "step": 7971 + }, + { + "epoch": 3.613780598368087, + "grad_norm": 0.5164650168306827, + "learning_rate": 1.7314287961272178e-05, + "loss": 0.8766, + "step": 7972 + }, + { + "epoch": 3.614233907524932, + "grad_norm": 0.17930781160772497, + "learning_rate": 1.7303863079942296e-05, + "loss": 0.8745, + "step": 7973 + }, + { + "epoch": 3.614687216681777, + "grad_norm": 0.1706410101110211, + "learning_rate": 1.729344047169721e-05, + "loss": 0.8697, + "step": 7974 + }, + { + "epoch": 3.6151405258386218, + "grad_norm": 0.19594512672439762, + "learning_rate": 1.7283020137580812e-05, + "loss": 0.8602, + "step": 7975 + }, + { + "epoch": 3.615593834995467, + "grad_norm": 0.1800660864470222, + "learning_rate": 1.7272602078636713e-05, + "loss": 0.8684, + "step": 7976 + }, + { + "epoch": 3.616047144152312, + "grad_norm": 0.22396224199237494, + "learning_rate": 1.726218629590829e-05, + "loss": 0.8643, + "step": 7977 + }, + { + "epoch": 3.6165004533091567, + "grad_norm": 0.16952946546914363, + "learning_rate": 1.7251772790438752e-05, + "loss": 0.8607, + "step": 7978 + }, + { + "epoch": 3.616953762466002, + "grad_norm": 0.17694721390806475, + "learning_rate": 1.7241361563271012e-05, + "loss": 0.859, + "step": 7979 + }, + { + "epoch": 3.617407071622847, + "grad_norm": 0.1729949068743132, + "learning_rate": 1.7230952615447814e-05, + "loss": 0.8585, + "step": 7980 + }, + { + "epoch": 3.6178603807796916, + "grad_norm": 0.1557944849186986, + "learning_rate": 1.722054594801161e-05, + "loss": 0.8718, + "step": 7981 + }, + { + "epoch": 3.618313689936537, + "grad_norm": 0.17909252107312784, + "learning_rate": 1.7210141562004696e-05, + "loss": 0.8676, + "step": 7982 + }, + { + "epoch": 3.6187669990933817, + "grad_norm": 0.1678038917324217, + "learning_rate": 1.7199739458469062e-05, + "loss": 0.8768, + "step": 7983 + }, + { + "epoch": 3.6192203082502266, + "grad_norm": 0.16479737384984816, + "learning_rate": 1.7189339638446546e-05, + "loss": 0.854, + "step": 7984 + }, + { + "epoch": 3.619673617407072, + "grad_norm": 0.19589664115599273, + "learning_rate": 1.7178942102978692e-05, + "loss": 0.8523, + "step": 7985 + }, + { + "epoch": 3.6201269265639167, + "grad_norm": 0.14796775680405558, + "learning_rate": 1.716854685310687e-05, + "loss": 0.8624, + "step": 7986 + }, + { + "epoch": 3.6205802357207615, + "grad_norm": 0.19549750342700647, + "learning_rate": 1.7158153889872183e-05, + "loss": 0.8686, + "step": 7987 + }, + { + "epoch": 3.6210335448776068, + "grad_norm": 0.15909673729378518, + "learning_rate": 1.7147763214315515e-05, + "loss": 0.8874, + "step": 7988 + }, + { + "epoch": 3.6214868540344516, + "grad_norm": 0.15368991472633173, + "learning_rate": 1.7137374827477503e-05, + "loss": 0.8751, + "step": 7989 + }, + { + "epoch": 3.6219401631912964, + "grad_norm": 0.17542092383600458, + "learning_rate": 1.7126988730398612e-05, + "loss": 0.8692, + "step": 7990 + }, + { + "epoch": 3.6223934723481417, + "grad_norm": 0.18073259874031872, + "learning_rate": 1.7116604924119023e-05, + "loss": 0.8743, + "step": 7991 + }, + { + "epoch": 3.6228467815049865, + "grad_norm": 0.19267791946988083, + "learning_rate": 1.7106223409678685e-05, + "loss": 0.8682, + "step": 7992 + }, + { + "epoch": 3.6233000906618313, + "grad_norm": 0.16009694184459122, + "learning_rate": 1.7095844188117375e-05, + "loss": 0.8523, + "step": 7993 + }, + { + "epoch": 3.6237533998186766, + "grad_norm": 0.2313318108118491, + "learning_rate": 1.7085467260474572e-05, + "loss": 0.8823, + "step": 7994 + }, + { + "epoch": 3.6242067089755214, + "grad_norm": 0.17732653946402369, + "learning_rate": 1.7075092627789572e-05, + "loss": 0.8583, + "step": 7995 + }, + { + "epoch": 3.6246600181323663, + "grad_norm": 0.20125598920311585, + "learning_rate": 1.7064720291101413e-05, + "loss": 0.8687, + "step": 7996 + }, + { + "epoch": 3.625113327289211, + "grad_norm": 0.22273594383176484, + "learning_rate": 1.7054350251448937e-05, + "loss": 0.8775, + "step": 7997 + }, + { + "epoch": 3.6255666364460564, + "grad_norm": 0.17504609334601043, + "learning_rate": 1.7043982509870702e-05, + "loss": 0.8639, + "step": 7998 + }, + { + "epoch": 3.626019945602901, + "grad_norm": 0.20427926446637135, + "learning_rate": 1.7033617067405105e-05, + "loss": 0.8876, + "step": 7999 + }, + { + "epoch": 3.626473254759746, + "grad_norm": 0.21706333999612942, + "learning_rate": 1.7023253925090237e-05, + "loss": 0.8765, + "step": 8000 + }, + { + "epoch": 3.626926563916591, + "grad_norm": 0.18551439353722377, + "learning_rate": 1.7012893083964027e-05, + "loss": 0.8622, + "step": 8001 + }, + { + "epoch": 3.627379873073436, + "grad_norm": 0.21529365255768626, + "learning_rate": 1.700253454506414e-05, + "loss": 0.8808, + "step": 8002 + }, + { + "epoch": 3.627833182230281, + "grad_norm": 0.2015109702648719, + "learning_rate": 1.6992178309427986e-05, + "loss": 0.8625, + "step": 8003 + }, + { + "epoch": 3.628286491387126, + "grad_norm": 0.1902064699195901, + "learning_rate": 1.698182437809281e-05, + "loss": 0.8618, + "step": 8004 + }, + { + "epoch": 3.628739800543971, + "grad_norm": 0.17934481837581512, + "learning_rate": 1.697147275209556e-05, + "loss": 0.8514, + "step": 8005 + }, + { + "epoch": 3.629193109700816, + "grad_norm": 0.1837208500110439, + "learning_rate": 1.6961123432473016e-05, + "loss": 0.8691, + "step": 8006 + }, + { + "epoch": 3.6296464188576607, + "grad_norm": 0.17169099245220992, + "learning_rate": 1.6950776420261646e-05, + "loss": 0.8787, + "step": 8007 + }, + { + "epoch": 3.630099728014506, + "grad_norm": 0.21606019085563788, + "learning_rate": 1.694043171649777e-05, + "loss": 0.8676, + "step": 8008 + }, + { + "epoch": 3.630553037171351, + "grad_norm": 0.18420676176140038, + "learning_rate": 1.6930089322217417e-05, + "loss": 0.854, + "step": 8009 + }, + { + "epoch": 3.6310063463281956, + "grad_norm": 0.17207363152895538, + "learning_rate": 1.6919749238456435e-05, + "loss": 0.8596, + "step": 8010 + }, + { + "epoch": 3.631459655485041, + "grad_norm": 0.22341014116517013, + "learning_rate": 1.6909411466250375e-05, + "loss": 0.8704, + "step": 8011 + }, + { + "epoch": 3.6319129646418857, + "grad_norm": 0.19015100270616217, + "learning_rate": 1.6899076006634643e-05, + "loss": 0.8843, + "step": 8012 + }, + { + "epoch": 3.6323662737987306, + "grad_norm": 0.15182482374378478, + "learning_rate": 1.688874286064432e-05, + "loss": 0.8691, + "step": 8013 + }, + { + "epoch": 3.632819582955576, + "grad_norm": 0.20497936061045224, + "learning_rate": 1.687841202931434e-05, + "loss": 0.8786, + "step": 8014 + }, + { + "epoch": 3.6332728921124207, + "grad_norm": 0.20929298273402383, + "learning_rate": 1.6868083513679335e-05, + "loss": 0.8638, + "step": 8015 + }, + { + "epoch": 3.6337262012692655, + "grad_norm": 0.17625376228593043, + "learning_rate": 1.6857757314773763e-05, + "loss": 0.8757, + "step": 8016 + }, + { + "epoch": 3.6341795104261108, + "grad_norm": 0.1854350280874724, + "learning_rate": 1.6847433433631806e-05, + "loss": 0.8548, + "step": 8017 + }, + { + "epoch": 3.6346328195829556, + "grad_norm": 0.20536086861127179, + "learning_rate": 1.6837111871287426e-05, + "loss": 0.865, + "step": 8018 + }, + { + "epoch": 3.6350861287398004, + "grad_norm": 0.17027510603798812, + "learning_rate": 1.682679262877438e-05, + "loss": 0.8961, + "step": 8019 + }, + { + "epoch": 3.6355394378966457, + "grad_norm": 0.1737690421785, + "learning_rate": 1.681647570712614e-05, + "loss": 0.8504, + "step": 8020 + }, + { + "epoch": 3.6359927470534905, + "grad_norm": 0.1807629577177738, + "learning_rate": 1.680616110737601e-05, + "loss": 0.8471, + "step": 8021 + }, + { + "epoch": 3.6364460562103353, + "grad_norm": 0.1487665386927012, + "learning_rate": 1.6795848830556995e-05, + "loss": 0.8658, + "step": 8022 + }, + { + "epoch": 3.6368993653671806, + "grad_norm": 0.1696103054879212, + "learning_rate": 1.678553887770193e-05, + "loss": 0.8383, + "step": 8023 + }, + { + "epoch": 3.6373526745240254, + "grad_norm": 0.19549690671085967, + "learning_rate": 1.677523124984336e-05, + "loss": 0.8641, + "step": 8024 + }, + { + "epoch": 3.6378059836808703, + "grad_norm": 0.1560645196606542, + "learning_rate": 1.6764925948013645e-05, + "loss": 0.8721, + "step": 8025 + }, + { + "epoch": 3.6382592928377155, + "grad_norm": 0.18173588659502452, + "learning_rate": 1.675462297324489e-05, + "loss": 0.8741, + "step": 8026 + }, + { + "epoch": 3.6387126019945604, + "grad_norm": 0.15821608225535538, + "learning_rate": 1.6744322326568957e-05, + "loss": 0.8585, + "step": 8027 + }, + { + "epoch": 3.639165911151405, + "grad_norm": 0.16338467458098535, + "learning_rate": 1.6734024009017474e-05, + "loss": 0.8734, + "step": 8028 + }, + { + "epoch": 3.6396192203082505, + "grad_norm": 0.1573719846851183, + "learning_rate": 1.6723728021621882e-05, + "loss": 0.8679, + "step": 8029 + }, + { + "epoch": 3.6400725294650953, + "grad_norm": 0.1475513356182183, + "learning_rate": 1.6713434365413318e-05, + "loss": 0.8584, + "step": 8030 + }, + { + "epoch": 3.64052583862194, + "grad_norm": 0.16956007923081948, + "learning_rate": 1.6703143041422757e-05, + "loss": 0.8794, + "step": 8031 + }, + { + "epoch": 3.6409791477787854, + "grad_norm": 0.14924530485532783, + "learning_rate": 1.669285405068089e-05, + "loss": 0.8557, + "step": 8032 + }, + { + "epoch": 3.6414324569356302, + "grad_norm": 0.1852030850705823, + "learning_rate": 1.668256739421817e-05, + "loss": 0.8725, + "step": 8033 + }, + { + "epoch": 3.641885766092475, + "grad_norm": 0.15521596890674375, + "learning_rate": 1.6672283073064875e-05, + "loss": 0.8789, + "step": 8034 + }, + { + "epoch": 3.64233907524932, + "grad_norm": 0.19666635092283163, + "learning_rate": 1.666200108825097e-05, + "loss": 0.8602, + "step": 8035 + }, + { + "epoch": 3.642792384406165, + "grad_norm": 0.15479074598332795, + "learning_rate": 1.6651721440806262e-05, + "loss": 0.8759, + "step": 8036 + }, + { + "epoch": 3.64324569356301, + "grad_norm": 0.17820410298877704, + "learning_rate": 1.6641444131760254e-05, + "loss": 0.8848, + "step": 8037 + }, + { + "epoch": 3.643699002719855, + "grad_norm": 0.16691763321316133, + "learning_rate": 1.6631169162142282e-05, + "loss": 0.8608, + "step": 8038 + }, + { + "epoch": 3.6441523118766996, + "grad_norm": 0.15255499344239304, + "learning_rate": 1.6620896532981382e-05, + "loss": 0.8586, + "step": 8039 + }, + { + "epoch": 3.644605621033545, + "grad_norm": 0.1706855636755965, + "learning_rate": 1.661062624530642e-05, + "loss": 0.8362, + "step": 8040 + }, + { + "epoch": 3.6450589301903897, + "grad_norm": 0.17820641549673705, + "learning_rate": 1.6600358300145965e-05, + "loss": 0.8491, + "step": 8041 + }, + { + "epoch": 3.6455122393472346, + "grad_norm": 0.18377155785712132, + "learning_rate": 1.6590092698528414e-05, + "loss": 0.867, + "step": 8042 + }, + { + "epoch": 3.64596554850408, + "grad_norm": 0.17611331850076625, + "learning_rate": 1.6579829441481872e-05, + "loss": 0.8594, + "step": 8043 + }, + { + "epoch": 3.6464188576609247, + "grad_norm": 0.193553923494444, + "learning_rate": 1.656956853003424e-05, + "loss": 0.8763, + "step": 8044 + }, + { + "epoch": 3.6468721668177695, + "grad_norm": 0.19182056191736171, + "learning_rate": 1.655930996521318e-05, + "loss": 0.8852, + "step": 8045 + }, + { + "epoch": 3.6473254759746148, + "grad_norm": 0.18981125034091317, + "learning_rate": 1.6549053748046128e-05, + "loss": 0.8795, + "step": 8046 + }, + { + "epoch": 3.6477787851314596, + "grad_norm": 0.1490247879091745, + "learning_rate": 1.6538799879560245e-05, + "loss": 0.8478, + "step": 8047 + }, + { + "epoch": 3.6482320942883044, + "grad_norm": 0.20861856924705455, + "learning_rate": 1.652854836078252e-05, + "loss": 0.8492, + "step": 8048 + }, + { + "epoch": 3.6486854034451497, + "grad_norm": 0.14277107590038352, + "learning_rate": 1.6518299192739662e-05, + "loss": 0.8591, + "step": 8049 + }, + { + "epoch": 3.6491387126019945, + "grad_norm": 0.21159928016246818, + "learning_rate": 1.6508052376458136e-05, + "loss": 0.8512, + "step": 8050 + }, + { + "epoch": 3.6495920217588393, + "grad_norm": 0.14490600201452603, + "learning_rate": 1.6497807912964215e-05, + "loss": 0.8694, + "step": 8051 + }, + { + "epoch": 3.6500453309156846, + "grad_norm": 0.17874162874082256, + "learning_rate": 1.6487565803283898e-05, + "loss": 0.8664, + "step": 8052 + }, + { + "epoch": 3.6504986400725294, + "grad_norm": 0.15950576476298217, + "learning_rate": 1.6477326048442973e-05, + "loss": 0.8582, + "step": 8053 + }, + { + "epoch": 3.6509519492293743, + "grad_norm": 0.17450755358836267, + "learning_rate": 1.6467088649466965e-05, + "loss": 0.8768, + "step": 8054 + }, + { + "epoch": 3.6514052583862195, + "grad_norm": 0.19597948035416068, + "learning_rate": 1.6456853607381207e-05, + "loss": 0.8534, + "step": 8055 + }, + { + "epoch": 3.6518585675430644, + "grad_norm": 0.1408740456181833, + "learning_rate": 1.6446620923210734e-05, + "loss": 0.8762, + "step": 8056 + }, + { + "epoch": 3.652311876699909, + "grad_norm": 0.1946124874794978, + "learning_rate": 1.6436390597980416e-05, + "loss": 0.8525, + "step": 8057 + }, + { + "epoch": 3.6527651858567545, + "grad_norm": 0.15515136418464623, + "learning_rate": 1.642616263271483e-05, + "loss": 0.8534, + "step": 8058 + }, + { + "epoch": 3.6532184950135993, + "grad_norm": 0.1731885757915372, + "learning_rate": 1.6415937028438326e-05, + "loss": 0.8688, + "step": 8059 + }, + { + "epoch": 3.653671804170444, + "grad_norm": 0.18222284524472132, + "learning_rate": 1.640571378617505e-05, + "loss": 0.8591, + "step": 8060 + }, + { + "epoch": 3.6541251133272894, + "grad_norm": 0.17076228235330962, + "learning_rate": 1.639549290694887e-05, + "loss": 0.8653, + "step": 8061 + }, + { + "epoch": 3.6545784224841342, + "grad_norm": 0.16871696590352459, + "learning_rate": 1.638527439178346e-05, + "loss": 0.8707, + "step": 8062 + }, + { + "epoch": 3.655031731640979, + "grad_norm": 0.17323270210576913, + "learning_rate": 1.637505824170221e-05, + "loss": 0.86, + "step": 8063 + }, + { + "epoch": 3.6554850407978243, + "grad_norm": 0.18583903246272052, + "learning_rate": 1.6364844457728338e-05, + "loss": 0.8613, + "step": 8064 + }, + { + "epoch": 3.655938349954669, + "grad_norm": 0.1634810063613239, + "learning_rate": 1.635463304088473e-05, + "loss": 0.8895, + "step": 8065 + }, + { + "epoch": 3.656391659111514, + "grad_norm": 0.18913715897728295, + "learning_rate": 1.634442399219412e-05, + "loss": 0.8553, + "step": 8066 + }, + { + "epoch": 3.6568449682683593, + "grad_norm": 0.17789236846700787, + "learning_rate": 1.6334217312678967e-05, + "loss": 0.865, + "step": 8067 + }, + { + "epoch": 3.657298277425204, + "grad_norm": 0.1784081619202631, + "learning_rate": 1.632401300336151e-05, + "loss": 0.865, + "step": 8068 + }, + { + "epoch": 3.657751586582049, + "grad_norm": 0.1896794187124639, + "learning_rate": 1.631381106526372e-05, + "loss": 0.8709, + "step": 8069 + }, + { + "epoch": 3.658204895738894, + "grad_norm": 0.19810687382182157, + "learning_rate": 1.6303611499407384e-05, + "loss": 0.8587, + "step": 8070 + }, + { + "epoch": 3.658658204895739, + "grad_norm": 0.20334526192838345, + "learning_rate": 1.6293414306813976e-05, + "loss": 0.85, + "step": 8071 + }, + { + "epoch": 3.659111514052584, + "grad_norm": 0.1908530735729352, + "learning_rate": 1.6283219488504814e-05, + "loss": 0.8587, + "step": 8072 + }, + { + "epoch": 3.659564823209429, + "grad_norm": 0.2979894290148619, + "learning_rate": 1.627302704550091e-05, + "loss": 0.8832, + "step": 8073 + }, + { + "epoch": 3.660018132366274, + "grad_norm": 0.2104350016692748, + "learning_rate": 1.6262836978823096e-05, + "loss": 0.8749, + "step": 8074 + }, + { + "epoch": 3.6604714415231188, + "grad_norm": 0.2243716814458314, + "learning_rate": 1.625264928949192e-05, + "loss": 0.8667, + "step": 8075 + }, + { + "epoch": 3.6609247506799636, + "grad_norm": 0.25638496311495185, + "learning_rate": 1.6242463978527695e-05, + "loss": 0.8609, + "step": 8076 + }, + { + "epoch": 3.661378059836809, + "grad_norm": 0.20812398059115253, + "learning_rate": 1.6232281046950537e-05, + "loss": 0.858, + "step": 8077 + }, + { + "epoch": 3.6618313689936537, + "grad_norm": 0.16407496028079427, + "learning_rate": 1.622210049578027e-05, + "loss": 0.8816, + "step": 8078 + }, + { + "epoch": 3.6622846781504985, + "grad_norm": 0.2462836073550724, + "learning_rate": 1.6211922326036534e-05, + "loss": 0.8678, + "step": 8079 + }, + { + "epoch": 3.6627379873073433, + "grad_norm": 0.17322744409598959, + "learning_rate": 1.6201746538738677e-05, + "loss": 0.8631, + "step": 8080 + }, + { + "epoch": 3.6631912964641886, + "grad_norm": 0.18369636557260974, + "learning_rate": 1.619157313490586e-05, + "loss": 0.8703, + "step": 8081 + }, + { + "epoch": 3.6636446056210334, + "grad_norm": 0.20563199061464635, + "learning_rate": 1.6181402115556954e-05, + "loss": 0.8473, + "step": 8082 + }, + { + "epoch": 3.6640979147778783, + "grad_norm": 0.16348257817576609, + "learning_rate": 1.6171233481710648e-05, + "loss": 0.8584, + "step": 8083 + }, + { + "epoch": 3.6645512239347235, + "grad_norm": 0.19853626809345218, + "learning_rate": 1.6161067234385316e-05, + "loss": 0.8519, + "step": 8084 + }, + { + "epoch": 3.6650045330915684, + "grad_norm": 0.24808981812006003, + "learning_rate": 1.615090337459918e-05, + "loss": 0.8627, + "step": 8085 + }, + { + "epoch": 3.665457842248413, + "grad_norm": 0.18642418071317912, + "learning_rate": 1.6140741903370144e-05, + "loss": 0.8781, + "step": 8086 + }, + { + "epoch": 3.6659111514052585, + "grad_norm": 0.27714672192179424, + "learning_rate": 1.6130582821715937e-05, + "loss": 0.8665, + "step": 8087 + }, + { + "epoch": 3.6663644605621033, + "grad_norm": 0.21828292750124226, + "learning_rate": 1.6120426130654006e-05, + "loss": 0.8477, + "step": 8088 + }, + { + "epoch": 3.666817769718948, + "grad_norm": 0.19328436034983248, + "learning_rate": 1.6110271831201588e-05, + "loss": 0.8701, + "step": 8089 + }, + { + "epoch": 3.6672710788757934, + "grad_norm": 0.2344698859159846, + "learning_rate": 1.610011992437566e-05, + "loss": 0.8554, + "step": 8090 + }, + { + "epoch": 3.6677243880326382, + "grad_norm": 0.15377068287303639, + "learning_rate": 1.6089970411192942e-05, + "loss": 0.8702, + "step": 8091 + }, + { + "epoch": 3.668177697189483, + "grad_norm": 0.20821208460054522, + "learning_rate": 1.6079823292669976e-05, + "loss": 0.8713, + "step": 8092 + }, + { + "epoch": 3.6686310063463283, + "grad_norm": 0.20268581629827676, + "learning_rate": 1.6069678569822993e-05, + "loss": 0.8595, + "step": 8093 + }, + { + "epoch": 3.669084315503173, + "grad_norm": 0.1494799288890957, + "learning_rate": 1.6059536243668044e-05, + "loss": 0.8587, + "step": 8094 + }, + { + "epoch": 3.669537624660018, + "grad_norm": 0.1840279379129244, + "learning_rate": 1.6049396315220887e-05, + "loss": 0.874, + "step": 8095 + }, + { + "epoch": 3.6699909338168633, + "grad_norm": 0.1718810035709623, + "learning_rate": 1.6039258785497094e-05, + "loss": 0.8613, + "step": 8096 + }, + { + "epoch": 3.670444242973708, + "grad_norm": 0.17267680713158304, + "learning_rate": 1.602912365551194e-05, + "loss": 0.8613, + "step": 8097 + }, + { + "epoch": 3.670897552130553, + "grad_norm": 0.1646660542962838, + "learning_rate": 1.6018990926280513e-05, + "loss": 0.8767, + "step": 8098 + }, + { + "epoch": 3.671350861287398, + "grad_norm": 0.16283725048623932, + "learning_rate": 1.6008860598817605e-05, + "loss": 0.8692, + "step": 8099 + }, + { + "epoch": 3.671804170444243, + "grad_norm": 0.16354345336561826, + "learning_rate": 1.5998732674137836e-05, + "loss": 0.8847, + "step": 8100 + }, + { + "epoch": 3.672257479601088, + "grad_norm": 0.21326879214161565, + "learning_rate": 1.598860715325553e-05, + "loss": 0.8585, + "step": 8101 + }, + { + "epoch": 3.672710788757933, + "grad_norm": 0.15029064803651818, + "learning_rate": 1.597848403718478e-05, + "loss": 0.8704, + "step": 8102 + }, + { + "epoch": 3.673164097914778, + "grad_norm": 0.17819195865362894, + "learning_rate": 1.5968363326939437e-05, + "loss": 0.8569, + "step": 8103 + }, + { + "epoch": 3.6736174070716228, + "grad_norm": 0.16332345613333366, + "learning_rate": 1.5958245023533156e-05, + "loss": 0.8409, + "step": 8104 + }, + { + "epoch": 3.674070716228468, + "grad_norm": 0.14277350221347382, + "learning_rate": 1.5948129127979285e-05, + "loss": 0.8672, + "step": 8105 + }, + { + "epoch": 3.674524025385313, + "grad_norm": 0.1779111729716305, + "learning_rate": 1.5938015641290962e-05, + "loss": 0.8815, + "step": 8106 + }, + { + "epoch": 3.6749773345421577, + "grad_norm": 0.1528663532082498, + "learning_rate": 1.59279045644811e-05, + "loss": 0.8577, + "step": 8107 + }, + { + "epoch": 3.675430643699003, + "grad_norm": 0.1573358829894495, + "learning_rate": 1.5917795898562327e-05, + "loss": 0.871, + "step": 8108 + }, + { + "epoch": 3.675883952855848, + "grad_norm": 0.14871758846492802, + "learning_rate": 1.590768964454709e-05, + "loss": 0.8646, + "step": 8109 + }, + { + "epoch": 3.6763372620126926, + "grad_norm": 0.1575355449407423, + "learning_rate": 1.5897585803447523e-05, + "loss": 0.8761, + "step": 8110 + }, + { + "epoch": 3.676790571169538, + "grad_norm": 0.1682060830986654, + "learning_rate": 1.5887484376275586e-05, + "loss": 0.8603, + "step": 8111 + }, + { + "epoch": 3.6772438803263827, + "grad_norm": 0.16798401846977778, + "learning_rate": 1.5877385364042943e-05, + "loss": 0.8523, + "step": 8112 + }, + { + "epoch": 3.6776971894832275, + "grad_norm": 0.15159326392767164, + "learning_rate": 1.5867288767761065e-05, + "loss": 0.8737, + "step": 8113 + }, + { + "epoch": 3.6781504986400724, + "grad_norm": 0.16893331352490248, + "learning_rate": 1.585719458844113e-05, + "loss": 0.8655, + "step": 8114 + }, + { + "epoch": 3.6786038077969176, + "grad_norm": 0.17547041933991073, + "learning_rate": 1.584710282709412e-05, + "loss": 0.8939, + "step": 8115 + }, + { + "epoch": 3.6790571169537625, + "grad_norm": 0.1677548785474805, + "learning_rate": 1.5837013484730745e-05, + "loss": 0.8597, + "step": 8116 + }, + { + "epoch": 3.6795104261106073, + "grad_norm": 0.527928092072223, + "learning_rate": 1.5826926562361474e-05, + "loss": 0.8884, + "step": 8117 + }, + { + "epoch": 3.679963735267452, + "grad_norm": 0.18693655416311228, + "learning_rate": 1.5816842060996563e-05, + "loss": 0.867, + "step": 8118 + }, + { + "epoch": 3.6804170444242974, + "grad_norm": 0.19785485355971913, + "learning_rate": 1.580675998164598e-05, + "loss": 0.8525, + "step": 8119 + }, + { + "epoch": 3.6808703535811422, + "grad_norm": 0.14204537128047098, + "learning_rate": 1.57966803253195e-05, + "loss": 0.8678, + "step": 8120 + }, + { + "epoch": 3.681323662737987, + "grad_norm": 0.20987683065548352, + "learning_rate": 1.5786603093026603e-05, + "loss": 0.8861, + "step": 8121 + }, + { + "epoch": 3.6817769718948323, + "grad_norm": 0.17526588083935973, + "learning_rate": 1.57765282857766e-05, + "loss": 0.8719, + "step": 8122 + }, + { + "epoch": 3.682230281051677, + "grad_norm": 0.21455305169578828, + "learning_rate": 1.5766455904578447e-05, + "loss": 0.8632, + "step": 8123 + }, + { + "epoch": 3.682683590208522, + "grad_norm": 0.1681072135108491, + "learning_rate": 1.5756385950440972e-05, + "loss": 0.8793, + "step": 8124 + }, + { + "epoch": 3.6831368993653673, + "grad_norm": 0.20179479114356802, + "learning_rate": 1.5746318424372683e-05, + "loss": 0.8596, + "step": 8125 + }, + { + "epoch": 3.683590208522212, + "grad_norm": 0.14074198447608058, + "learning_rate": 1.57362533273819e-05, + "loss": 0.8662, + "step": 8126 + }, + { + "epoch": 3.684043517679057, + "grad_norm": 0.20265594361608988, + "learning_rate": 1.572619066047664e-05, + "loss": 0.8753, + "step": 8127 + }, + { + "epoch": 3.684496826835902, + "grad_norm": 0.14355946303094314, + "learning_rate": 1.5716130424664734e-05, + "loss": 0.8774, + "step": 8128 + }, + { + "epoch": 3.684950135992747, + "grad_norm": 0.17379555009734524, + "learning_rate": 1.5706072620953727e-05, + "loss": 0.8459, + "step": 8129 + }, + { + "epoch": 3.685403445149592, + "grad_norm": 0.1754817879349993, + "learning_rate": 1.5696017250350955e-05, + "loss": 0.8803, + "step": 8130 + }, + { + "epoch": 3.685856754306437, + "grad_norm": 0.16394554092100733, + "learning_rate": 1.5685964313863488e-05, + "loss": 0.8605, + "step": 8131 + }, + { + "epoch": 3.686310063463282, + "grad_norm": 0.1975297890509491, + "learning_rate": 1.5675913812498132e-05, + "loss": 0.8712, + "step": 8132 + }, + { + "epoch": 3.6867633726201268, + "grad_norm": 0.15465235926553625, + "learning_rate": 1.566586574726152e-05, + "loss": 0.8762, + "step": 8133 + }, + { + "epoch": 3.687216681776972, + "grad_norm": 0.20147965122420813, + "learning_rate": 1.5655820119159946e-05, + "loss": 0.8569, + "step": 8134 + }, + { + "epoch": 3.687669990933817, + "grad_norm": 0.16154942000132635, + "learning_rate": 1.564577692919955e-05, + "loss": 0.8567, + "step": 8135 + }, + { + "epoch": 3.6881233000906617, + "grad_norm": 0.20837077172151852, + "learning_rate": 1.5635736178386157e-05, + "loss": 0.8697, + "step": 8136 + }, + { + "epoch": 3.688576609247507, + "grad_norm": 0.20157546175050758, + "learning_rate": 1.5625697867725403e-05, + "loss": 0.907, + "step": 8137 + }, + { + "epoch": 3.689029918404352, + "grad_norm": 0.19739428258458377, + "learning_rate": 1.5615661998222633e-05, + "loss": 0.8638, + "step": 8138 + }, + { + "epoch": 3.6894832275611966, + "grad_norm": 0.17816689690543708, + "learning_rate": 1.5605628570882986e-05, + "loss": 0.8723, + "step": 8139 + }, + { + "epoch": 3.689936536718042, + "grad_norm": 0.19435095267992455, + "learning_rate": 1.5595597586711325e-05, + "loss": 0.8606, + "step": 8140 + }, + { + "epoch": 3.6903898458748867, + "grad_norm": 0.16210518667005377, + "learning_rate": 1.5585569046712312e-05, + "loss": 0.8571, + "step": 8141 + }, + { + "epoch": 3.6908431550317315, + "grad_norm": 0.1981508517993694, + "learning_rate": 1.557554295189028e-05, + "loss": 0.865, + "step": 8142 + }, + { + "epoch": 3.691296464188577, + "grad_norm": 0.19830541090030132, + "learning_rate": 1.5565519303249424e-05, + "loss": 0.8716, + "step": 8143 + }, + { + "epoch": 3.6917497733454216, + "grad_norm": 0.16653408111454074, + "learning_rate": 1.5555498101793606e-05, + "loss": 0.8793, + "step": 8144 + }, + { + "epoch": 3.6922030825022665, + "grad_norm": 0.19447782280429304, + "learning_rate": 1.5545479348526504e-05, + "loss": 0.8373, + "step": 8145 + }, + { + "epoch": 3.6926563916591117, + "grad_norm": 0.1785473996087168, + "learning_rate": 1.5535463044451523e-05, + "loss": 0.8718, + "step": 8146 + }, + { + "epoch": 3.6931097008159566, + "grad_norm": 0.19432310359152247, + "learning_rate": 1.55254491905718e-05, + "loss": 0.8449, + "step": 8147 + }, + { + "epoch": 3.6935630099728014, + "grad_norm": 0.17080365043850454, + "learning_rate": 1.5515437787890283e-05, + "loss": 0.8653, + "step": 8148 + }, + { + "epoch": 3.6940163191296467, + "grad_norm": 0.1989451706196327, + "learning_rate": 1.5505428837409615e-05, + "loss": 0.8605, + "step": 8149 + }, + { + "epoch": 3.6944696282864915, + "grad_norm": 0.15244865500271745, + "learning_rate": 1.5495422340132254e-05, + "loss": 0.8709, + "step": 8150 + }, + { + "epoch": 3.6949229374433363, + "grad_norm": 0.18044724334943663, + "learning_rate": 1.548541829706035e-05, + "loss": 0.871, + "step": 8151 + }, + { + "epoch": 3.695376246600181, + "grad_norm": 0.15055542774466846, + "learning_rate": 1.5475416709195867e-05, + "loss": 0.8578, + "step": 8152 + }, + { + "epoch": 3.6958295557570264, + "grad_norm": 0.16700109790619744, + "learning_rate": 1.5465417577540453e-05, + "loss": 0.873, + "step": 8153 + }, + { + "epoch": 3.6962828649138713, + "grad_norm": 0.1588288380832494, + "learning_rate": 1.5455420903095597e-05, + "loss": 0.8643, + "step": 8154 + }, + { + "epoch": 3.696736174070716, + "grad_norm": 0.18399319178207088, + "learning_rate": 1.5445426686862453e-05, + "loss": 0.8772, + "step": 8155 + }, + { + "epoch": 3.6971894832275614, + "grad_norm": 0.17028380011001673, + "learning_rate": 1.5435434929842013e-05, + "loss": 0.8486, + "step": 8156 + }, + { + "epoch": 3.697642792384406, + "grad_norm": 0.1487307226715019, + "learning_rate": 1.5425445633034955e-05, + "loss": 0.8545, + "step": 8157 + }, + { + "epoch": 3.698096101541251, + "grad_norm": 0.14702556532670075, + "learning_rate": 1.5415458797441723e-05, + "loss": 0.8734, + "step": 8158 + }, + { + "epoch": 3.698549410698096, + "grad_norm": 0.1564723961662751, + "learning_rate": 1.540547442406256e-05, + "loss": 0.8819, + "step": 8159 + }, + { + "epoch": 3.699002719854941, + "grad_norm": 0.16402239701629948, + "learning_rate": 1.5395492513897417e-05, + "loss": 0.8685, + "step": 8160 + }, + { + "epoch": 3.699456029011786, + "grad_norm": 0.15312989652434095, + "learning_rate": 1.538551306794601e-05, + "loss": 0.8639, + "step": 8161 + }, + { + "epoch": 3.6999093381686308, + "grad_norm": 0.19214845022228294, + "learning_rate": 1.5375536087207796e-05, + "loss": 0.8619, + "step": 8162 + }, + { + "epoch": 3.700362647325476, + "grad_norm": 0.17239881449215252, + "learning_rate": 1.5365561572682026e-05, + "loss": 0.8711, + "step": 8163 + }, + { + "epoch": 3.700815956482321, + "grad_norm": 0.18748130759471954, + "learning_rate": 1.535558952536765e-05, + "loss": 0.874, + "step": 8164 + }, + { + "epoch": 3.7012692656391657, + "grad_norm": 0.133597796527886, + "learning_rate": 1.534561994626343e-05, + "loss": 0.8701, + "step": 8165 + }, + { + "epoch": 3.701722574796011, + "grad_norm": 0.19401945609234691, + "learning_rate": 1.5335652836367808e-05, + "loss": 0.8796, + "step": 8166 + }, + { + "epoch": 3.702175883952856, + "grad_norm": 0.15708511769575037, + "learning_rate": 1.532568819667906e-05, + "loss": 0.8748, + "step": 8167 + }, + { + "epoch": 3.7026291931097006, + "grad_norm": 0.19553830697261976, + "learning_rate": 1.5315726028195143e-05, + "loss": 0.868, + "step": 8168 + }, + { + "epoch": 3.703082502266546, + "grad_norm": 0.14957212939354664, + "learning_rate": 1.530576633191382e-05, + "loss": 0.8569, + "step": 8169 + }, + { + "epoch": 3.7035358114233907, + "grad_norm": 0.1806317877809897, + "learning_rate": 1.529580910883256e-05, + "loss": 0.855, + "step": 8170 + }, + { + "epoch": 3.7039891205802356, + "grad_norm": 0.15004929486687488, + "learning_rate": 1.528585435994864e-05, + "loss": 0.8646, + "step": 8171 + }, + { + "epoch": 3.704442429737081, + "grad_norm": 0.14501312957665025, + "learning_rate": 1.527590208625904e-05, + "loss": 0.8629, + "step": 8172 + }, + { + "epoch": 3.7048957388939256, + "grad_norm": 0.1364160527194733, + "learning_rate": 1.526595228876049e-05, + "loss": 0.8766, + "step": 8173 + }, + { + "epoch": 3.7053490480507705, + "grad_norm": 0.1727786349428715, + "learning_rate": 1.5256004968449527e-05, + "loss": 0.8548, + "step": 8174 + }, + { + "epoch": 3.7058023572076157, + "grad_norm": 0.13334703764977643, + "learning_rate": 1.5246060126322376e-05, + "loss": 0.8759, + "step": 8175 + }, + { + "epoch": 3.7062556663644606, + "grad_norm": 0.1726669504753101, + "learning_rate": 1.5236117763375062e-05, + "loss": 0.8689, + "step": 8176 + }, + { + "epoch": 3.7067089755213054, + "grad_norm": 0.1435223588556626, + "learning_rate": 1.5226177880603325e-05, + "loss": 0.8624, + "step": 8177 + }, + { + "epoch": 3.7071622846781507, + "grad_norm": 0.17620933731677513, + "learning_rate": 1.5216240479002693e-05, + "loss": 0.8679, + "step": 8178 + }, + { + "epoch": 3.7076155938349955, + "grad_norm": 0.17328668832079966, + "learning_rate": 1.5206305559568417e-05, + "loss": 0.881, + "step": 8179 + }, + { + "epoch": 3.7080689029918403, + "grad_norm": 0.18882820636908332, + "learning_rate": 1.5196373123295502e-05, + "loss": 0.8655, + "step": 8180 + }, + { + "epoch": 3.7085222121486856, + "grad_norm": 0.15672247819451596, + "learning_rate": 1.5186443171178708e-05, + "loss": 0.8682, + "step": 8181 + }, + { + "epoch": 3.7089755213055304, + "grad_norm": 0.1753285993831113, + "learning_rate": 1.5176515704212564e-05, + "loss": 0.8483, + "step": 8182 + }, + { + "epoch": 3.7094288304623753, + "grad_norm": 0.1672310538812343, + "learning_rate": 1.5166590723391318e-05, + "loss": 0.8777, + "step": 8183 + }, + { + "epoch": 3.7098821396192205, + "grad_norm": 0.15503447060333803, + "learning_rate": 1.5156668229709009e-05, + "loss": 0.8733, + "step": 8184 + }, + { + "epoch": 3.7103354487760654, + "grad_norm": 0.14990499492018458, + "learning_rate": 1.5146748224159372e-05, + "loss": 0.8531, + "step": 8185 + }, + { + "epoch": 3.71078875793291, + "grad_norm": 0.15416481427735193, + "learning_rate": 1.513683070773596e-05, + "loss": 0.88, + "step": 8186 + }, + { + "epoch": 3.7112420670897555, + "grad_norm": 0.14790247557063368, + "learning_rate": 1.5126915681432021e-05, + "loss": 0.8693, + "step": 8187 + }, + { + "epoch": 3.7116953762466003, + "grad_norm": 0.18575251949849794, + "learning_rate": 1.5117003146240565e-05, + "loss": 0.8497, + "step": 8188 + }, + { + "epoch": 3.712148685403445, + "grad_norm": 0.20337901884791262, + "learning_rate": 1.510709310315439e-05, + "loss": 0.8698, + "step": 8189 + }, + { + "epoch": 3.7126019945602904, + "grad_norm": 0.1495197926890527, + "learning_rate": 1.5097185553165985e-05, + "loss": 0.8717, + "step": 8190 + }, + { + "epoch": 3.713055303717135, + "grad_norm": 0.18138927972483468, + "learning_rate": 1.5087280497267647e-05, + "loss": 0.8599, + "step": 8191 + }, + { + "epoch": 3.71350861287398, + "grad_norm": 0.16004324899693176, + "learning_rate": 1.5077377936451369e-05, + "loss": 0.8503, + "step": 8192 + }, + { + "epoch": 3.713961922030825, + "grad_norm": 0.1662046373294843, + "learning_rate": 1.5067477871708951e-05, + "loss": 0.868, + "step": 8193 + }, + { + "epoch": 3.71441523118767, + "grad_norm": 0.1433015209017861, + "learning_rate": 1.5057580304031887e-05, + "loss": 0.8668, + "step": 8194 + }, + { + "epoch": 3.714868540344515, + "grad_norm": 0.16758991791939942, + "learning_rate": 1.5047685234411478e-05, + "loss": 0.8665, + "step": 8195 + }, + { + "epoch": 3.71532184950136, + "grad_norm": 0.17817894301999912, + "learning_rate": 1.5037792663838705e-05, + "loss": 0.8572, + "step": 8196 + }, + { + "epoch": 3.7157751586582046, + "grad_norm": 0.16218468756169577, + "learning_rate": 1.5027902593304378e-05, + "loss": 0.8675, + "step": 8197 + }, + { + "epoch": 3.71622846781505, + "grad_norm": 0.23138678140317648, + "learning_rate": 1.5018015023798995e-05, + "loss": 0.8768, + "step": 8198 + }, + { + "epoch": 3.7166817769718947, + "grad_norm": 0.16614099835201357, + "learning_rate": 1.500812995631283e-05, + "loss": 0.8696, + "step": 8199 + }, + { + "epoch": 3.7171350861287396, + "grad_norm": 0.17344422687731947, + "learning_rate": 1.4998247391835885e-05, + "loss": 0.8743, + "step": 8200 + }, + { + "epoch": 3.717588395285585, + "grad_norm": 0.15319216510752307, + "learning_rate": 1.4988367331357956e-05, + "loss": 0.8599, + "step": 8201 + }, + { + "epoch": 3.7180417044424297, + "grad_norm": 0.16702159230105237, + "learning_rate": 1.497848977586855e-05, + "loss": 0.87, + "step": 8202 + }, + { + "epoch": 3.7184950135992745, + "grad_norm": 0.16826525072291418, + "learning_rate": 1.4968614726356911e-05, + "loss": 0.8359, + "step": 8203 + }, + { + "epoch": 3.7189483227561198, + "grad_norm": 0.16070535388527077, + "learning_rate": 1.495874218381209e-05, + "loss": 0.8487, + "step": 8204 + }, + { + "epoch": 3.7194016319129646, + "grad_norm": 0.21435126927653436, + "learning_rate": 1.4948872149222818e-05, + "loss": 0.8668, + "step": 8205 + }, + { + "epoch": 3.7198549410698094, + "grad_norm": 0.15526739634986855, + "learning_rate": 1.4939004623577643e-05, + "loss": 0.8792, + "step": 8206 + }, + { + "epoch": 3.7203082502266547, + "grad_norm": 0.1953645939077843, + "learning_rate": 1.4929139607864786e-05, + "loss": 0.8606, + "step": 8207 + }, + { + "epoch": 3.7207615593834995, + "grad_norm": 0.1885940635552877, + "learning_rate": 1.49192771030723e-05, + "loss": 0.8769, + "step": 8208 + }, + { + "epoch": 3.7212148685403443, + "grad_norm": 0.1814993843429299, + "learning_rate": 1.4909417110187905e-05, + "loss": 0.8776, + "step": 8209 + }, + { + "epoch": 3.7216681776971896, + "grad_norm": 0.47749769182964347, + "learning_rate": 1.4899559630199143e-05, + "loss": 0.8661, + "step": 8210 + }, + { + "epoch": 3.7221214868540344, + "grad_norm": 0.17748854864384392, + "learning_rate": 1.4889704664093238e-05, + "loss": 0.8458, + "step": 8211 + }, + { + "epoch": 3.7225747960108793, + "grad_norm": 0.1509969558517047, + "learning_rate": 1.487985221285722e-05, + "loss": 0.8717, + "step": 8212 + }, + { + "epoch": 3.7230281051677245, + "grad_norm": 0.19088626176903073, + "learning_rate": 1.4870002277477835e-05, + "loss": 0.8776, + "step": 8213 + }, + { + "epoch": 3.7234814143245694, + "grad_norm": 0.16155078053788818, + "learning_rate": 1.4860154858941558e-05, + "loss": 0.8523, + "step": 8214 + }, + { + "epoch": 3.723934723481414, + "grad_norm": 0.16122563524841602, + "learning_rate": 1.4850309958234674e-05, + "loss": 0.8639, + "step": 8215 + }, + { + "epoch": 3.7243880326382595, + "grad_norm": 0.20732300932193629, + "learning_rate": 1.4840467576343146e-05, + "loss": 0.8635, + "step": 8216 + }, + { + "epoch": 3.7248413417951043, + "grad_norm": 0.14587465207615657, + "learning_rate": 1.4830627714252747e-05, + "loss": 0.8651, + "step": 8217 + }, + { + "epoch": 3.725294650951949, + "grad_norm": 0.21321508795244012, + "learning_rate": 1.4820790372948954e-05, + "loss": 0.8524, + "step": 8218 + }, + { + "epoch": 3.7257479601087944, + "grad_norm": 0.18635509989587787, + "learning_rate": 1.4810955553417006e-05, + "loss": 0.8468, + "step": 8219 + }, + { + "epoch": 3.726201269265639, + "grad_norm": 0.16200188033489787, + "learning_rate": 1.4801123256641873e-05, + "loss": 0.8575, + "step": 8220 + }, + { + "epoch": 3.726654578422484, + "grad_norm": 0.2280085063729398, + "learning_rate": 1.4791293483608317e-05, + "loss": 0.8813, + "step": 8221 + }, + { + "epoch": 3.7271078875793293, + "grad_norm": 0.198441689501382, + "learning_rate": 1.4781466235300794e-05, + "loss": 0.8861, + "step": 8222 + }, + { + "epoch": 3.727561196736174, + "grad_norm": 0.17700131189808585, + "learning_rate": 1.477164151270356e-05, + "loss": 0.873, + "step": 8223 + }, + { + "epoch": 3.728014505893019, + "grad_norm": 0.1933865515356514, + "learning_rate": 1.4761819316800555e-05, + "loss": 0.8685, + "step": 8224 + }, + { + "epoch": 3.7284678150498642, + "grad_norm": 0.17339909681191423, + "learning_rate": 1.4751999648575534e-05, + "loss": 0.8554, + "step": 8225 + }, + { + "epoch": 3.728921124206709, + "grad_norm": 0.16229587219067299, + "learning_rate": 1.4742182509011937e-05, + "loss": 0.8477, + "step": 8226 + }, + { + "epoch": 3.729374433363554, + "grad_norm": 0.17971616588781578, + "learning_rate": 1.4732367899093008e-05, + "loss": 0.8889, + "step": 8227 + }, + { + "epoch": 3.729827742520399, + "grad_norm": 0.1448586020490163, + "learning_rate": 1.4722555819801692e-05, + "loss": 0.8699, + "step": 8228 + }, + { + "epoch": 3.730281051677244, + "grad_norm": 0.19154647377976333, + "learning_rate": 1.4712746272120688e-05, + "loss": 0.8581, + "step": 8229 + }, + { + "epoch": 3.730734360834089, + "grad_norm": 0.14134163685978207, + "learning_rate": 1.4702939257032474e-05, + "loss": 0.8637, + "step": 8230 + }, + { + "epoch": 3.7311876699909337, + "grad_norm": 0.19745338705004073, + "learning_rate": 1.469313477551923e-05, + "loss": 0.8752, + "step": 8231 + }, + { + "epoch": 3.731640979147779, + "grad_norm": 0.1500157711629783, + "learning_rate": 1.4683332828562926e-05, + "loss": 0.8843, + "step": 8232 + }, + { + "epoch": 3.7320942883046238, + "grad_norm": 0.16708924021533983, + "learning_rate": 1.4673533417145227e-05, + "loss": 0.8603, + "step": 8233 + }, + { + "epoch": 3.7325475974614686, + "grad_norm": 0.1426248358680628, + "learning_rate": 1.4663736542247606e-05, + "loss": 0.8802, + "step": 8234 + }, + { + "epoch": 3.733000906618314, + "grad_norm": 0.21119021985015837, + "learning_rate": 1.4653942204851217e-05, + "loss": 0.8656, + "step": 8235 + }, + { + "epoch": 3.7334542157751587, + "grad_norm": 0.1417592230626061, + "learning_rate": 1.464415040593702e-05, + "loss": 0.8816, + "step": 8236 + }, + { + "epoch": 3.7339075249320035, + "grad_norm": 0.20483886792694933, + "learning_rate": 1.4634361146485683e-05, + "loss": 0.8776, + "step": 8237 + }, + { + "epoch": 3.7343608340888483, + "grad_norm": 0.1359471450383502, + "learning_rate": 1.4624574427477623e-05, + "loss": 0.8748, + "step": 8238 + }, + { + "epoch": 3.7348141432456936, + "grad_norm": 0.19324757881981816, + "learning_rate": 1.4614790249892999e-05, + "loss": 0.848, + "step": 8239 + }, + { + "epoch": 3.7352674524025384, + "grad_norm": 0.15176515662909792, + "learning_rate": 1.4605008614711746e-05, + "loss": 0.8832, + "step": 8240 + }, + { + "epoch": 3.7357207615593833, + "grad_norm": 0.16566108699500245, + "learning_rate": 1.4595229522913506e-05, + "loss": 0.8629, + "step": 8241 + }, + { + "epoch": 3.7361740707162285, + "grad_norm": 0.15889645643644199, + "learning_rate": 1.4585452975477706e-05, + "loss": 0.8684, + "step": 8242 + }, + { + "epoch": 3.7366273798730734, + "grad_norm": 0.15518768631776472, + "learning_rate": 1.4575678973383478e-05, + "loss": 0.8653, + "step": 8243 + }, + { + "epoch": 3.737080689029918, + "grad_norm": 0.15430280538506766, + "learning_rate": 1.4565907517609713e-05, + "loss": 0.8796, + "step": 8244 + }, + { + "epoch": 3.7375339981867635, + "grad_norm": 0.1636681822501628, + "learning_rate": 1.4556138609135069e-05, + "loss": 0.8677, + "step": 8245 + }, + { + "epoch": 3.7379873073436083, + "grad_norm": 0.16852847927473705, + "learning_rate": 1.4546372248937911e-05, + "loss": 0.8832, + "step": 8246 + }, + { + "epoch": 3.738440616500453, + "grad_norm": 0.16649590406322984, + "learning_rate": 1.453660843799639e-05, + "loss": 0.8666, + "step": 8247 + }, + { + "epoch": 3.7388939256572984, + "grad_norm": 0.19187188252303153, + "learning_rate": 1.4526847177288356e-05, + "loss": 0.8694, + "step": 8248 + }, + { + "epoch": 3.739347234814143, + "grad_norm": 0.1729471040297731, + "learning_rate": 1.4517088467791461e-05, + "loss": 0.8686, + "step": 8249 + }, + { + "epoch": 3.739800543970988, + "grad_norm": 0.17700228944334487, + "learning_rate": 1.4507332310483029e-05, + "loss": 0.8715, + "step": 8250 + }, + { + "epoch": 3.7402538531278333, + "grad_norm": 0.16623700905009015, + "learning_rate": 1.44975787063402e-05, + "loss": 0.8669, + "step": 8251 + }, + { + "epoch": 3.740707162284678, + "grad_norm": 0.17573568741684095, + "learning_rate": 1.4487827656339804e-05, + "loss": 0.8609, + "step": 8252 + }, + { + "epoch": 3.741160471441523, + "grad_norm": 0.17166054090208105, + "learning_rate": 1.447807916145846e-05, + "loss": 0.8734, + "step": 8253 + }, + { + "epoch": 3.7416137805983682, + "grad_norm": 0.15151405943510338, + "learning_rate": 1.4468333222672497e-05, + "loss": 0.861, + "step": 8254 + }, + { + "epoch": 3.742067089755213, + "grad_norm": 0.18909341870532967, + "learning_rate": 1.4458589840957982e-05, + "loss": 0.856, + "step": 8255 + }, + { + "epoch": 3.742520398912058, + "grad_norm": 0.19205586275481024, + "learning_rate": 1.4448849017290774e-05, + "loss": 0.867, + "step": 8256 + }, + { + "epoch": 3.742973708068903, + "grad_norm": 0.16306571691730315, + "learning_rate": 1.443911075264643e-05, + "loss": 0.8986, + "step": 8257 + }, + { + "epoch": 3.743427017225748, + "grad_norm": 0.17476678996414283, + "learning_rate": 1.4429375048000252e-05, + "loss": 0.9017, + "step": 8258 + }, + { + "epoch": 3.743880326382593, + "grad_norm": 0.1652086507976759, + "learning_rate": 1.4419641904327329e-05, + "loss": 0.8653, + "step": 8259 + }, + { + "epoch": 3.744333635539438, + "grad_norm": 0.1771980889027628, + "learning_rate": 1.4409911322602446e-05, + "loss": 0.8651, + "step": 8260 + }, + { + "epoch": 3.744786944696283, + "grad_norm": 0.19732753642197637, + "learning_rate": 1.4400183303800139e-05, + "loss": 0.8764, + "step": 8261 + }, + { + "epoch": 3.7452402538531278, + "grad_norm": 0.13696544915252487, + "learning_rate": 1.4390457848894724e-05, + "loss": 0.852, + "step": 8262 + }, + { + "epoch": 3.745693563009973, + "grad_norm": 0.1782108665576462, + "learning_rate": 1.438073495886021e-05, + "loss": 0.85, + "step": 8263 + }, + { + "epoch": 3.746146872166818, + "grad_norm": 0.16367230756728773, + "learning_rate": 1.4371014634670393e-05, + "loss": 0.8607, + "step": 8264 + }, + { + "epoch": 3.7466001813236627, + "grad_norm": 0.18873607296021952, + "learning_rate": 1.436129687729877e-05, + "loss": 0.8786, + "step": 8265 + }, + { + "epoch": 3.747053490480508, + "grad_norm": 0.14872256155987865, + "learning_rate": 1.4351581687718631e-05, + "loss": 0.8776, + "step": 8266 + }, + { + "epoch": 3.747506799637353, + "grad_norm": 0.2023205911680118, + "learning_rate": 1.4341869066902949e-05, + "loss": 0.8701, + "step": 8267 + }, + { + "epoch": 3.7479601087941976, + "grad_norm": 0.14796383308314104, + "learning_rate": 1.4332159015824503e-05, + "loss": 0.8757, + "step": 8268 + }, + { + "epoch": 3.748413417951043, + "grad_norm": 0.20325264018698197, + "learning_rate": 1.4322451535455769e-05, + "loss": 0.874, + "step": 8269 + }, + { + "epoch": 3.7488667271078877, + "grad_norm": 0.16121402447515207, + "learning_rate": 1.4312746626768963e-05, + "loss": 0.8771, + "step": 8270 + }, + { + "epoch": 3.7493200362647325, + "grad_norm": 0.21926262773846666, + "learning_rate": 1.4303044290736092e-05, + "loss": 0.891, + "step": 8271 + }, + { + "epoch": 3.7497733454215774, + "grad_norm": 0.17750537650792728, + "learning_rate": 1.429334452832884e-05, + "loss": 0.8561, + "step": 8272 + }, + { + "epoch": 3.7502266545784226, + "grad_norm": 0.16703089801807444, + "learning_rate": 1.4283647340518702e-05, + "loss": 0.8714, + "step": 8273 + }, + { + "epoch": 3.7506799637352675, + "grad_norm": 0.2096679364443424, + "learning_rate": 1.4273952728276844e-05, + "loss": 0.8351, + "step": 8274 + }, + { + "epoch": 3.7511332728921123, + "grad_norm": 0.14382317294192784, + "learning_rate": 1.4264260692574259e-05, + "loss": 0.8756, + "step": 8275 + }, + { + "epoch": 3.751586582048957, + "grad_norm": 0.18377149655085803, + "learning_rate": 1.4254571234381568e-05, + "loss": 0.8804, + "step": 8276 + }, + { + "epoch": 3.7520398912058024, + "grad_norm": 0.15476367778621056, + "learning_rate": 1.424488435466925e-05, + "loss": 0.8505, + "step": 8277 + }, + { + "epoch": 3.752493200362647, + "grad_norm": 0.15450867483209485, + "learning_rate": 1.4235200054407442e-05, + "loss": 0.8498, + "step": 8278 + }, + { + "epoch": 3.752946509519492, + "grad_norm": 0.156744726167166, + "learning_rate": 1.4225518334566087e-05, + "loss": 0.851, + "step": 8279 + }, + { + "epoch": 3.7533998186763373, + "grad_norm": 0.14472125625126103, + "learning_rate": 1.4215839196114804e-05, + "loss": 0.8501, + "step": 8280 + }, + { + "epoch": 3.753853127833182, + "grad_norm": 0.16895769865759544, + "learning_rate": 1.4206162640023019e-05, + "loss": 0.8548, + "step": 8281 + }, + { + "epoch": 3.754306436990027, + "grad_norm": 0.14310657609782787, + "learning_rate": 1.4196488667259836e-05, + "loss": 0.8739, + "step": 8282 + }, + { + "epoch": 3.7547597461468722, + "grad_norm": 0.1690246777278434, + "learning_rate": 1.4186817278794163e-05, + "loss": 0.8708, + "step": 8283 + }, + { + "epoch": 3.755213055303717, + "grad_norm": 0.15859105978665014, + "learning_rate": 1.4177148475594588e-05, + "loss": 0.8543, + "step": 8284 + }, + { + "epoch": 3.755666364460562, + "grad_norm": 0.14795560337470548, + "learning_rate": 1.4167482258629503e-05, + "loss": 0.8674, + "step": 8285 + }, + { + "epoch": 3.756119673617407, + "grad_norm": 0.15200839895271667, + "learning_rate": 1.4157818628866987e-05, + "loss": 0.8633, + "step": 8286 + }, + { + "epoch": 3.756572982774252, + "grad_norm": 0.2046986019481979, + "learning_rate": 1.4148157587274871e-05, + "loss": 0.8617, + "step": 8287 + }, + { + "epoch": 3.757026291931097, + "grad_norm": 0.13649359127067434, + "learning_rate": 1.4138499134820763e-05, + "loss": 0.869, + "step": 8288 + }, + { + "epoch": 3.757479601087942, + "grad_norm": 0.15636948910415663, + "learning_rate": 1.4128843272471961e-05, + "loss": 0.8752, + "step": 8289 + }, + { + "epoch": 3.757932910244787, + "grad_norm": 0.17545326587738996, + "learning_rate": 1.411919000119555e-05, + "loss": 0.8611, + "step": 8290 + }, + { + "epoch": 3.7583862194016318, + "grad_norm": 0.13373128027252218, + "learning_rate": 1.410953932195831e-05, + "loss": 0.8805, + "step": 8291 + }, + { + "epoch": 3.758839528558477, + "grad_norm": 0.19943890163606115, + "learning_rate": 1.409989123572681e-05, + "loss": 0.8833, + "step": 8292 + }, + { + "epoch": 3.759292837715322, + "grad_norm": 0.15443826628627402, + "learning_rate": 1.4090245743467312e-05, + "loss": 0.8603, + "step": 8293 + }, + { + "epoch": 3.7597461468721667, + "grad_norm": 0.19453835880776926, + "learning_rate": 1.4080602846145874e-05, + "loss": 0.8602, + "step": 8294 + }, + { + "epoch": 3.760199456029012, + "grad_norm": 0.18066424129669661, + "learning_rate": 1.4070962544728209e-05, + "loss": 0.851, + "step": 8295 + }, + { + "epoch": 3.760652765185857, + "grad_norm": 0.15845578196487267, + "learning_rate": 1.4061324840179862e-05, + "loss": 0.851, + "step": 8296 + }, + { + "epoch": 3.7611060743427016, + "grad_norm": 0.1579941604671543, + "learning_rate": 1.4051689733466054e-05, + "loss": 0.8642, + "step": 8297 + }, + { + "epoch": 3.761559383499547, + "grad_norm": 0.13654469546799458, + "learning_rate": 1.404205722555179e-05, + "loss": 0.8544, + "step": 8298 + }, + { + "epoch": 3.7620126926563917, + "grad_norm": 0.19452985310127874, + "learning_rate": 1.4032427317401775e-05, + "loss": 0.8932, + "step": 8299 + }, + { + "epoch": 3.7624660018132365, + "grad_norm": 0.14643552816544483, + "learning_rate": 1.4022800009980495e-05, + "loss": 0.8796, + "step": 8300 + }, + { + "epoch": 3.762919310970082, + "grad_norm": 0.2316827895033946, + "learning_rate": 1.4013175304252134e-05, + "loss": 0.8739, + "step": 8301 + }, + { + "epoch": 3.7633726201269266, + "grad_norm": 0.18704442076109135, + "learning_rate": 1.400355320118063e-05, + "loss": 0.8709, + "step": 8302 + }, + { + "epoch": 3.7638259292837715, + "grad_norm": 0.19907020196433298, + "learning_rate": 1.3993933701729687e-05, + "loss": 0.846, + "step": 8303 + }, + { + "epoch": 3.7642792384406167, + "grad_norm": 0.19552830480156078, + "learning_rate": 1.3984316806862706e-05, + "loss": 0.8832, + "step": 8304 + }, + { + "epoch": 3.7647325475974616, + "grad_norm": 0.17970678827632833, + "learning_rate": 1.3974702517542862e-05, + "loss": 0.881, + "step": 8305 + }, + { + "epoch": 3.7651858567543064, + "grad_norm": 0.19463259928283091, + "learning_rate": 1.396509083473304e-05, + "loss": 0.8695, + "step": 8306 + }, + { + "epoch": 3.7656391659111517, + "grad_norm": 0.17300586238604768, + "learning_rate": 1.3955481759395895e-05, + "loss": 0.8654, + "step": 8307 + }, + { + "epoch": 3.7660924750679965, + "grad_norm": 0.1809293418136271, + "learning_rate": 1.3945875292493782e-05, + "loss": 0.8765, + "step": 8308 + }, + { + "epoch": 3.7665457842248413, + "grad_norm": 0.1620891942997032, + "learning_rate": 1.3936271434988849e-05, + "loss": 0.8721, + "step": 8309 + }, + { + "epoch": 3.766999093381686, + "grad_norm": 0.15862972847377485, + "learning_rate": 1.3926670187842915e-05, + "loss": 0.8729, + "step": 8310 + }, + { + "epoch": 3.7674524025385314, + "grad_norm": 0.18235352662387655, + "learning_rate": 1.3917071552017597e-05, + "loss": 0.8695, + "step": 8311 + }, + { + "epoch": 3.7679057116953762, + "grad_norm": 0.20237389120051782, + "learning_rate": 1.3907475528474229e-05, + "loss": 0.8483, + "step": 8312 + }, + { + "epoch": 3.768359020852221, + "grad_norm": 0.1577910604967603, + "learning_rate": 1.3897882118173853e-05, + "loss": 0.872, + "step": 8313 + }, + { + "epoch": 3.7688123300090663, + "grad_norm": 0.15544590735527447, + "learning_rate": 1.388829132207731e-05, + "loss": 0.8728, + "step": 8314 + }, + { + "epoch": 3.769265639165911, + "grad_norm": 0.1624852269789361, + "learning_rate": 1.3878703141145132e-05, + "loss": 0.8738, + "step": 8315 + }, + { + "epoch": 3.769718948322756, + "grad_norm": 0.1681644196348099, + "learning_rate": 1.3869117576337603e-05, + "loss": 0.8879, + "step": 8316 + }, + { + "epoch": 3.770172257479601, + "grad_norm": 0.1609101687003592, + "learning_rate": 1.3859534628614734e-05, + "loss": 0.887, + "step": 8317 + }, + { + "epoch": 3.770625566636446, + "grad_norm": 0.16086914936862504, + "learning_rate": 1.384995429893631e-05, + "loss": 0.8371, + "step": 8318 + }, + { + "epoch": 3.771078875793291, + "grad_norm": 0.16205052617992144, + "learning_rate": 1.3840376588261802e-05, + "loss": 0.8825, + "step": 8319 + }, + { + "epoch": 3.7715321849501358, + "grad_norm": 0.19218673737643302, + "learning_rate": 1.3830801497550473e-05, + "loss": 0.87, + "step": 8320 + }, + { + "epoch": 3.771985494106981, + "grad_norm": 0.17403738388313186, + "learning_rate": 1.3821229027761271e-05, + "loss": 0.866, + "step": 8321 + }, + { + "epoch": 3.772438803263826, + "grad_norm": 0.16828445611067652, + "learning_rate": 1.3811659179852935e-05, + "loss": 0.8581, + "step": 8322 + }, + { + "epoch": 3.7728921124206707, + "grad_norm": 0.1642792008281721, + "learning_rate": 1.3802091954783885e-05, + "loss": 0.8698, + "step": 8323 + }, + { + "epoch": 3.773345421577516, + "grad_norm": 0.1664157315387949, + "learning_rate": 1.3792527353512336e-05, + "loss": 0.8788, + "step": 8324 + }, + { + "epoch": 3.773798730734361, + "grad_norm": 0.14638501279193883, + "learning_rate": 1.3782965376996177e-05, + "loss": 0.8474, + "step": 8325 + }, + { + "epoch": 3.7742520398912056, + "grad_norm": 0.17821651987174517, + "learning_rate": 1.3773406026193099e-05, + "loss": 0.8703, + "step": 8326 + }, + { + "epoch": 3.774705349048051, + "grad_norm": 0.16668880778942421, + "learning_rate": 1.3763849302060486e-05, + "loss": 0.861, + "step": 8327 + }, + { + "epoch": 3.7751586582048957, + "grad_norm": 0.18714635693400142, + "learning_rate": 1.375429520555546e-05, + "loss": 0.8861, + "step": 8328 + }, + { + "epoch": 3.7756119673617405, + "grad_norm": 0.2538905901201943, + "learning_rate": 1.374474373763492e-05, + "loss": 0.8736, + "step": 8329 + }, + { + "epoch": 3.776065276518586, + "grad_norm": 0.17532952597128115, + "learning_rate": 1.3735194899255438e-05, + "loss": 0.8585, + "step": 8330 + }, + { + "epoch": 3.7765185856754306, + "grad_norm": 0.22010768433997027, + "learning_rate": 1.3725648691373397e-05, + "loss": 0.8677, + "step": 8331 + }, + { + "epoch": 3.7769718948322755, + "grad_norm": 0.23213185665142072, + "learning_rate": 1.3716105114944842e-05, + "loss": 0.859, + "step": 8332 + }, + { + "epoch": 3.7774252039891207, + "grad_norm": 0.12876286443812845, + "learning_rate": 1.3706564170925631e-05, + "loss": 0.8474, + "step": 8333 + }, + { + "epoch": 3.7778785131459656, + "grad_norm": 0.218004795298325, + "learning_rate": 1.3697025860271272e-05, + "loss": 0.8579, + "step": 8334 + }, + { + "epoch": 3.7783318223028104, + "grad_norm": 0.16450310471319518, + "learning_rate": 1.3687490183937087e-05, + "loss": 0.8778, + "step": 8335 + }, + { + "epoch": 3.7787851314596557, + "grad_norm": 0.18596375654202427, + "learning_rate": 1.3677957142878077e-05, + "loss": 0.8766, + "step": 8336 + }, + { + "epoch": 3.7792384406165005, + "grad_norm": 0.16920179132073182, + "learning_rate": 1.3668426738049032e-05, + "loss": 0.8594, + "step": 8337 + }, + { + "epoch": 3.7796917497733453, + "grad_norm": 0.2123769140435139, + "learning_rate": 1.3658898970404421e-05, + "loss": 0.8596, + "step": 8338 + }, + { + "epoch": 3.7801450589301906, + "grad_norm": 0.21523849199924477, + "learning_rate": 1.364937384089851e-05, + "loss": 0.8811, + "step": 8339 + }, + { + "epoch": 3.7805983680870354, + "grad_norm": 0.24856809878877328, + "learning_rate": 1.3639851350485236e-05, + "loss": 0.8798, + "step": 8340 + }, + { + "epoch": 3.7810516772438802, + "grad_norm": 0.19661601337417922, + "learning_rate": 1.3630331500118334e-05, + "loss": 0.861, + "step": 8341 + }, + { + "epoch": 3.7815049864007255, + "grad_norm": 0.2204243222261221, + "learning_rate": 1.3620814290751234e-05, + "loss": 0.8593, + "step": 8342 + }, + { + "epoch": 3.7819582955575703, + "grad_norm": 0.17062844848184788, + "learning_rate": 1.3611299723337097e-05, + "loss": 0.8568, + "step": 8343 + }, + { + "epoch": 3.782411604714415, + "grad_norm": 0.1804731817025905, + "learning_rate": 1.3601787798828858e-05, + "loss": 0.8678, + "step": 8344 + }, + { + "epoch": 3.7828649138712604, + "grad_norm": 0.19867466016650293, + "learning_rate": 1.3592278518179138e-05, + "loss": 0.8662, + "step": 8345 + }, + { + "epoch": 3.7833182230281053, + "grad_norm": 0.1637325231347063, + "learning_rate": 1.3582771882340354e-05, + "loss": 0.8759, + "step": 8346 + }, + { + "epoch": 3.78377153218495, + "grad_norm": 0.21693828924116523, + "learning_rate": 1.357326789226459e-05, + "loss": 0.8696, + "step": 8347 + }, + { + "epoch": 3.7842248413417954, + "grad_norm": 0.15405316505299158, + "learning_rate": 1.3563766548903722e-05, + "loss": 0.8607, + "step": 8348 + }, + { + "epoch": 3.78467815049864, + "grad_norm": 0.22033050274913982, + "learning_rate": 1.355426785320932e-05, + "loss": 0.8751, + "step": 8349 + }, + { + "epoch": 3.785131459655485, + "grad_norm": 0.16688111761974814, + "learning_rate": 1.354477180613273e-05, + "loss": 0.857, + "step": 8350 + }, + { + "epoch": 3.78558476881233, + "grad_norm": 0.20018589705891468, + "learning_rate": 1.3535278408624976e-05, + "loss": 0.8675, + "step": 8351 + }, + { + "epoch": 3.786038077969175, + "grad_norm": 0.20475536423354568, + "learning_rate": 1.3525787661636898e-05, + "loss": 0.8716, + "step": 8352 + }, + { + "epoch": 3.78649138712602, + "grad_norm": 0.19513548749132514, + "learning_rate": 1.3516299566118965e-05, + "loss": 0.8626, + "step": 8353 + }, + { + "epoch": 3.786944696282865, + "grad_norm": 0.15150195751141057, + "learning_rate": 1.3506814123021474e-05, + "loss": 0.8981, + "step": 8354 + }, + { + "epoch": 3.7873980054397096, + "grad_norm": 0.21741799602015913, + "learning_rate": 1.3497331333294402e-05, + "loss": 0.8779, + "step": 8355 + }, + { + "epoch": 3.787851314596555, + "grad_norm": 0.15275789816430904, + "learning_rate": 1.34878511978875e-05, + "loss": 0.8666, + "step": 8356 + }, + { + "epoch": 3.7883046237533997, + "grad_norm": 0.2213903402496919, + "learning_rate": 1.3478373717750218e-05, + "loss": 0.861, + "step": 8357 + }, + { + "epoch": 3.7887579329102445, + "grad_norm": 0.16064048110076165, + "learning_rate": 1.3468898893831735e-05, + "loss": 0.8802, + "step": 8358 + }, + { + "epoch": 3.78921124206709, + "grad_norm": 0.17734577493436499, + "learning_rate": 1.3459426727081022e-05, + "loss": 0.8568, + "step": 8359 + }, + { + "epoch": 3.7896645512239346, + "grad_norm": 0.18172427281590245, + "learning_rate": 1.3449957218446703e-05, + "loss": 0.8515, + "step": 8360 + }, + { + "epoch": 3.7901178603807795, + "grad_norm": 0.17116698860655613, + "learning_rate": 1.344049036887722e-05, + "loss": 0.869, + "step": 8361 + }, + { + "epoch": 3.7905711695376247, + "grad_norm": 0.1970977587226363, + "learning_rate": 1.3431026179320661e-05, + "loss": 0.8775, + "step": 8362 + }, + { + "epoch": 3.7910244786944696, + "grad_norm": 0.18011794253749583, + "learning_rate": 1.3421564650724932e-05, + "loss": 0.8678, + "step": 8363 + }, + { + "epoch": 3.7914777878513144, + "grad_norm": 0.16151074727382977, + "learning_rate": 1.3412105784037603e-05, + "loss": 0.8577, + "step": 8364 + }, + { + "epoch": 3.7919310970081597, + "grad_norm": 0.16881761888289987, + "learning_rate": 1.3402649580206037e-05, + "loss": 0.8588, + "step": 8365 + }, + { + "epoch": 3.7923844061650045, + "grad_norm": 0.15322830139922486, + "learning_rate": 1.3393196040177263e-05, + "loss": 0.864, + "step": 8366 + }, + { + "epoch": 3.7928377153218493, + "grad_norm": 0.1544647696829164, + "learning_rate": 1.338374516489812e-05, + "loss": 0.8507, + "step": 8367 + }, + { + "epoch": 3.7932910244786946, + "grad_norm": 0.15781444127023234, + "learning_rate": 1.3374296955315123e-05, + "loss": 0.8757, + "step": 8368 + }, + { + "epoch": 3.7937443336355394, + "grad_norm": 0.20555995587995746, + "learning_rate": 1.3364851412374522e-05, + "loss": 0.8641, + "step": 8369 + }, + { + "epoch": 3.7941976427923843, + "grad_norm": 0.14505751728324723, + "learning_rate": 1.3355408537022342e-05, + "loss": 0.8646, + "step": 8370 + }, + { + "epoch": 3.7946509519492295, + "grad_norm": 0.14773420555273392, + "learning_rate": 1.3345968330204308e-05, + "loss": 0.8615, + "step": 8371 + }, + { + "epoch": 3.7951042611060744, + "grad_norm": 0.14276812579973686, + "learning_rate": 1.333653079286588e-05, + "loss": 0.8635, + "step": 8372 + }, + { + "epoch": 3.795557570262919, + "grad_norm": 0.18320058864858763, + "learning_rate": 1.332709592595224e-05, + "loss": 0.8621, + "step": 8373 + }, + { + "epoch": 3.7960108794197644, + "grad_norm": 0.150397505091882, + "learning_rate": 1.3317663730408349e-05, + "loss": 0.8541, + "step": 8374 + }, + { + "epoch": 3.7964641885766093, + "grad_norm": 0.20013544086034435, + "learning_rate": 1.3308234207178838e-05, + "loss": 0.8668, + "step": 8375 + }, + { + "epoch": 3.796917497733454, + "grad_norm": 0.17775328901442364, + "learning_rate": 1.3298807357208134e-05, + "loss": 0.8715, + "step": 8376 + }, + { + "epoch": 3.7973708068902994, + "grad_norm": 0.14704009067079046, + "learning_rate": 1.328938318144033e-05, + "loss": 0.8646, + "step": 8377 + }, + { + "epoch": 3.797824116047144, + "grad_norm": 0.16942291562988712, + "learning_rate": 1.3279961680819313e-05, + "loss": 0.8561, + "step": 8378 + }, + { + "epoch": 3.798277425203989, + "grad_norm": 0.16297248982094464, + "learning_rate": 1.3270542856288655e-05, + "loss": 0.8729, + "step": 8379 + }, + { + "epoch": 3.7987307343608343, + "grad_norm": 0.1684345591008798, + "learning_rate": 1.3261126708791694e-05, + "loss": 0.8704, + "step": 8380 + }, + { + "epoch": 3.799184043517679, + "grad_norm": 0.19162327727940093, + "learning_rate": 1.3251713239271467e-05, + "loss": 0.8784, + "step": 8381 + }, + { + "epoch": 3.799637352674524, + "grad_norm": 0.14561631675319922, + "learning_rate": 1.3242302448670783e-05, + "loss": 0.8537, + "step": 8382 + }, + { + "epoch": 3.8000906618313692, + "grad_norm": 0.19005990441076406, + "learning_rate": 1.3232894337932148e-05, + "loss": 0.8712, + "step": 8383 + }, + { + "epoch": 3.800543970988214, + "grad_norm": 0.18299265153449598, + "learning_rate": 1.3223488907997797e-05, + "loss": 0.8612, + "step": 8384 + }, + { + "epoch": 3.800997280145059, + "grad_norm": 0.1768689942699595, + "learning_rate": 1.3214086159809743e-05, + "loss": 0.8663, + "step": 8385 + }, + { + "epoch": 3.801450589301904, + "grad_norm": 0.23570872899830203, + "learning_rate": 1.3204686094309667e-05, + "loss": 0.8883, + "step": 8386 + }, + { + "epoch": 3.801903898458749, + "grad_norm": 0.1595681467858597, + "learning_rate": 1.3195288712439042e-05, + "loss": 0.8722, + "step": 8387 + }, + { + "epoch": 3.802357207615594, + "grad_norm": 0.22201433306751292, + "learning_rate": 1.318589401513902e-05, + "loss": 0.8737, + "step": 8388 + }, + { + "epoch": 3.8028105167724386, + "grad_norm": 0.19104522206873675, + "learning_rate": 1.317650200335053e-05, + "loss": 0.8519, + "step": 8389 + }, + { + "epoch": 3.803263825929284, + "grad_norm": 0.17090609273984594, + "learning_rate": 1.3167112678014196e-05, + "loss": 0.872, + "step": 8390 + }, + { + "epoch": 3.8037171350861287, + "grad_norm": 0.19264746317614878, + "learning_rate": 1.3157726040070387e-05, + "loss": 0.8517, + "step": 8391 + }, + { + "epoch": 3.8041704442429736, + "grad_norm": 0.139432887020941, + "learning_rate": 1.3148342090459193e-05, + "loss": 0.8786, + "step": 8392 + }, + { + "epoch": 3.804623753399819, + "grad_norm": 0.162719714495317, + "learning_rate": 1.313896083012046e-05, + "loss": 0.8586, + "step": 8393 + }, + { + "epoch": 3.8050770625566637, + "grad_norm": 0.14984127101905992, + "learning_rate": 1.3129582259993735e-05, + "loss": 0.8528, + "step": 8394 + }, + { + "epoch": 3.8055303717135085, + "grad_norm": 0.16830487557976112, + "learning_rate": 1.3120206381018332e-05, + "loss": 0.8647, + "step": 8395 + }, + { + "epoch": 3.8059836808703533, + "grad_norm": 0.1577112766285116, + "learning_rate": 1.3110833194133243e-05, + "loss": 0.858, + "step": 8396 + }, + { + "epoch": 3.8064369900271986, + "grad_norm": 0.15527955639682708, + "learning_rate": 1.3101462700277248e-05, + "loss": 0.8645, + "step": 8397 + }, + { + "epoch": 3.8068902991840434, + "grad_norm": 0.19460891753323062, + "learning_rate": 1.3092094900388816e-05, + "loss": 0.8589, + "step": 8398 + }, + { + "epoch": 3.8073436083408883, + "grad_norm": 0.1779663265893554, + "learning_rate": 1.308272979540615e-05, + "loss": 0.8808, + "step": 8399 + }, + { + "epoch": 3.8077969174977335, + "grad_norm": 0.21100098348289456, + "learning_rate": 1.3073367386267214e-05, + "loss": 0.8691, + "step": 8400 + }, + { + "epoch": 3.8082502266545784, + "grad_norm": 0.22608582350690823, + "learning_rate": 1.306400767390966e-05, + "loss": 0.8728, + "step": 8401 + }, + { + "epoch": 3.808703535811423, + "grad_norm": 0.16307557618159524, + "learning_rate": 1.3054650659270914e-05, + "loss": 0.8835, + "step": 8402 + }, + { + "epoch": 3.8091568449682685, + "grad_norm": 0.18682196587944325, + "learning_rate": 1.3045296343288079e-05, + "loss": 0.8687, + "step": 8403 + }, + { + "epoch": 3.8096101541251133, + "grad_norm": 0.13815895635697478, + "learning_rate": 1.3035944726898051e-05, + "loss": 0.8399, + "step": 8404 + }, + { + "epoch": 3.810063463281958, + "grad_norm": 0.17853862966677186, + "learning_rate": 1.3026595811037387e-05, + "loss": 0.8577, + "step": 8405 + }, + { + "epoch": 3.8105167724388034, + "grad_norm": 0.17811593753319246, + "learning_rate": 1.3017249596642443e-05, + "loss": 0.8456, + "step": 8406 + }, + { + "epoch": 3.810970081595648, + "grad_norm": 0.15056544695878224, + "learning_rate": 1.3007906084649235e-05, + "loss": 0.8564, + "step": 8407 + }, + { + "epoch": 3.811423390752493, + "grad_norm": 0.19896374246323925, + "learning_rate": 1.2998565275993577e-05, + "loss": 0.8721, + "step": 8408 + }, + { + "epoch": 3.8118766999093383, + "grad_norm": 0.15848546826010843, + "learning_rate": 1.2989227171610965e-05, + "loss": 0.8597, + "step": 8409 + }, + { + "epoch": 3.812330009066183, + "grad_norm": 0.19067206743759088, + "learning_rate": 1.2979891772436629e-05, + "loss": 0.8843, + "step": 8410 + }, + { + "epoch": 3.812783318223028, + "grad_norm": 0.17150509340348072, + "learning_rate": 1.2970559079405534e-05, + "loss": 0.8624, + "step": 8411 + }, + { + "epoch": 3.8132366273798732, + "grad_norm": 0.17537138185544382, + "learning_rate": 1.2961229093452396e-05, + "loss": 0.8407, + "step": 8412 + }, + { + "epoch": 3.813689936536718, + "grad_norm": 0.15442847953924654, + "learning_rate": 1.2951901815511629e-05, + "loss": 0.866, + "step": 8413 + }, + { + "epoch": 3.814143245693563, + "grad_norm": 0.1786161235905587, + "learning_rate": 1.2942577246517378e-05, + "loss": 0.8623, + "step": 8414 + }, + { + "epoch": 3.814596554850408, + "grad_norm": 0.18495794079981936, + "learning_rate": 1.293325538740355e-05, + "loss": 0.8864, + "step": 8415 + }, + { + "epoch": 3.815049864007253, + "grad_norm": 0.16089857429477591, + "learning_rate": 1.2923936239103725e-05, + "loss": 0.8723, + "step": 8416 + }, + { + "epoch": 3.815503173164098, + "grad_norm": 0.16953974561460325, + "learning_rate": 1.2914619802551278e-05, + "loss": 0.8329, + "step": 8417 + }, + { + "epoch": 3.815956482320943, + "grad_norm": 0.17502850552483193, + "learning_rate": 1.2905306078679245e-05, + "loss": 0.856, + "step": 8418 + }, + { + "epoch": 3.816409791477788, + "grad_norm": 0.15575705356666836, + "learning_rate": 1.2895995068420448e-05, + "loss": 0.8823, + "step": 8419 + }, + { + "epoch": 3.8168631006346327, + "grad_norm": 0.23652419248819942, + "learning_rate": 1.2886686772707395e-05, + "loss": 0.8752, + "step": 8420 + }, + { + "epoch": 3.817316409791478, + "grad_norm": 0.15010643315483477, + "learning_rate": 1.2877381192472358e-05, + "loss": 0.8535, + "step": 8421 + }, + { + "epoch": 3.817769718948323, + "grad_norm": 0.1949655391009599, + "learning_rate": 1.2868078328647289e-05, + "loss": 0.8468, + "step": 8422 + }, + { + "epoch": 3.8182230281051677, + "grad_norm": 0.17880365756434383, + "learning_rate": 1.2858778182163932e-05, + "loss": 0.8596, + "step": 8423 + }, + { + "epoch": 3.818676337262013, + "grad_norm": 0.14316967559978466, + "learning_rate": 1.2849480753953704e-05, + "loss": 0.8607, + "step": 8424 + }, + { + "epoch": 3.8191296464188578, + "grad_norm": 0.1495833552399565, + "learning_rate": 1.2840186044947762e-05, + "loss": 0.8656, + "step": 8425 + }, + { + "epoch": 3.8195829555757026, + "grad_norm": 0.15965044946854048, + "learning_rate": 1.2830894056077021e-05, + "loss": 0.8656, + "step": 8426 + }, + { + "epoch": 3.820036264732548, + "grad_norm": 0.1763881581501009, + "learning_rate": 1.2821604788272075e-05, + "loss": 0.8782, + "step": 8427 + }, + { + "epoch": 3.8204895738893927, + "grad_norm": 0.15161062682095794, + "learning_rate": 1.28123182424633e-05, + "loss": 0.8664, + "step": 8428 + }, + { + "epoch": 3.8209428830462375, + "grad_norm": 0.1468231396514731, + "learning_rate": 1.2803034419580756e-05, + "loss": 0.8795, + "step": 8429 + }, + { + "epoch": 3.8213961922030824, + "grad_norm": 0.1442005908952038, + "learning_rate": 1.2793753320554241e-05, + "loss": 0.8863, + "step": 8430 + }, + { + "epoch": 3.8218495013599276, + "grad_norm": 0.14217102383799893, + "learning_rate": 1.2784474946313279e-05, + "loss": 0.8817, + "step": 8431 + }, + { + "epoch": 3.8223028105167725, + "grad_norm": 0.15583091903097898, + "learning_rate": 1.2775199297787148e-05, + "loss": 0.8744, + "step": 8432 + }, + { + "epoch": 3.8227561196736173, + "grad_norm": 0.13002418479753983, + "learning_rate": 1.276592637590481e-05, + "loss": 0.8608, + "step": 8433 + }, + { + "epoch": 3.823209428830462, + "grad_norm": 0.17362126025606278, + "learning_rate": 1.2756656181594998e-05, + "loss": 0.8733, + "step": 8434 + }, + { + "epoch": 3.8236627379873074, + "grad_norm": 0.14286604594117694, + "learning_rate": 1.2747388715786117e-05, + "loss": 0.857, + "step": 8435 + }, + { + "epoch": 3.824116047144152, + "grad_norm": 0.17368469679785703, + "learning_rate": 1.2738123979406369e-05, + "loss": 0.8623, + "step": 8436 + }, + { + "epoch": 3.824569356300997, + "grad_norm": 0.17456008511617532, + "learning_rate": 1.2728861973383611e-05, + "loss": 0.8733, + "step": 8437 + }, + { + "epoch": 3.8250226654578423, + "grad_norm": 0.14747243264649143, + "learning_rate": 1.2719602698645485e-05, + "loss": 0.8686, + "step": 8438 + }, + { + "epoch": 3.825475974614687, + "grad_norm": 0.1673701682494268, + "learning_rate": 1.2710346156119324e-05, + "loss": 0.8786, + "step": 8439 + }, + { + "epoch": 3.825929283771532, + "grad_norm": 0.14546830098216806, + "learning_rate": 1.2701092346732189e-05, + "loss": 0.8606, + "step": 8440 + }, + { + "epoch": 3.8263825929283772, + "grad_norm": 0.20244506645832858, + "learning_rate": 1.2691841271410898e-05, + "loss": 0.8682, + "step": 8441 + }, + { + "epoch": 3.826835902085222, + "grad_norm": 0.1768573931463237, + "learning_rate": 1.2682592931081947e-05, + "loss": 0.8597, + "step": 8442 + }, + { + "epoch": 3.827289211242067, + "grad_norm": 0.14651519789352568, + "learning_rate": 1.2673347326671612e-05, + "loss": 0.8606, + "step": 8443 + }, + { + "epoch": 3.827742520398912, + "grad_norm": 0.17421728879657936, + "learning_rate": 1.2664104459105837e-05, + "loss": 0.8661, + "step": 8444 + }, + { + "epoch": 3.828195829555757, + "grad_norm": 0.1716167331177481, + "learning_rate": 1.2654864329310357e-05, + "loss": 0.8697, + "step": 8445 + }, + { + "epoch": 3.828649138712602, + "grad_norm": 0.1752100033621081, + "learning_rate": 1.2645626938210565e-05, + "loss": 0.8671, + "step": 8446 + }, + { + "epoch": 3.829102447869447, + "grad_norm": 0.12784280213535143, + "learning_rate": 1.263639228673164e-05, + "loss": 0.8669, + "step": 8447 + }, + { + "epoch": 3.829555757026292, + "grad_norm": 0.153180970175782, + "learning_rate": 1.2627160375798448e-05, + "loss": 0.8718, + "step": 8448 + }, + { + "epoch": 3.8300090661831367, + "grad_norm": 0.13666503894337376, + "learning_rate": 1.2617931206335596e-05, + "loss": 0.8702, + "step": 8449 + }, + { + "epoch": 3.830462375339982, + "grad_norm": 0.1638657383786323, + "learning_rate": 1.2608704779267394e-05, + "loss": 0.868, + "step": 8450 + }, + { + "epoch": 3.830915684496827, + "grad_norm": 0.1447411120862144, + "learning_rate": 1.2599481095517918e-05, + "loss": 0.8565, + "step": 8451 + }, + { + "epoch": 3.8313689936536717, + "grad_norm": 0.1399450860875307, + "learning_rate": 1.2590260156010929e-05, + "loss": 0.8478, + "step": 8452 + }, + { + "epoch": 3.831822302810517, + "grad_norm": 0.15337928247368912, + "learning_rate": 1.2581041961669955e-05, + "loss": 0.8854, + "step": 8453 + }, + { + "epoch": 3.8322756119673618, + "grad_norm": 0.14906802981387016, + "learning_rate": 1.2571826513418194e-05, + "loss": 0.894, + "step": 8454 + }, + { + "epoch": 3.8327289211242066, + "grad_norm": 0.12942424243130418, + "learning_rate": 1.2562613812178635e-05, + "loss": 0.8619, + "step": 8455 + }, + { + "epoch": 3.833182230281052, + "grad_norm": 0.15641236086797938, + "learning_rate": 1.255340385887394e-05, + "loss": 0.8591, + "step": 8456 + }, + { + "epoch": 3.8336355394378967, + "grad_norm": 0.13888806797451958, + "learning_rate": 1.25441966544265e-05, + "loss": 0.8665, + "step": 8457 + }, + { + "epoch": 3.8340888485947415, + "grad_norm": 0.14944400328408458, + "learning_rate": 1.2534992199758463e-05, + "loss": 0.858, + "step": 8458 + }, + { + "epoch": 3.834542157751587, + "grad_norm": 0.11996321298360806, + "learning_rate": 1.252579049579167e-05, + "loss": 0.8303, + "step": 8459 + }, + { + "epoch": 3.8349954669084316, + "grad_norm": 0.14294776848768556, + "learning_rate": 1.2516591543447713e-05, + "loss": 0.8521, + "step": 8460 + }, + { + "epoch": 3.8354487760652765, + "grad_norm": 0.13241143975296418, + "learning_rate": 1.250739534364787e-05, + "loss": 0.8654, + "step": 8461 + }, + { + "epoch": 3.8359020852221217, + "grad_norm": 0.12957744818835207, + "learning_rate": 1.2498201897313199e-05, + "loss": 0.8801, + "step": 8462 + }, + { + "epoch": 3.8363553943789666, + "grad_norm": 0.14310970424251077, + "learning_rate": 1.2489011205364422e-05, + "loss": 0.8922, + "step": 8463 + }, + { + "epoch": 3.8368087035358114, + "grad_norm": 0.13281469585461375, + "learning_rate": 1.2479823268722036e-05, + "loss": 0.8526, + "step": 8464 + }, + { + "epoch": 3.8372620126926567, + "grad_norm": 0.14402474480551597, + "learning_rate": 1.2470638088306234e-05, + "loss": 0.8803, + "step": 8465 + }, + { + "epoch": 3.8377153218495015, + "grad_norm": 0.14803400725253843, + "learning_rate": 1.2461455665036918e-05, + "loss": 0.8699, + "step": 8466 + }, + { + "epoch": 3.8381686310063463, + "grad_norm": 0.15172275193101936, + "learning_rate": 1.2452275999833767e-05, + "loss": 0.8678, + "step": 8467 + }, + { + "epoch": 3.838621940163191, + "grad_norm": 0.15293945100876574, + "learning_rate": 1.2443099093616136e-05, + "loss": 0.8659, + "step": 8468 + }, + { + "epoch": 3.8390752493200364, + "grad_norm": 0.14190581435577082, + "learning_rate": 1.2433924947303102e-05, + "loss": 0.8797, + "step": 8469 + }, + { + "epoch": 3.8395285584768812, + "grad_norm": 0.1853348438393179, + "learning_rate": 1.2424753561813518e-05, + "loss": 0.8707, + "step": 8470 + }, + { + "epoch": 3.839981867633726, + "grad_norm": 0.13950077044115547, + "learning_rate": 1.2415584938065904e-05, + "loss": 0.8653, + "step": 8471 + }, + { + "epoch": 3.840435176790571, + "grad_norm": 0.1504116677845372, + "learning_rate": 1.2406419076978512e-05, + "loss": 0.8515, + "step": 8472 + }, + { + "epoch": 3.840888485947416, + "grad_norm": 0.1693240117603649, + "learning_rate": 1.2397255979469365e-05, + "loss": 0.8674, + "step": 8473 + }, + { + "epoch": 3.841341795104261, + "grad_norm": 0.14900318114247985, + "learning_rate": 1.2388095646456137e-05, + "loss": 0.8718, + "step": 8474 + }, + { + "epoch": 3.841795104261106, + "grad_norm": 0.16640586155248255, + "learning_rate": 1.2378938078856292e-05, + "loss": 0.877, + "step": 8475 + }, + { + "epoch": 3.842248413417951, + "grad_norm": 0.13934250325095954, + "learning_rate": 1.236978327758697e-05, + "loss": 0.8674, + "step": 8476 + }, + { + "epoch": 3.842701722574796, + "grad_norm": 0.14234067465217976, + "learning_rate": 1.2360631243565062e-05, + "loss": 0.8568, + "step": 8477 + }, + { + "epoch": 3.8431550317316407, + "grad_norm": 0.1346896316705787, + "learning_rate": 1.2351481977707151e-05, + "loss": 0.8739, + "step": 8478 + }, + { + "epoch": 3.843608340888486, + "grad_norm": 0.15750083004728233, + "learning_rate": 1.2342335480929598e-05, + "loss": 0.8739, + "step": 8479 + }, + { + "epoch": 3.844061650045331, + "grad_norm": 0.12967481242027526, + "learning_rate": 1.2333191754148413e-05, + "loss": 0.8811, + "step": 8480 + }, + { + "epoch": 3.8445149592021757, + "grad_norm": 0.4505920311839128, + "learning_rate": 1.2324050798279394e-05, + "loss": 0.8967, + "step": 8481 + }, + { + "epoch": 3.844968268359021, + "grad_norm": 0.14367519760147532, + "learning_rate": 1.2314912614238033e-05, + "loss": 0.8944, + "step": 8482 + }, + { + "epoch": 3.8454215775158658, + "grad_norm": 0.17737118831045182, + "learning_rate": 1.2305777202939523e-05, + "loss": 0.8659, + "step": 8483 + }, + { + "epoch": 3.8458748866727106, + "grad_norm": 0.14817261489705646, + "learning_rate": 1.229664456529883e-05, + "loss": 0.8796, + "step": 8484 + }, + { + "epoch": 3.846328195829556, + "grad_norm": 0.1392670931703586, + "learning_rate": 1.2287514702230592e-05, + "loss": 0.8635, + "step": 8485 + }, + { + "epoch": 3.8467815049864007, + "grad_norm": 0.13851694059156047, + "learning_rate": 1.2278387614649225e-05, + "loss": 0.8529, + "step": 8486 + }, + { + "epoch": 3.8472348141432455, + "grad_norm": 0.15651937586249276, + "learning_rate": 1.2269263303468786e-05, + "loss": 0.872, + "step": 8487 + }, + { + "epoch": 3.847688123300091, + "grad_norm": 0.14422516043012434, + "learning_rate": 1.2260141769603142e-05, + "loss": 0.8716, + "step": 8488 + }, + { + "epoch": 3.8481414324569356, + "grad_norm": 0.21481379193754693, + "learning_rate": 1.2251023013965808e-05, + "loss": 0.8839, + "step": 8489 + }, + { + "epoch": 3.8485947416137805, + "grad_norm": 0.13315848981266973, + "learning_rate": 1.224190703747009e-05, + "loss": 0.8731, + "step": 8490 + }, + { + "epoch": 3.8490480507706257, + "grad_norm": 0.1469268395121675, + "learning_rate": 1.2232793841028942e-05, + "loss": 0.8521, + "step": 8491 + }, + { + "epoch": 3.8495013599274706, + "grad_norm": 0.13676164710756258, + "learning_rate": 1.2223683425555116e-05, + "loss": 0.8775, + "step": 8492 + }, + { + "epoch": 3.8499546690843154, + "grad_norm": 0.17514895990524365, + "learning_rate": 1.2214575791961015e-05, + "loss": 0.8698, + "step": 8493 + }, + { + "epoch": 3.8504079782411607, + "grad_norm": 0.13934411086250226, + "learning_rate": 1.2205470941158818e-05, + "loss": 0.854, + "step": 8494 + }, + { + "epoch": 3.8508612873980055, + "grad_norm": 0.1320436681160387, + "learning_rate": 1.2196368874060381e-05, + "loss": 0.8666, + "step": 8495 + }, + { + "epoch": 3.8513145965548503, + "grad_norm": 0.15798185164080897, + "learning_rate": 1.2187269591577327e-05, + "loss": 0.8632, + "step": 8496 + }, + { + "epoch": 3.8517679057116956, + "grad_norm": 0.15112294684968328, + "learning_rate": 1.2178173094620962e-05, + "loss": 0.8416, + "step": 8497 + }, + { + "epoch": 3.8522212148685404, + "grad_norm": 0.15933302196830004, + "learning_rate": 1.2169079384102319e-05, + "loss": 0.8793, + "step": 8498 + }, + { + "epoch": 3.8526745240253852, + "grad_norm": 0.17033768708730612, + "learning_rate": 1.2159988460932181e-05, + "loss": 0.8713, + "step": 8499 + }, + { + "epoch": 3.8531278331822305, + "grad_norm": 0.13366994534847204, + "learning_rate": 1.2150900326021007e-05, + "loss": 0.8669, + "step": 8500 + }, + { + "epoch": 3.8535811423390753, + "grad_norm": 0.15964847115033573, + "learning_rate": 1.214181498027902e-05, + "loss": 0.8734, + "step": 8501 + }, + { + "epoch": 3.85403445149592, + "grad_norm": 0.1473295387284055, + "learning_rate": 1.2132732424616132e-05, + "loss": 0.8568, + "step": 8502 + }, + { + "epoch": 3.8544877606527654, + "grad_norm": 0.1503125010453061, + "learning_rate": 1.2123652659941998e-05, + "loss": 0.8594, + "step": 8503 + }, + { + "epoch": 3.8549410698096103, + "grad_norm": 0.16910627635784217, + "learning_rate": 1.2114575687165969e-05, + "loss": 0.8736, + "step": 8504 + }, + { + "epoch": 3.855394378966455, + "grad_norm": 0.14369406508842217, + "learning_rate": 1.2105501507197146e-05, + "loss": 0.8712, + "step": 8505 + }, + { + "epoch": 3.8558476881233004, + "grad_norm": 0.1714415040807548, + "learning_rate": 1.2096430120944333e-05, + "loss": 0.8592, + "step": 8506 + }, + { + "epoch": 3.856300997280145, + "grad_norm": 0.14220689439227696, + "learning_rate": 1.2087361529316048e-05, + "loss": 0.8773, + "step": 8507 + }, + { + "epoch": 3.85675430643699, + "grad_norm": 0.17540741181607364, + "learning_rate": 1.2078295733220529e-05, + "loss": 0.8773, + "step": 8508 + }, + { + "epoch": 3.857207615593835, + "grad_norm": 0.13502347166103446, + "learning_rate": 1.2069232733565763e-05, + "loss": 0.8665, + "step": 8509 + }, + { + "epoch": 3.85766092475068, + "grad_norm": 0.1656901465039133, + "learning_rate": 1.2060172531259414e-05, + "loss": 0.8711, + "step": 8510 + }, + { + "epoch": 3.858114233907525, + "grad_norm": 0.13469689144603061, + "learning_rate": 1.2051115127208907e-05, + "loss": 0.8428, + "step": 8511 + }, + { + "epoch": 3.8585675430643698, + "grad_norm": 0.1509541877849702, + "learning_rate": 1.2042060522321366e-05, + "loss": 0.8471, + "step": 8512 + }, + { + "epoch": 3.8590208522212146, + "grad_norm": 0.14258727332432833, + "learning_rate": 1.2033008717503619e-05, + "loss": 0.8756, + "step": 8513 + }, + { + "epoch": 3.85947416137806, + "grad_norm": 0.14307910559329556, + "learning_rate": 1.2023959713662255e-05, + "loss": 0.8512, + "step": 8514 + }, + { + "epoch": 3.8599274705349047, + "grad_norm": 0.12504802153071046, + "learning_rate": 1.2014913511703528e-05, + "loss": 0.8687, + "step": 8515 + }, + { + "epoch": 3.8603807796917495, + "grad_norm": 0.18156515128316184, + "learning_rate": 1.2005870112533478e-05, + "loss": 0.8823, + "step": 8516 + }, + { + "epoch": 3.860834088848595, + "grad_norm": 0.13855156095204954, + "learning_rate": 1.1996829517057793e-05, + "loss": 0.8866, + "step": 8517 + }, + { + "epoch": 3.8612873980054396, + "grad_norm": 0.16523724144244042, + "learning_rate": 1.1987791726181945e-05, + "loss": 0.8614, + "step": 8518 + }, + { + "epoch": 3.8617407071622845, + "grad_norm": 0.13126751811266654, + "learning_rate": 1.1978756740811068e-05, + "loss": 0.8545, + "step": 8519 + }, + { + "epoch": 3.8621940163191297, + "grad_norm": 0.15297004266123265, + "learning_rate": 1.1969724561850073e-05, + "loss": 0.8645, + "step": 8520 + }, + { + "epoch": 3.8626473254759746, + "grad_norm": 0.17175040706124806, + "learning_rate": 1.1960695190203527e-05, + "loss": 0.8508, + "step": 8521 + }, + { + "epoch": 3.8631006346328194, + "grad_norm": 0.1630344399342654, + "learning_rate": 1.1951668626775779e-05, + "loss": 0.8915, + "step": 8522 + }, + { + "epoch": 3.8635539437896647, + "grad_norm": 0.17575601952622028, + "learning_rate": 1.1942644872470846e-05, + "loss": 0.8512, + "step": 8523 + }, + { + "epoch": 3.8640072529465095, + "grad_norm": 0.17118434869652346, + "learning_rate": 1.193362392819248e-05, + "loss": 0.8761, + "step": 8524 + }, + { + "epoch": 3.8644605621033543, + "grad_norm": 0.1750100283300416, + "learning_rate": 1.192460579484417e-05, + "loss": 0.8902, + "step": 8525 + }, + { + "epoch": 3.8649138712601996, + "grad_norm": 0.1587292025321191, + "learning_rate": 1.1915590473329108e-05, + "loss": 0.8817, + "step": 8526 + }, + { + "epoch": 3.8653671804170444, + "grad_norm": 0.16564848832728762, + "learning_rate": 1.1906577964550196e-05, + "loss": 0.8609, + "step": 8527 + }, + { + "epoch": 3.8658204895738892, + "grad_norm": 0.1903845045538452, + "learning_rate": 1.189756826941005e-05, + "loss": 0.8486, + "step": 8528 + }, + { + "epoch": 3.8662737987307345, + "grad_norm": 0.17832396191452124, + "learning_rate": 1.1888561388811054e-05, + "loss": 0.8716, + "step": 8529 + }, + { + "epoch": 3.8667271078875793, + "grad_norm": 0.1831648609260826, + "learning_rate": 1.1879557323655236e-05, + "loss": 0.8537, + "step": 8530 + }, + { + "epoch": 3.867180417044424, + "grad_norm": 0.1808121827857336, + "learning_rate": 1.1870556074844406e-05, + "loss": 0.8704, + "step": 8531 + }, + { + "epoch": 3.8676337262012694, + "grad_norm": 0.1838963716959412, + "learning_rate": 1.1861557643280048e-05, + "loss": 0.8646, + "step": 8532 + }, + { + "epoch": 3.8680870353581143, + "grad_norm": 0.1789993688659228, + "learning_rate": 1.1852562029863397e-05, + "loss": 0.8762, + "step": 8533 + }, + { + "epoch": 3.868540344514959, + "grad_norm": 0.18993264401190316, + "learning_rate": 1.1843569235495376e-05, + "loss": 0.8462, + "step": 8534 + }, + { + "epoch": 3.8689936536718044, + "grad_norm": 0.14016244900659805, + "learning_rate": 1.1834579261076655e-05, + "loss": 0.8744, + "step": 8535 + }, + { + "epoch": 3.869446962828649, + "grad_norm": 0.20038580582828347, + "learning_rate": 1.1825592107507586e-05, + "loss": 0.8616, + "step": 8536 + }, + { + "epoch": 3.869900271985494, + "grad_norm": 0.15782183435909786, + "learning_rate": 1.181660777568828e-05, + "loss": 0.8681, + "step": 8537 + }, + { + "epoch": 3.8703535811423393, + "grad_norm": 0.16541764110380736, + "learning_rate": 1.1807626266518542e-05, + "loss": 0.8642, + "step": 8538 + }, + { + "epoch": 3.870806890299184, + "grad_norm": 0.174609802471546, + "learning_rate": 1.1798647580897868e-05, + "loss": 0.8932, + "step": 8539 + }, + { + "epoch": 3.871260199456029, + "grad_norm": 0.14786363462034463, + "learning_rate": 1.1789671719725541e-05, + "loss": 0.855, + "step": 8540 + }, + { + "epoch": 3.871713508612874, + "grad_norm": 0.15908804066511234, + "learning_rate": 1.1780698683900482e-05, + "loss": 0.8638, + "step": 8541 + }, + { + "epoch": 3.872166817769719, + "grad_norm": 0.1727451354405276, + "learning_rate": 1.1771728474321402e-05, + "loss": 0.8711, + "step": 8542 + }, + { + "epoch": 3.872620126926564, + "grad_norm": 0.1700045664180198, + "learning_rate": 1.1762761091886663e-05, + "loss": 0.8748, + "step": 8543 + }, + { + "epoch": 3.873073436083409, + "grad_norm": 0.1601759429878176, + "learning_rate": 1.1753796537494408e-05, + "loss": 0.8639, + "step": 8544 + }, + { + "epoch": 3.873526745240254, + "grad_norm": 0.19445915876048897, + "learning_rate": 1.1744834812042427e-05, + "loss": 0.8808, + "step": 8545 + }, + { + "epoch": 3.873980054397099, + "grad_norm": 0.1440379134539822, + "learning_rate": 1.1735875916428285e-05, + "loss": 0.864, + "step": 8546 + }, + { + "epoch": 3.8744333635539436, + "grad_norm": 0.18840326966296078, + "learning_rate": 1.172691985154923e-05, + "loss": 0.8703, + "step": 8547 + }, + { + "epoch": 3.874886672710789, + "grad_norm": 0.13977817513409788, + "learning_rate": 1.1717966618302255e-05, + "loss": 0.8689, + "step": 8548 + }, + { + "epoch": 3.8753399818676337, + "grad_norm": 0.15809465368410877, + "learning_rate": 1.1709016217584029e-05, + "loss": 0.8682, + "step": 8549 + }, + { + "epoch": 3.8757932910244786, + "grad_norm": 0.13596875763531366, + "learning_rate": 1.170006865029099e-05, + "loss": 0.8585, + "step": 8550 + }, + { + "epoch": 3.8762466001813234, + "grad_norm": 0.12616029531955444, + "learning_rate": 1.1691123917319231e-05, + "loss": 0.8564, + "step": 8551 + }, + { + "epoch": 3.8766999093381687, + "grad_norm": 0.14708005932217472, + "learning_rate": 1.1682182019564627e-05, + "loss": 0.8503, + "step": 8552 + }, + { + "epoch": 3.8771532184950135, + "grad_norm": 0.15022683417263874, + "learning_rate": 1.1673242957922715e-05, + "loss": 0.8517, + "step": 8553 + }, + { + "epoch": 3.8776065276518583, + "grad_norm": 0.15138622593924986, + "learning_rate": 1.166430673328876e-05, + "loss": 0.8756, + "step": 8554 + }, + { + "epoch": 3.8780598368087036, + "grad_norm": 0.13028247846472824, + "learning_rate": 1.1655373346557775e-05, + "loss": 0.8593, + "step": 8555 + }, + { + "epoch": 3.8785131459655484, + "grad_norm": 0.18259703377915587, + "learning_rate": 1.164644279862444e-05, + "loss": 0.865, + "step": 8556 + }, + { + "epoch": 3.8789664551223932, + "grad_norm": 0.13044668182078814, + "learning_rate": 1.16375150903832e-05, + "loss": 0.8804, + "step": 8557 + }, + { + "epoch": 3.8794197642792385, + "grad_norm": 0.13767337885449746, + "learning_rate": 1.1628590222728167e-05, + "loss": 0.8786, + "step": 8558 + }, + { + "epoch": 3.8798730734360833, + "grad_norm": 0.1532323720418233, + "learning_rate": 1.161966819655322e-05, + "loss": 0.8524, + "step": 8559 + }, + { + "epoch": 3.880326382592928, + "grad_norm": 0.1362610757679243, + "learning_rate": 1.16107490127519e-05, + "loss": 0.8921, + "step": 8560 + }, + { + "epoch": 3.8807796917497734, + "grad_norm": 0.1456314687862223, + "learning_rate": 1.160183267221751e-05, + "loss": 0.875, + "step": 8561 + }, + { + "epoch": 3.8812330009066183, + "grad_norm": 0.14848627750090887, + "learning_rate": 1.1592919175843024e-05, + "loss": 0.8684, + "step": 8562 + }, + { + "epoch": 3.881686310063463, + "grad_norm": 0.14651163650805346, + "learning_rate": 1.15840085245212e-05, + "loss": 0.8655, + "step": 8563 + }, + { + "epoch": 3.8821396192203084, + "grad_norm": 0.17141817535174908, + "learning_rate": 1.157510071914441e-05, + "loss": 0.8753, + "step": 8564 + }, + { + "epoch": 3.882592928377153, + "grad_norm": 0.1524855732966824, + "learning_rate": 1.156619576060483e-05, + "loss": 0.856, + "step": 8565 + }, + { + "epoch": 3.883046237533998, + "grad_norm": 0.1597413611713039, + "learning_rate": 1.15572936497943e-05, + "loss": 0.8654, + "step": 8566 + }, + { + "epoch": 3.8834995466908433, + "grad_norm": 0.14924542163845025, + "learning_rate": 1.1548394387604414e-05, + "loss": 0.8669, + "step": 8567 + }, + { + "epoch": 3.883952855847688, + "grad_norm": 0.14669140809997924, + "learning_rate": 1.1539497974926452e-05, + "loss": 0.8604, + "step": 8568 + }, + { + "epoch": 3.884406165004533, + "grad_norm": 0.15551803354483396, + "learning_rate": 1.1530604412651396e-05, + "loss": 0.8749, + "step": 8569 + }, + { + "epoch": 3.8848594741613782, + "grad_norm": 0.16844979868748106, + "learning_rate": 1.152171370166999e-05, + "loss": 0.8676, + "step": 8570 + }, + { + "epoch": 3.885312783318223, + "grad_norm": 0.13388435165443519, + "learning_rate": 1.1512825842872641e-05, + "loss": 0.853, + "step": 8571 + }, + { + "epoch": 3.885766092475068, + "grad_norm": 0.19886804467238703, + "learning_rate": 1.1503940837149519e-05, + "loss": 0.8486, + "step": 8572 + }, + { + "epoch": 3.886219401631913, + "grad_norm": 0.16392554226920278, + "learning_rate": 1.1495058685390456e-05, + "loss": 0.8565, + "step": 8573 + }, + { + "epoch": 3.886672710788758, + "grad_norm": 0.1908412024209008, + "learning_rate": 1.1486179388485056e-05, + "loss": 0.8436, + "step": 8574 + }, + { + "epoch": 3.887126019945603, + "grad_norm": 0.1809576283193221, + "learning_rate": 1.1477302947322575e-05, + "loss": 0.8546, + "step": 8575 + }, + { + "epoch": 3.887579329102448, + "grad_norm": 0.1840250425086609, + "learning_rate": 1.1468429362792044e-05, + "loss": 0.8367, + "step": 8576 + }, + { + "epoch": 3.888032638259293, + "grad_norm": 0.1620295858694924, + "learning_rate": 1.1459558635782156e-05, + "loss": 0.8614, + "step": 8577 + }, + { + "epoch": 3.8884859474161377, + "grad_norm": 0.15530899910928928, + "learning_rate": 1.1450690767181359e-05, + "loss": 0.8742, + "step": 8578 + }, + { + "epoch": 3.888939256572983, + "grad_norm": 0.16991936030445193, + "learning_rate": 1.1441825757877782e-05, + "loss": 0.8623, + "step": 8579 + }, + { + "epoch": 3.889392565729828, + "grad_norm": 0.14156527506884914, + "learning_rate": 1.1432963608759282e-05, + "loss": 0.8802, + "step": 8580 + }, + { + "epoch": 3.8898458748866727, + "grad_norm": 0.17879721678123933, + "learning_rate": 1.142410432071344e-05, + "loss": 0.8774, + "step": 8581 + }, + { + "epoch": 3.890299184043518, + "grad_norm": 0.15062293265528467, + "learning_rate": 1.141524789462754e-05, + "loss": 0.8612, + "step": 8582 + }, + { + "epoch": 3.8907524932003628, + "grad_norm": 0.14560322398625078, + "learning_rate": 1.1406394331388566e-05, + "loss": 0.8648, + "step": 8583 + }, + { + "epoch": 3.8912058023572076, + "grad_norm": 0.199790680175622, + "learning_rate": 1.1397543631883226e-05, + "loss": 0.8629, + "step": 8584 + }, + { + "epoch": 3.891659111514053, + "grad_norm": 0.22817501782185273, + "learning_rate": 1.138869579699796e-05, + "loss": 0.8782, + "step": 8585 + }, + { + "epoch": 3.8921124206708977, + "grad_norm": 0.18231128429813523, + "learning_rate": 1.137985082761889e-05, + "loss": 0.8546, + "step": 8586 + }, + { + "epoch": 3.8925657298277425, + "grad_norm": 0.12974485793526946, + "learning_rate": 1.1371008724631882e-05, + "loss": 0.8618, + "step": 8587 + }, + { + "epoch": 3.8930190389845873, + "grad_norm": 0.17752485825500627, + "learning_rate": 1.1362169488922477e-05, + "loss": 0.8527, + "step": 8588 + }, + { + "epoch": 3.8934723481414326, + "grad_norm": 0.15337039781296607, + "learning_rate": 1.1353333121375978e-05, + "loss": 0.8526, + "step": 8589 + }, + { + "epoch": 3.8939256572982774, + "grad_norm": 0.15101320556964593, + "learning_rate": 1.1344499622877342e-05, + "loss": 0.8762, + "step": 8590 + }, + { + "epoch": 3.8943789664551223, + "grad_norm": 0.16858834950795226, + "learning_rate": 1.1335668994311297e-05, + "loss": 0.8388, + "step": 8591 + }, + { + "epoch": 3.894832275611967, + "grad_norm": 0.15444611559545607, + "learning_rate": 1.1326841236562238e-05, + "loss": 0.8565, + "step": 8592 + }, + { + "epoch": 3.8952855847688124, + "grad_norm": 0.20540609359405804, + "learning_rate": 1.1318016350514305e-05, + "loss": 0.8587, + "step": 8593 + }, + { + "epoch": 3.895738893925657, + "grad_norm": 0.1588827793384538, + "learning_rate": 1.1309194337051332e-05, + "loss": 0.871, + "step": 8594 + }, + { + "epoch": 3.896192203082502, + "grad_norm": 0.2058557606191269, + "learning_rate": 1.1300375197056855e-05, + "loss": 0.871, + "step": 8595 + }, + { + "epoch": 3.8966455122393473, + "grad_norm": 0.1584655587353434, + "learning_rate": 1.129155893141416e-05, + "loss": 0.8714, + "step": 8596 + }, + { + "epoch": 3.897098821396192, + "grad_norm": 0.1583999196916172, + "learning_rate": 1.1282745541006199e-05, + "loss": 0.8682, + "step": 8597 + }, + { + "epoch": 3.897552130553037, + "grad_norm": 0.16909527097111882, + "learning_rate": 1.1273935026715681e-05, + "loss": 0.8597, + "step": 8598 + }, + { + "epoch": 3.8980054397098822, + "grad_norm": 0.15728442861827907, + "learning_rate": 1.1265127389424984e-05, + "loss": 0.8602, + "step": 8599 + }, + { + "epoch": 3.898458748866727, + "grad_norm": 0.16006432979443053, + "learning_rate": 1.1256322630016241e-05, + "loss": 0.8619, + "step": 8600 + }, + { + "epoch": 3.898912058023572, + "grad_norm": 0.17470398578207377, + "learning_rate": 1.124752074937125e-05, + "loss": 0.8583, + "step": 8601 + }, + { + "epoch": 3.899365367180417, + "grad_norm": 0.15191752724006952, + "learning_rate": 1.1238721748371587e-05, + "loss": 0.8485, + "step": 8602 + }, + { + "epoch": 3.899818676337262, + "grad_norm": 0.17950513244402092, + "learning_rate": 1.1229925627898437e-05, + "loss": 0.8597, + "step": 8603 + }, + { + "epoch": 3.900271985494107, + "grad_norm": 0.1328014454514356, + "learning_rate": 1.1221132388832805e-05, + "loss": 0.8587, + "step": 8604 + }, + { + "epoch": 3.900725294650952, + "grad_norm": 0.1879874342912766, + "learning_rate": 1.1212342032055333e-05, + "loss": 0.8624, + "step": 8605 + }, + { + "epoch": 3.901178603807797, + "grad_norm": 0.12766134692546963, + "learning_rate": 1.1203554558446421e-05, + "loss": 0.8738, + "step": 8606 + }, + { + "epoch": 3.9016319129646417, + "grad_norm": 0.19667264806926277, + "learning_rate": 1.1194769968886142e-05, + "loss": 0.8944, + "step": 8607 + }, + { + "epoch": 3.902085222121487, + "grad_norm": 0.1381333012138782, + "learning_rate": 1.1185988264254316e-05, + "loss": 0.8427, + "step": 8608 + }, + { + "epoch": 3.902538531278332, + "grad_norm": 0.15436067209592105, + "learning_rate": 1.1177209445430451e-05, + "loss": 0.8677, + "step": 8609 + }, + { + "epoch": 3.9029918404351767, + "grad_norm": 0.17428225440225506, + "learning_rate": 1.1168433513293753e-05, + "loss": 0.8481, + "step": 8610 + }, + { + "epoch": 3.903445149592022, + "grad_norm": 0.14195531252837665, + "learning_rate": 1.1159660468723188e-05, + "loss": 0.8763, + "step": 8611 + }, + { + "epoch": 3.9038984587488668, + "grad_norm": 0.18874685757473597, + "learning_rate": 1.1150890312597373e-05, + "loss": 0.8574, + "step": 8612 + }, + { + "epoch": 3.9043517679057116, + "grad_norm": 0.13734934948463917, + "learning_rate": 1.1142123045794695e-05, + "loss": 0.8666, + "step": 8613 + }, + { + "epoch": 3.904805077062557, + "grad_norm": 0.18575580485804155, + "learning_rate": 1.1133358669193192e-05, + "loss": 0.8814, + "step": 8614 + }, + { + "epoch": 3.9052583862194017, + "grad_norm": 0.15985811458063653, + "learning_rate": 1.1124597183670667e-05, + "loss": 0.8734, + "step": 8615 + }, + { + "epoch": 3.9057116953762465, + "grad_norm": 0.18095712512015744, + "learning_rate": 1.1115838590104585e-05, + "loss": 0.8624, + "step": 8616 + }, + { + "epoch": 3.906165004533092, + "grad_norm": 0.1572543158421929, + "learning_rate": 1.1107082889372172e-05, + "loss": 0.8573, + "step": 8617 + }, + { + "epoch": 3.9066183136899366, + "grad_norm": 0.16612989100203332, + "learning_rate": 1.1098330082350305e-05, + "loss": 0.8837, + "step": 8618 + }, + { + "epoch": 3.9070716228467814, + "grad_norm": 0.14420542722180585, + "learning_rate": 1.1089580169915638e-05, + "loss": 0.8712, + "step": 8619 + }, + { + "epoch": 3.9075249320036267, + "grad_norm": 0.15357864634479898, + "learning_rate": 1.1080833152944481e-05, + "loss": 0.8906, + "step": 8620 + }, + { + "epoch": 3.9079782411604715, + "grad_norm": 0.14877375207397456, + "learning_rate": 1.1072089032312876e-05, + "loss": 0.8643, + "step": 8621 + }, + { + "epoch": 3.9084315503173164, + "grad_norm": 0.15752010028964175, + "learning_rate": 1.1063347808896561e-05, + "loss": 0.8758, + "step": 8622 + }, + { + "epoch": 3.9088848594741616, + "grad_norm": 0.14396979239672608, + "learning_rate": 1.1054609483571017e-05, + "loss": 0.873, + "step": 8623 + }, + { + "epoch": 3.9093381686310065, + "grad_norm": 0.1410950957461436, + "learning_rate": 1.104587405721139e-05, + "loss": 0.8725, + "step": 8624 + }, + { + "epoch": 3.9097914777878513, + "grad_norm": 0.15325248595014798, + "learning_rate": 1.1037141530692583e-05, + "loss": 0.8732, + "step": 8625 + }, + { + "epoch": 3.910244786944696, + "grad_norm": 0.12398809646344022, + "learning_rate": 1.102841190488917e-05, + "loss": 0.845, + "step": 8626 + }, + { + "epoch": 3.9106980961015414, + "grad_norm": 0.12957609751408997, + "learning_rate": 1.1019685180675439e-05, + "loss": 0.8711, + "step": 8627 + }, + { + "epoch": 3.9111514052583862, + "grad_norm": 0.14059040702806289, + "learning_rate": 1.1010961358925422e-05, + "loss": 0.8841, + "step": 8628 + }, + { + "epoch": 3.911604714415231, + "grad_norm": 0.13405554015259558, + "learning_rate": 1.1002240440512812e-05, + "loss": 0.8502, + "step": 8629 + }, + { + "epoch": 3.912058023572076, + "grad_norm": 0.15257616118796305, + "learning_rate": 1.0993522426311056e-05, + "loss": 0.8719, + "step": 8630 + }, + { + "epoch": 3.912511332728921, + "grad_norm": 0.13954373216288934, + "learning_rate": 1.0984807317193264e-05, + "loss": 0.8725, + "step": 8631 + }, + { + "epoch": 3.912964641885766, + "grad_norm": 0.1343504006699594, + "learning_rate": 1.0976095114032303e-05, + "loss": 0.853, + "step": 8632 + }, + { + "epoch": 3.913417951042611, + "grad_norm": 0.14249747916251446, + "learning_rate": 1.0967385817700711e-05, + "loss": 0.8763, + "step": 8633 + }, + { + "epoch": 3.913871260199456, + "grad_norm": 0.12344215618273162, + "learning_rate": 1.0958679429070762e-05, + "loss": 0.8679, + "step": 8634 + }, + { + "epoch": 3.914324569356301, + "grad_norm": 0.1372252262918707, + "learning_rate": 1.0949975949014418e-05, + "loss": 0.8611, + "step": 8635 + }, + { + "epoch": 3.9147778785131457, + "grad_norm": 0.1417135443530343, + "learning_rate": 1.094127537840335e-05, + "loss": 0.8303, + "step": 8636 + }, + { + "epoch": 3.915231187669991, + "grad_norm": 0.13558454926719388, + "learning_rate": 1.0932577718108966e-05, + "loss": 0.8557, + "step": 8637 + }, + { + "epoch": 3.915684496826836, + "grad_norm": 0.14016416630606435, + "learning_rate": 1.0923882969002335e-05, + "loss": 0.8604, + "step": 8638 + }, + { + "epoch": 3.9161378059836807, + "grad_norm": 0.12780154964426704, + "learning_rate": 1.0915191131954295e-05, + "loss": 0.8784, + "step": 8639 + }, + { + "epoch": 3.916591115140526, + "grad_norm": 0.148898042467384, + "learning_rate": 1.090650220783534e-05, + "loss": 0.8588, + "step": 8640 + }, + { + "epoch": 3.9170444242973708, + "grad_norm": 0.13703745035842052, + "learning_rate": 1.0897816197515691e-05, + "loss": 0.8652, + "step": 8641 + }, + { + "epoch": 3.9174977334542156, + "grad_norm": 0.1642774925598848, + "learning_rate": 1.088913310186527e-05, + "loss": 0.8594, + "step": 8642 + }, + { + "epoch": 3.917951042611061, + "grad_norm": 0.1254065323805134, + "learning_rate": 1.0880452921753739e-05, + "loss": 0.8695, + "step": 8643 + }, + { + "epoch": 3.9184043517679057, + "grad_norm": 0.15458498689104438, + "learning_rate": 1.0871775658050411e-05, + "loss": 0.885, + "step": 8644 + }, + { + "epoch": 3.9188576609247505, + "grad_norm": 0.1250843854478672, + "learning_rate": 1.086310131162437e-05, + "loss": 0.8729, + "step": 8645 + }, + { + "epoch": 3.919310970081596, + "grad_norm": 0.1389913769821898, + "learning_rate": 1.0854429883344357e-05, + "loss": 0.8691, + "step": 8646 + }, + { + "epoch": 3.9197642792384406, + "grad_norm": 0.13657579739554074, + "learning_rate": 1.0845761374078857e-05, + "loss": 0.8532, + "step": 8647 + }, + { + "epoch": 3.9202175883952854, + "grad_norm": 0.1517831485604416, + "learning_rate": 1.0837095784696028e-05, + "loss": 0.8484, + "step": 8648 + }, + { + "epoch": 3.9206708975521307, + "grad_norm": 0.133266986627788, + "learning_rate": 1.0828433116063773e-05, + "loss": 0.8829, + "step": 8649 + }, + { + "epoch": 3.9211242067089755, + "grad_norm": 0.13444205171751292, + "learning_rate": 1.0819773369049664e-05, + "loss": 0.8552, + "step": 8650 + }, + { + "epoch": 3.9215775158658204, + "grad_norm": 0.19173438903654452, + "learning_rate": 1.0811116544521028e-05, + "loss": 0.8747, + "step": 8651 + }, + { + "epoch": 3.9220308250226656, + "grad_norm": 0.14227121214647617, + "learning_rate": 1.0802462643344849e-05, + "loss": 0.8609, + "step": 8652 + }, + { + "epoch": 3.9224841341795105, + "grad_norm": 0.14101021828704413, + "learning_rate": 1.0793811666387835e-05, + "loss": 0.8912, + "step": 8653 + }, + { + "epoch": 3.9229374433363553, + "grad_norm": 0.1291497252492836, + "learning_rate": 1.078516361451643e-05, + "loss": 0.8755, + "step": 8654 + }, + { + "epoch": 3.9233907524932006, + "grad_norm": 0.12598469168893422, + "learning_rate": 1.0776518488596741e-05, + "loss": 0.8617, + "step": 8655 + }, + { + "epoch": 3.9238440616500454, + "grad_norm": 0.14470643854494394, + "learning_rate": 1.076787628949462e-05, + "loss": 0.8757, + "step": 8656 + }, + { + "epoch": 3.9242973708068902, + "grad_norm": 0.14553249991260658, + "learning_rate": 1.0759237018075593e-05, + "loss": 0.8797, + "step": 8657 + }, + { + "epoch": 3.9247506799637355, + "grad_norm": 0.12715495325463602, + "learning_rate": 1.0750600675204921e-05, + "loss": 0.8534, + "step": 8658 + }, + { + "epoch": 3.9252039891205803, + "grad_norm": 0.155330321021393, + "learning_rate": 1.0741967261747556e-05, + "loss": 0.8734, + "step": 8659 + }, + { + "epoch": 3.925657298277425, + "grad_norm": 0.1344940385263514, + "learning_rate": 1.073333677856816e-05, + "loss": 0.8669, + "step": 8660 + }, + { + "epoch": 3.9261106074342704, + "grad_norm": 0.13408304659875767, + "learning_rate": 1.0724709226531086e-05, + "loss": 0.8557, + "step": 8661 + }, + { + "epoch": 3.9265639165911153, + "grad_norm": 0.14071547662531197, + "learning_rate": 1.0716084606500434e-05, + "loss": 0.8651, + "step": 8662 + }, + { + "epoch": 3.92701722574796, + "grad_norm": 0.15367516628333575, + "learning_rate": 1.0707462919339959e-05, + "loss": 0.8503, + "step": 8663 + }, + { + "epoch": 3.927470534904805, + "grad_norm": 0.1323098942463158, + "learning_rate": 1.0698844165913172e-05, + "loss": 0.8622, + "step": 8664 + }, + { + "epoch": 3.92792384406165, + "grad_norm": 0.1307789259709397, + "learning_rate": 1.0690228347083247e-05, + "loss": 0.874, + "step": 8665 + }, + { + "epoch": 3.928377153218495, + "grad_norm": 0.14587646530673706, + "learning_rate": 1.0681615463713104e-05, + "loss": 0.8615, + "step": 8666 + }, + { + "epoch": 3.92883046237534, + "grad_norm": 0.1542854906007598, + "learning_rate": 1.0673005516665339e-05, + "loss": 0.8721, + "step": 8667 + }, + { + "epoch": 3.929283771532185, + "grad_norm": 0.13310115702877678, + "learning_rate": 1.0664398506802249e-05, + "loss": 0.891, + "step": 8668 + }, + { + "epoch": 3.92973708068903, + "grad_norm": 0.14761024488803462, + "learning_rate": 1.0655794434985873e-05, + "loss": 0.8656, + "step": 8669 + }, + { + "epoch": 3.9301903898458748, + "grad_norm": 0.15589432460314678, + "learning_rate": 1.064719330207792e-05, + "loss": 0.8498, + "step": 8670 + }, + { + "epoch": 3.9306436990027196, + "grad_norm": 0.16893228912911887, + "learning_rate": 1.0638595108939831e-05, + "loss": 0.8614, + "step": 8671 + }, + { + "epoch": 3.931097008159565, + "grad_norm": 0.14384034116486916, + "learning_rate": 1.0629999856432725e-05, + "loss": 0.866, + "step": 8672 + }, + { + "epoch": 3.9315503173164097, + "grad_norm": 0.12930259721262366, + "learning_rate": 1.062140754541746e-05, + "loss": 0.8841, + "step": 8673 + }, + { + "epoch": 3.9320036264732545, + "grad_norm": 0.14290958024161035, + "learning_rate": 1.0612818176754557e-05, + "loss": 0.8675, + "step": 8674 + }, + { + "epoch": 3.9324569356301, + "grad_norm": 0.1474990706323588, + "learning_rate": 1.0604231751304295e-05, + "loss": 0.8681, + "step": 8675 + }, + { + "epoch": 3.9329102447869446, + "grad_norm": 0.1439151493677013, + "learning_rate": 1.0595648269926607e-05, + "loss": 0.8508, + "step": 8676 + }, + { + "epoch": 3.9333635539437894, + "grad_norm": 0.1294598773484825, + "learning_rate": 1.0587067733481171e-05, + "loss": 0.8509, + "step": 8677 + }, + { + "epoch": 3.9338168631006347, + "grad_norm": 0.1599279286799101, + "learning_rate": 1.0578490142827342e-05, + "loss": 0.902, + "step": 8678 + }, + { + "epoch": 3.9342701722574795, + "grad_norm": 0.13184163411544533, + "learning_rate": 1.0569915498824197e-05, + "loss": 0.8556, + "step": 8679 + }, + { + "epoch": 3.9347234814143244, + "grad_norm": 0.14215198253059488, + "learning_rate": 1.0561343802330496e-05, + "loss": 0.8702, + "step": 8680 + }, + { + "epoch": 3.9351767905711696, + "grad_norm": 0.13617756071087272, + "learning_rate": 1.0552775054204742e-05, + "loss": 0.8532, + "step": 8681 + }, + { + "epoch": 3.9356300997280145, + "grad_norm": 0.1359840663647679, + "learning_rate": 1.0544209255305113e-05, + "loss": 0.8533, + "step": 8682 + }, + { + "epoch": 3.9360834088848593, + "grad_norm": 0.13003172457941276, + "learning_rate": 1.0535646406489484e-05, + "loss": 0.862, + "step": 8683 + }, + { + "epoch": 3.9365367180417046, + "grad_norm": 0.12515953235363889, + "learning_rate": 1.0527086508615475e-05, + "loss": 0.8789, + "step": 8684 + }, + { + "epoch": 3.9369900271985494, + "grad_norm": 0.15020916455872405, + "learning_rate": 1.0518529562540354e-05, + "loss": 0.8663, + "step": 8685 + }, + { + "epoch": 3.9374433363553942, + "grad_norm": 0.12878626512028118, + "learning_rate": 1.0509975569121159e-05, + "loss": 0.8527, + "step": 8686 + }, + { + "epoch": 3.9378966455122395, + "grad_norm": 0.1572511398541704, + "learning_rate": 1.050142452921456e-05, + "loss": 0.8695, + "step": 8687 + }, + { + "epoch": 3.9383499546690843, + "grad_norm": 0.12846463129263733, + "learning_rate": 1.0492876443677012e-05, + "loss": 0.8457, + "step": 8688 + }, + { + "epoch": 3.938803263825929, + "grad_norm": 0.15437181756221113, + "learning_rate": 1.048433131336459e-05, + "loss": 0.8569, + "step": 8689 + }, + { + "epoch": 3.9392565729827744, + "grad_norm": 0.14318743168298415, + "learning_rate": 1.0475789139133141e-05, + "loss": 0.8623, + "step": 8690 + }, + { + "epoch": 3.9397098821396193, + "grad_norm": 0.13897463282954756, + "learning_rate": 1.0467249921838172e-05, + "loss": 0.8599, + "step": 8691 + }, + { + "epoch": 3.940163191296464, + "grad_norm": 0.15535233352196892, + "learning_rate": 1.0458713662334934e-05, + "loss": 0.8599, + "step": 8692 + }, + { + "epoch": 3.9406165004533094, + "grad_norm": 0.1454130341312731, + "learning_rate": 1.0450180361478334e-05, + "loss": 0.8407, + "step": 8693 + }, + { + "epoch": 3.941069809610154, + "grad_norm": 0.1389864460255382, + "learning_rate": 1.0441650020123011e-05, + "loss": 0.851, + "step": 8694 + }, + { + "epoch": 3.941523118766999, + "grad_norm": 0.16328376977713058, + "learning_rate": 1.0433122639123318e-05, + "loss": 0.8628, + "step": 8695 + }, + { + "epoch": 3.9419764279238443, + "grad_norm": 0.12142459268250488, + "learning_rate": 1.0424598219333278e-05, + "loss": 0.8752, + "step": 8696 + }, + { + "epoch": 3.942429737080689, + "grad_norm": 0.13394282599323784, + "learning_rate": 1.0416076761606657e-05, + "loss": 0.8566, + "step": 8697 + }, + { + "epoch": 3.942883046237534, + "grad_norm": 0.1342314283046109, + "learning_rate": 1.0407558266796896e-05, + "loss": 0.862, + "step": 8698 + }, + { + "epoch": 3.943336355394379, + "grad_norm": 0.13547528162100017, + "learning_rate": 1.0399042735757146e-05, + "loss": 0.8641, + "step": 8699 + }, + { + "epoch": 3.943789664551224, + "grad_norm": 0.14542393021499944, + "learning_rate": 1.0390530169340245e-05, + "loss": 0.8489, + "step": 8700 + }, + { + "epoch": 3.944242973708069, + "grad_norm": 0.13505454446109422, + "learning_rate": 1.0382020568398782e-05, + "loss": 0.8694, + "step": 8701 + }, + { + "epoch": 3.944696282864914, + "grad_norm": 0.13300258717877939, + "learning_rate": 1.0373513933784998e-05, + "loss": 0.8682, + "step": 8702 + }, + { + "epoch": 3.945149592021759, + "grad_norm": 0.14689992753760656, + "learning_rate": 1.036501026635087e-05, + "loss": 0.8773, + "step": 8703 + }, + { + "epoch": 3.945602901178604, + "grad_norm": 0.13038122196512603, + "learning_rate": 1.0356509566948047e-05, + "loss": 0.8851, + "step": 8704 + }, + { + "epoch": 3.9460562103354486, + "grad_norm": 0.14758573940529282, + "learning_rate": 1.0348011836427925e-05, + "loss": 0.8791, + "step": 8705 + }, + { + "epoch": 3.946509519492294, + "grad_norm": 0.12817005676791982, + "learning_rate": 1.0339517075641555e-05, + "loss": 0.8562, + "step": 8706 + }, + { + "epoch": 3.9469628286491387, + "grad_norm": 0.13661525231702767, + "learning_rate": 1.0331025285439727e-05, + "loss": 0.8268, + "step": 8707 + }, + { + "epoch": 3.9474161378059835, + "grad_norm": 0.13890764562921967, + "learning_rate": 1.0322536466672917e-05, + "loss": 0.8611, + "step": 8708 + }, + { + "epoch": 3.9478694469628284, + "grad_norm": 0.1556129816502613, + "learning_rate": 1.0314050620191285e-05, + "loss": 0.8669, + "step": 8709 + }, + { + "epoch": 3.9483227561196736, + "grad_norm": 0.11455118278553558, + "learning_rate": 1.0305567746844738e-05, + "loss": 0.8875, + "step": 8710 + }, + { + "epoch": 3.9487760652765185, + "grad_norm": 0.16838250804798463, + "learning_rate": 1.0297087847482845e-05, + "loss": 0.8628, + "step": 8711 + }, + { + "epoch": 3.9492293744333633, + "grad_norm": 0.15098269169459483, + "learning_rate": 1.028861092295491e-05, + "loss": 0.8482, + "step": 8712 + }, + { + "epoch": 3.9496826835902086, + "grad_norm": 0.16148961520565283, + "learning_rate": 1.0280136974109895e-05, + "loss": 0.8671, + "step": 8713 + }, + { + "epoch": 3.9501359927470534, + "grad_norm": 0.15526147367128929, + "learning_rate": 1.0271666001796525e-05, + "loss": 0.8741, + "step": 8714 + }, + { + "epoch": 3.9505893019038982, + "grad_norm": 0.17896970549743513, + "learning_rate": 1.026319800686316e-05, + "loss": 0.8534, + "step": 8715 + }, + { + "epoch": 3.9510426110607435, + "grad_norm": 0.13656266603278266, + "learning_rate": 1.025473299015792e-05, + "loss": 0.8597, + "step": 8716 + }, + { + "epoch": 3.9514959202175883, + "grad_norm": 0.13179814704057755, + "learning_rate": 1.0246270952528597e-05, + "loss": 0.8534, + "step": 8717 + }, + { + "epoch": 3.951949229374433, + "grad_norm": 0.14802106084015515, + "learning_rate": 1.0237811894822677e-05, + "loss": 0.8747, + "step": 8718 + }, + { + "epoch": 3.9524025385312784, + "grad_norm": 0.15639307932840363, + "learning_rate": 1.0229355817887354e-05, + "loss": 0.8806, + "step": 8719 + }, + { + "epoch": 3.9528558476881233, + "grad_norm": 0.16085758490072277, + "learning_rate": 1.0220902722569553e-05, + "loss": 0.8591, + "step": 8720 + }, + { + "epoch": 3.953309156844968, + "grad_norm": 0.14307355714463715, + "learning_rate": 1.0212452609715853e-05, + "loss": 0.8613, + "step": 8721 + }, + { + "epoch": 3.9537624660018134, + "grad_norm": 0.12845279847114097, + "learning_rate": 1.0204005480172574e-05, + "loss": 0.8623, + "step": 8722 + }, + { + "epoch": 3.954215775158658, + "grad_norm": 0.13696584513438725, + "learning_rate": 1.019556133478572e-05, + "loss": 0.871, + "step": 8723 + }, + { + "epoch": 3.954669084315503, + "grad_norm": 0.12443051395969457, + "learning_rate": 1.0187120174400981e-05, + "loss": 0.8758, + "step": 8724 + }, + { + "epoch": 3.9551223934723483, + "grad_norm": 0.12891759961361418, + "learning_rate": 1.0178681999863782e-05, + "loss": 0.8821, + "step": 8725 + }, + { + "epoch": 3.955575702629193, + "grad_norm": 0.20337707644334965, + "learning_rate": 1.0170246812019218e-05, + "loss": 0.8811, + "step": 8726 + }, + { + "epoch": 3.956029011786038, + "grad_norm": 0.14760116803243187, + "learning_rate": 1.016181461171211e-05, + "loss": 0.8861, + "step": 8727 + }, + { + "epoch": 3.956482320942883, + "grad_norm": 0.12592884439489757, + "learning_rate": 1.0153385399786955e-05, + "loss": 0.8497, + "step": 8728 + }, + { + "epoch": 3.956935630099728, + "grad_norm": 0.1621922823306846, + "learning_rate": 1.014495917708798e-05, + "loss": 0.8653, + "step": 8729 + }, + { + "epoch": 3.957388939256573, + "grad_norm": 0.12249225312709475, + "learning_rate": 1.013653594445907e-05, + "loss": 0.8689, + "step": 8730 + }, + { + "epoch": 3.957842248413418, + "grad_norm": 0.15706399055852563, + "learning_rate": 1.0128115702743871e-05, + "loss": 0.8569, + "step": 8731 + }, + { + "epoch": 3.958295557570263, + "grad_norm": 0.12219492458239548, + "learning_rate": 1.011969845278566e-05, + "loss": 0.864, + "step": 8732 + }, + { + "epoch": 3.958748866727108, + "grad_norm": 0.14727551523476115, + "learning_rate": 1.0111284195427476e-05, + "loss": 0.8788, + "step": 8733 + }, + { + "epoch": 3.959202175883953, + "grad_norm": 0.12479154331817577, + "learning_rate": 1.0102872931512025e-05, + "loss": 0.8512, + "step": 8734 + }, + { + "epoch": 3.959655485040798, + "grad_norm": 0.14690136396275896, + "learning_rate": 1.00944646618817e-05, + "loss": 0.8628, + "step": 8735 + }, + { + "epoch": 3.9601087941976427, + "grad_norm": 0.12244393377163548, + "learning_rate": 1.0086059387378642e-05, + "loss": 0.8664, + "step": 8736 + }, + { + "epoch": 3.960562103354488, + "grad_norm": 0.14554250199608673, + "learning_rate": 1.0077657108844656e-05, + "loss": 0.8895, + "step": 8737 + }, + { + "epoch": 3.961015412511333, + "grad_norm": 0.12318832584951761, + "learning_rate": 1.0069257827121248e-05, + "loss": 0.8652, + "step": 8738 + }, + { + "epoch": 3.9614687216681777, + "grad_norm": 0.1214618462994439, + "learning_rate": 1.006086154304962e-05, + "loss": 0.8724, + "step": 8739 + }, + { + "epoch": 3.961922030825023, + "grad_norm": 0.1394181406387694, + "learning_rate": 1.0052468257470713e-05, + "loss": 0.8623, + "step": 8740 + }, + { + "epoch": 3.9623753399818678, + "grad_norm": 0.12284605802002295, + "learning_rate": 1.0044077971225112e-05, + "loss": 0.88, + "step": 8741 + }, + { + "epoch": 3.9628286491387126, + "grad_norm": 0.1414362376674521, + "learning_rate": 1.003569068515315e-05, + "loss": 0.8543, + "step": 8742 + }, + { + "epoch": 3.9632819582955574, + "grad_norm": 0.12566211476378739, + "learning_rate": 1.0027306400094821e-05, + "loss": 0.8661, + "step": 8743 + }, + { + "epoch": 3.9637352674524027, + "grad_norm": 0.1593442418808219, + "learning_rate": 1.0018925116889852e-05, + "loss": 0.8752, + "step": 8744 + }, + { + "epoch": 3.9641885766092475, + "grad_norm": 0.13622306590124253, + "learning_rate": 1.0010546836377636e-05, + "loss": 0.8791, + "step": 8745 + }, + { + "epoch": 3.9646418857660923, + "grad_norm": 0.1301578646959358, + "learning_rate": 1.0002171559397307e-05, + "loss": 0.8561, + "step": 8746 + }, + { + "epoch": 3.9650951949229376, + "grad_norm": 0.15281167935660822, + "learning_rate": 9.993799286787639e-06, + "loss": 0.8694, + "step": 8747 + }, + { + "epoch": 3.9655485040797824, + "grad_norm": 0.13573213649975807, + "learning_rate": 9.985430019387174e-06, + "loss": 0.8562, + "step": 8748 + }, + { + "epoch": 3.9660018132366273, + "grad_norm": 0.14819383167710523, + "learning_rate": 9.977063758034106e-06, + "loss": 0.8628, + "step": 8749 + }, + { + "epoch": 3.966455122393472, + "grad_norm": 0.1594148224959517, + "learning_rate": 9.968700503566327e-06, + "loss": 0.8593, + "step": 8750 + }, + { + "epoch": 3.9669084315503174, + "grad_norm": 0.1447243854667321, + "learning_rate": 9.960340256821462e-06, + "loss": 0.8569, + "step": 8751 + }, + { + "epoch": 3.967361740707162, + "grad_norm": 0.14314007525307157, + "learning_rate": 9.951983018636792e-06, + "loss": 0.8502, + "step": 8752 + }, + { + "epoch": 3.967815049864007, + "grad_norm": 0.1469685565143469, + "learning_rate": 9.943628789849349e-06, + "loss": 0.8569, + "step": 8753 + }, + { + "epoch": 3.9682683590208523, + "grad_norm": 0.1332214090461139, + "learning_rate": 9.935277571295803e-06, + "loss": 0.843, + "step": 8754 + }, + { + "epoch": 3.968721668177697, + "grad_norm": 0.1423946915148195, + "learning_rate": 9.926929363812588e-06, + "loss": 0.8677, + "step": 8755 + }, + { + "epoch": 3.969174977334542, + "grad_norm": 0.13446406308073194, + "learning_rate": 9.91858416823576e-06, + "loss": 0.8928, + "step": 8756 + }, + { + "epoch": 3.969628286491387, + "grad_norm": 0.12555958765708736, + "learning_rate": 9.910241985401142e-06, + "loss": 0.8657, + "step": 8757 + }, + { + "epoch": 3.970081595648232, + "grad_norm": 0.13680919457835441, + "learning_rate": 9.901902816144213e-06, + "loss": 0.8857, + "step": 8758 + }, + { + "epoch": 3.970534904805077, + "grad_norm": 0.12469208801347693, + "learning_rate": 9.893566661300187e-06, + "loss": 0.8586, + "step": 8759 + }, + { + "epoch": 3.970988213961922, + "grad_norm": 0.14703146689088337, + "learning_rate": 9.885233521703928e-06, + "loss": 0.8816, + "step": 8760 + }, + { + "epoch": 3.971441523118767, + "grad_norm": 0.13448668012663342, + "learning_rate": 9.876903398190048e-06, + "loss": 0.856, + "step": 8761 + }, + { + "epoch": 3.971894832275612, + "grad_norm": 0.12891979688366104, + "learning_rate": 9.868576291592808e-06, + "loss": 0.8593, + "step": 8762 + }, + { + "epoch": 3.972348141432457, + "grad_norm": 0.15699976148039219, + "learning_rate": 9.860252202746223e-06, + "loss": 0.8644, + "step": 8763 + }, + { + "epoch": 3.972801450589302, + "grad_norm": 0.15639993622936055, + "learning_rate": 9.851931132483959e-06, + "loss": 0.8704, + "step": 8764 + }, + { + "epoch": 3.9732547597461467, + "grad_norm": 0.14846043684698285, + "learning_rate": 9.843613081639382e-06, + "loss": 0.8694, + "step": 8765 + }, + { + "epoch": 3.973708068902992, + "grad_norm": 0.15385297967183176, + "learning_rate": 9.835298051045599e-06, + "loss": 0.8696, + "step": 8766 + }, + { + "epoch": 3.974161378059837, + "grad_norm": 0.13808278221249565, + "learning_rate": 9.826986041535354e-06, + "loss": 0.8669, + "step": 8767 + }, + { + "epoch": 3.9746146872166817, + "grad_norm": 0.1594223912572024, + "learning_rate": 9.818677053941145e-06, + "loss": 0.8792, + "step": 8768 + }, + { + "epoch": 3.975067996373527, + "grad_norm": 0.14651345444536343, + "learning_rate": 9.810371089095119e-06, + "loss": 0.8645, + "step": 8769 + }, + { + "epoch": 3.9755213055303718, + "grad_norm": 0.13431190593036008, + "learning_rate": 9.802068147829167e-06, + "loss": 0.8738, + "step": 8770 + }, + { + "epoch": 3.9759746146872166, + "grad_norm": 0.1487998032605582, + "learning_rate": 9.793768230974824e-06, + "loss": 0.8648, + "step": 8771 + }, + { + "epoch": 3.976427923844062, + "grad_norm": 0.12009280448060282, + "learning_rate": 9.785471339363383e-06, + "loss": 0.8688, + "step": 8772 + }, + { + "epoch": 3.9768812330009067, + "grad_norm": 0.13519970908853912, + "learning_rate": 9.777177473825774e-06, + "loss": 0.8315, + "step": 8773 + }, + { + "epoch": 3.9773345421577515, + "grad_norm": 0.1352575240503575, + "learning_rate": 9.768886635192688e-06, + "loss": 0.8586, + "step": 8774 + }, + { + "epoch": 3.977787851314597, + "grad_norm": 0.11844467221876069, + "learning_rate": 9.760598824294428e-06, + "loss": 0.8815, + "step": 8775 + }, + { + "epoch": 3.9782411604714416, + "grad_norm": 0.14519696860338763, + "learning_rate": 9.752314041961082e-06, + "loss": 0.8483, + "step": 8776 + }, + { + "epoch": 3.9786944696282864, + "grad_norm": 0.12077496148309745, + "learning_rate": 9.744032289022364e-06, + "loss": 0.8626, + "step": 8777 + }, + { + "epoch": 3.9791477787851317, + "grad_norm": 0.13841421970460288, + "learning_rate": 9.73575356630775e-06, + "loss": 0.8453, + "step": 8778 + }, + { + "epoch": 3.9796010879419765, + "grad_norm": 0.12403084085973108, + "learning_rate": 9.727477874646359e-06, + "loss": 0.8578, + "step": 8779 + }, + { + "epoch": 3.9800543970988214, + "grad_norm": 0.1390862292086526, + "learning_rate": 9.719205214867018e-06, + "loss": 0.8712, + "step": 8780 + }, + { + "epoch": 3.9805077062556666, + "grad_norm": 0.12574370015874953, + "learning_rate": 9.710935587798276e-06, + "loss": 0.8716, + "step": 8781 + }, + { + "epoch": 3.9809610154125115, + "grad_norm": 0.13748370495007822, + "learning_rate": 9.702668994268345e-06, + "loss": 0.8752, + "step": 8782 + }, + { + "epoch": 3.9814143245693563, + "grad_norm": 0.12301394417653655, + "learning_rate": 9.694405435105167e-06, + "loss": 0.865, + "step": 8783 + }, + { + "epoch": 3.981867633726201, + "grad_norm": 0.1449191580911615, + "learning_rate": 9.686144911136339e-06, + "loss": 0.8729, + "step": 8784 + }, + { + "epoch": 3.9823209428830464, + "grad_norm": 0.12208999601663287, + "learning_rate": 9.677887423189203e-06, + "loss": 0.8607, + "step": 8785 + }, + { + "epoch": 3.982774252039891, + "grad_norm": 0.15551233110481227, + "learning_rate": 9.66963297209074e-06, + "loss": 0.8602, + "step": 8786 + }, + { + "epoch": 3.983227561196736, + "grad_norm": 0.1253111343497961, + "learning_rate": 9.661381558667692e-06, + "loss": 0.8623, + "step": 8787 + }, + { + "epoch": 3.983680870353581, + "grad_norm": 0.14013795012617744, + "learning_rate": 9.653133183746428e-06, + "loss": 0.892, + "step": 8788 + }, + { + "epoch": 3.984134179510426, + "grad_norm": 0.12166764019321923, + "learning_rate": 9.644887848153076e-06, + "loss": 0.8702, + "step": 8789 + }, + { + "epoch": 3.984587488667271, + "grad_norm": 0.13560915871144025, + "learning_rate": 9.63664555271342e-06, + "loss": 0.8628, + "step": 8790 + }, + { + "epoch": 3.985040797824116, + "grad_norm": 0.14467449240094368, + "learning_rate": 9.628406298252937e-06, + "loss": 0.8522, + "step": 8791 + }, + { + "epoch": 3.985494106980961, + "grad_norm": 0.12268861835134415, + "learning_rate": 9.620170085596831e-06, + "loss": 0.8649, + "step": 8792 + }, + { + "epoch": 3.985947416137806, + "grad_norm": 0.14551154141018466, + "learning_rate": 9.611936915569968e-06, + "loss": 0.8591, + "step": 8793 + }, + { + "epoch": 3.9864007252946507, + "grad_norm": 0.12242938013646518, + "learning_rate": 9.603706788996954e-06, + "loss": 0.853, + "step": 8794 + }, + { + "epoch": 3.986854034451496, + "grad_norm": 0.14976913767006014, + "learning_rate": 9.595479706702013e-06, + "loss": 0.8563, + "step": 8795 + }, + { + "epoch": 3.987307343608341, + "grad_norm": 0.1310257564458422, + "learning_rate": 9.58725566950915e-06, + "loss": 0.8499, + "step": 8796 + }, + { + "epoch": 3.9877606527651857, + "grad_norm": 0.1422231950383188, + "learning_rate": 9.579034678242002e-06, + "loss": 0.8782, + "step": 8797 + }, + { + "epoch": 3.988213961922031, + "grad_norm": 0.11942374873411316, + "learning_rate": 9.57081673372395e-06, + "loss": 0.8643, + "step": 8798 + }, + { + "epoch": 3.9886672710788758, + "grad_norm": 0.15830163565516286, + "learning_rate": 9.562601836778018e-06, + "loss": 0.8791, + "step": 8799 + }, + { + "epoch": 3.9891205802357206, + "grad_norm": 0.1460766961024829, + "learning_rate": 9.554389988226976e-06, + "loss": 0.8682, + "step": 8800 + }, + { + "epoch": 3.989573889392566, + "grad_norm": 0.16296084568958658, + "learning_rate": 9.546181188893241e-06, + "loss": 0.8567, + "step": 8801 + }, + { + "epoch": 3.9900271985494107, + "grad_norm": 0.16107993762741035, + "learning_rate": 9.537975439598979e-06, + "loss": 0.8683, + "step": 8802 + }, + { + "epoch": 3.9904805077062555, + "grad_norm": 0.17869675643009536, + "learning_rate": 9.52977274116599e-06, + "loss": 0.8668, + "step": 8803 + }, + { + "epoch": 3.990933816863101, + "grad_norm": 0.1386900530287897, + "learning_rate": 9.521573094415823e-06, + "loss": 0.8765, + "step": 8804 + }, + { + "epoch": 3.9913871260199456, + "grad_norm": 0.17346408926866413, + "learning_rate": 9.513376500169685e-06, + "loss": 0.8661, + "step": 8805 + }, + { + "epoch": 3.9918404351767904, + "grad_norm": 0.13389441947341887, + "learning_rate": 9.505182959248476e-06, + "loss": 0.8918, + "step": 8806 + }, + { + "epoch": 3.9922937443336357, + "grad_norm": 0.15733019267051762, + "learning_rate": 9.496992472472825e-06, + "loss": 0.8669, + "step": 8807 + }, + { + "epoch": 3.9927470534904805, + "grad_norm": 0.13790454208878558, + "learning_rate": 9.488805040663021e-06, + "loss": 0.8649, + "step": 8808 + }, + { + "epoch": 3.9932003626473254, + "grad_norm": 0.13256930765972733, + "learning_rate": 9.480620664639066e-06, + "loss": 0.8874, + "step": 8809 + }, + { + "epoch": 3.9936536718041706, + "grad_norm": 0.12424555440560578, + "learning_rate": 9.47243934522064e-06, + "loss": 0.8576, + "step": 8810 + }, + { + "epoch": 3.9941069809610155, + "grad_norm": 0.14632271077373574, + "learning_rate": 9.464261083227142e-06, + "loss": 0.8708, + "step": 8811 + }, + { + "epoch": 3.9945602901178603, + "grad_norm": 0.1277909972774532, + "learning_rate": 9.45608587947763e-06, + "loss": 0.8665, + "step": 8812 + }, + { + "epoch": 3.9950135992747056, + "grad_norm": 0.12606112602982397, + "learning_rate": 9.447913734790908e-06, + "loss": 0.8727, + "step": 8813 + }, + { + "epoch": 3.9954669084315504, + "grad_norm": 0.12478148091835455, + "learning_rate": 9.439744649985387e-06, + "loss": 0.8505, + "step": 8814 + }, + { + "epoch": 3.995920217588395, + "grad_norm": 0.12811540968803242, + "learning_rate": 9.431578625879271e-06, + "loss": 0.8376, + "step": 8815 + }, + { + "epoch": 3.9963735267452405, + "grad_norm": 0.13274959257982602, + "learning_rate": 9.423415663290382e-06, + "loss": 0.8676, + "step": 8816 + }, + { + "epoch": 3.9968268359020853, + "grad_norm": 0.13633294419038106, + "learning_rate": 9.415255763036288e-06, + "loss": 0.8644, + "step": 8817 + }, + { + "epoch": 3.99728014505893, + "grad_norm": 0.1377666279609311, + "learning_rate": 9.407098925934207e-06, + "loss": 0.883, + "step": 8818 + }, + { + "epoch": 3.9977334542157754, + "grad_norm": 0.14993865833086217, + "learning_rate": 9.398945152801087e-06, + "loss": 0.8511, + "step": 8819 + }, + { + "epoch": 3.9981867633726202, + "grad_norm": 0.1373170682309801, + "learning_rate": 9.390794444453548e-06, + "loss": 0.8765, + "step": 8820 + }, + { + "epoch": 3.998640072529465, + "grad_norm": 0.13858182102999042, + "learning_rate": 9.382646801707894e-06, + "loss": 0.8724, + "step": 8821 + }, + { + "epoch": 3.99909338168631, + "grad_norm": 0.1334587026039683, + "learning_rate": 9.374502225380153e-06, + "loss": 0.8455, + "step": 8822 + }, + { + "epoch": 3.999546690843155, + "grad_norm": 0.14450740152100897, + "learning_rate": 9.36636071628601e-06, + "loss": 0.8427, + "step": 8823 + }, + { + "epoch": 4.0, + "grad_norm": 0.1444769668157407, + "learning_rate": 9.358222275240884e-06, + "loss": 0.8534, + "step": 8824 + }, + { + "epoch": 4.000453309156845, + "grad_norm": 0.14835306146966498, + "learning_rate": 9.350086903059842e-06, + "loss": 0.8501, + "step": 8825 + }, + { + "epoch": 4.00090661831369, + "grad_norm": 0.12603597586962875, + "learning_rate": 9.341954600557685e-06, + "loss": 0.8683, + "step": 8826 + }, + { + "epoch": 4.001359927470535, + "grad_norm": 0.13612796106870126, + "learning_rate": 9.333825368548864e-06, + "loss": 0.842, + "step": 8827 + }, + { + "epoch": 4.00181323662738, + "grad_norm": 0.14225838187295856, + "learning_rate": 9.325699207847574e-06, + "loss": 0.8601, + "step": 8828 + }, + { + "epoch": 4.002266545784225, + "grad_norm": 0.12723560694376307, + "learning_rate": 9.317576119267641e-06, + "loss": 0.8603, + "step": 8829 + }, + { + "epoch": 4.00271985494107, + "grad_norm": 0.14860903945598722, + "learning_rate": 9.309456103622652e-06, + "loss": 0.8508, + "step": 8830 + }, + { + "epoch": 4.003173164097915, + "grad_norm": 0.15037494703511728, + "learning_rate": 9.301339161725829e-06, + "loss": 0.858, + "step": 8831 + }, + { + "epoch": 4.0036264732547595, + "grad_norm": 0.16753654365818102, + "learning_rate": 9.293225294390109e-06, + "loss": 0.8639, + "step": 8832 + }, + { + "epoch": 4.004079782411605, + "grad_norm": 0.15308467130164072, + "learning_rate": 9.28511450242811e-06, + "loss": 0.8442, + "step": 8833 + }, + { + "epoch": 4.00453309156845, + "grad_norm": 0.1922809960915847, + "learning_rate": 9.277006786652176e-06, + "loss": 0.8599, + "step": 8834 + }, + { + "epoch": 4.004986400725294, + "grad_norm": 0.1458300302271452, + "learning_rate": 9.268902147874294e-06, + "loss": 0.8546, + "step": 8835 + }, + { + "epoch": 4.00543970988214, + "grad_norm": 0.16216920052121161, + "learning_rate": 9.26080058690619e-06, + "loss": 0.845, + "step": 8836 + }, + { + "epoch": 4.005893019038985, + "grad_norm": 0.13151035978915043, + "learning_rate": 9.25270210455925e-06, + "loss": 0.8548, + "step": 8837 + }, + { + "epoch": 4.006346328195829, + "grad_norm": 0.16271452712360024, + "learning_rate": 9.244606701644546e-06, + "loss": 0.8737, + "step": 8838 + }, + { + "epoch": 4.006799637352675, + "grad_norm": 0.16344788563548554, + "learning_rate": 9.236514378972878e-06, + "loss": 0.8511, + "step": 8839 + }, + { + "epoch": 4.00725294650952, + "grad_norm": 0.14363070358847707, + "learning_rate": 9.2284251373547e-06, + "loss": 0.8518, + "step": 8840 + }, + { + "epoch": 4.007706255666364, + "grad_norm": 0.14731087818039784, + "learning_rate": 9.220338977600187e-06, + "loss": 0.8783, + "step": 8841 + }, + { + "epoch": 4.00815956482321, + "grad_norm": 0.14369309598127084, + "learning_rate": 9.212255900519178e-06, + "loss": 0.8524, + "step": 8842 + }, + { + "epoch": 4.008612873980055, + "grad_norm": 0.16176657096323738, + "learning_rate": 9.204175906921233e-06, + "loss": 0.8469, + "step": 8843 + }, + { + "epoch": 4.009066183136899, + "grad_norm": 0.17142995296907243, + "learning_rate": 9.196098997615572e-06, + "loss": 0.8305, + "step": 8844 + }, + { + "epoch": 4.0095194922937445, + "grad_norm": 0.1696672857972172, + "learning_rate": 9.188025173411135e-06, + "loss": 0.852, + "step": 8845 + }, + { + "epoch": 4.009972801450589, + "grad_norm": 0.16031269288762137, + "learning_rate": 9.179954435116527e-06, + "loss": 0.8374, + "step": 8846 + }, + { + "epoch": 4.010426110607434, + "grad_norm": 0.1204832621668557, + "learning_rate": 9.171886783540058e-06, + "loss": 0.8478, + "step": 8847 + }, + { + "epoch": 4.010879419764279, + "grad_norm": 0.2093109427314353, + "learning_rate": 9.163822219489735e-06, + "loss": 0.8411, + "step": 8848 + }, + { + "epoch": 4.011332728921124, + "grad_norm": 0.1287304162518913, + "learning_rate": 9.155760743773233e-06, + "loss": 0.8682, + "step": 8849 + }, + { + "epoch": 4.011786038077969, + "grad_norm": 0.19128984720477232, + "learning_rate": 9.147702357197952e-06, + "loss": 0.8314, + "step": 8850 + }, + { + "epoch": 4.012239347234814, + "grad_norm": 0.13765777527312822, + "learning_rate": 9.139647060570955e-06, + "loss": 0.8545, + "step": 8851 + }, + { + "epoch": 4.012692656391659, + "grad_norm": 0.1559579712613772, + "learning_rate": 9.131594854699001e-06, + "loss": 0.8484, + "step": 8852 + }, + { + "epoch": 4.013145965548504, + "grad_norm": 0.14463300561176964, + "learning_rate": 9.123545740388536e-06, + "loss": 0.8636, + "step": 8853 + }, + { + "epoch": 4.013599274705349, + "grad_norm": 0.16378818429073316, + "learning_rate": 9.115499718445716e-06, + "loss": 0.8281, + "step": 8854 + }, + { + "epoch": 4.014052583862194, + "grad_norm": 0.15438631176455112, + "learning_rate": 9.107456789676354e-06, + "loss": 0.8706, + "step": 8855 + }, + { + "epoch": 4.014505893019039, + "grad_norm": 0.15940368501816382, + "learning_rate": 9.099416954886e-06, + "loss": 0.8381, + "step": 8856 + }, + { + "epoch": 4.014959202175884, + "grad_norm": 0.17878953710255382, + "learning_rate": 9.091380214879844e-06, + "loss": 0.864, + "step": 8857 + }, + { + "epoch": 4.015412511332729, + "grad_norm": 0.14693643919787422, + "learning_rate": 9.083346570462806e-06, + "loss": 0.8339, + "step": 8858 + }, + { + "epoch": 4.015865820489574, + "grad_norm": 0.16597828040746906, + "learning_rate": 9.07531602243946e-06, + "loss": 0.8753, + "step": 8859 + }, + { + "epoch": 4.016319129646419, + "grad_norm": 0.16509360563154962, + "learning_rate": 9.067288571614114e-06, + "loss": 0.8526, + "step": 8860 + }, + { + "epoch": 4.0167724388032635, + "grad_norm": 0.14018175577764044, + "learning_rate": 9.059264218790714e-06, + "loss": 0.8454, + "step": 8861 + }, + { + "epoch": 4.017225747960109, + "grad_norm": 0.1606425222763413, + "learning_rate": 9.051242964772947e-06, + "loss": 0.836, + "step": 8862 + }, + { + "epoch": 4.017679057116954, + "grad_norm": 0.13053678311910716, + "learning_rate": 9.043224810364153e-06, + "loss": 0.859, + "step": 8863 + }, + { + "epoch": 4.018132366273798, + "grad_norm": 0.1475357916086943, + "learning_rate": 9.035209756367366e-06, + "loss": 0.8633, + "step": 8864 + }, + { + "epoch": 4.018585675430644, + "grad_norm": 0.12710768689868787, + "learning_rate": 9.027197803585333e-06, + "loss": 0.852, + "step": 8865 + }, + { + "epoch": 4.019038984587489, + "grad_norm": 0.15142123416906478, + "learning_rate": 9.019188952820461e-06, + "loss": 0.8787, + "step": 8866 + }, + { + "epoch": 4.019492293744333, + "grad_norm": 0.13843594000520365, + "learning_rate": 9.011183204874876e-06, + "loss": 0.8588, + "step": 8867 + }, + { + "epoch": 4.019945602901179, + "grad_norm": 0.15005725833742067, + "learning_rate": 9.003180560550352e-06, + "loss": 0.8465, + "step": 8868 + }, + { + "epoch": 4.020398912058024, + "grad_norm": 0.14498263726807234, + "learning_rate": 8.995181020648406e-06, + "loss": 0.8345, + "step": 8869 + }, + { + "epoch": 4.020852221214868, + "grad_norm": 0.13340952930699834, + "learning_rate": 8.987184585970206e-06, + "loss": 0.8653, + "step": 8870 + }, + { + "epoch": 4.021305530371714, + "grad_norm": 0.14640479095234704, + "learning_rate": 8.979191257316611e-06, + "loss": 0.8455, + "step": 8871 + }, + { + "epoch": 4.021758839528559, + "grad_norm": 0.1281460983870801, + "learning_rate": 8.971201035488172e-06, + "loss": 0.86, + "step": 8872 + }, + { + "epoch": 4.022212148685403, + "grad_norm": 0.11916897253347933, + "learning_rate": 8.96321392128515e-06, + "loss": 0.8592, + "step": 8873 + }, + { + "epoch": 4.0226654578422485, + "grad_norm": 0.14919013644265333, + "learning_rate": 8.955229915507458e-06, + "loss": 0.8508, + "step": 8874 + }, + { + "epoch": 4.023118766999094, + "grad_norm": 0.12854263137211366, + "learning_rate": 8.94724901895474e-06, + "loss": 0.8712, + "step": 8875 + }, + { + "epoch": 4.023572076155938, + "grad_norm": 0.1303082330104462, + "learning_rate": 8.939271232426288e-06, + "loss": 0.8518, + "step": 8876 + }, + { + "epoch": 4.024025385312783, + "grad_norm": 0.12649985609962675, + "learning_rate": 8.931296556721115e-06, + "loss": 0.8485, + "step": 8877 + }, + { + "epoch": 4.024478694469629, + "grad_norm": 0.1410921108705833, + "learning_rate": 8.923324992637901e-06, + "loss": 0.873, + "step": 8878 + }, + { + "epoch": 4.024932003626473, + "grad_norm": 0.12451106327850135, + "learning_rate": 8.915356540975013e-06, + "loss": 0.8465, + "step": 8879 + }, + { + "epoch": 4.025385312783318, + "grad_norm": 0.11929303386390966, + "learning_rate": 8.907391202530534e-06, + "loss": 0.8739, + "step": 8880 + }, + { + "epoch": 4.025838621940164, + "grad_norm": 0.13585798456352316, + "learning_rate": 8.899428978102192e-06, + "loss": 0.8731, + "step": 8881 + }, + { + "epoch": 4.026291931097008, + "grad_norm": 0.11438944605343496, + "learning_rate": 8.891469868487453e-06, + "loss": 0.8613, + "step": 8882 + }, + { + "epoch": 4.026745240253853, + "grad_norm": 0.15037595927633368, + "learning_rate": 8.883513874483421e-06, + "loss": 0.8489, + "step": 8883 + }, + { + "epoch": 4.027198549410698, + "grad_norm": 0.12868719350987318, + "learning_rate": 8.875560996886934e-06, + "loss": 0.8703, + "step": 8884 + }, + { + "epoch": 4.027651858567543, + "grad_norm": 0.13790889966295172, + "learning_rate": 8.86761123649448e-06, + "loss": 0.855, + "step": 8885 + }, + { + "epoch": 4.028105167724388, + "grad_norm": 0.11717823924898468, + "learning_rate": 8.859664594102266e-06, + "loss": 0.847, + "step": 8886 + }, + { + "epoch": 4.028558476881233, + "grad_norm": 0.13773574606929975, + "learning_rate": 8.85172107050615e-06, + "loss": 0.8453, + "step": 8887 + }, + { + "epoch": 4.029011786038078, + "grad_norm": 0.11246905311859723, + "learning_rate": 8.843780666501724e-06, + "loss": 0.854, + "step": 8888 + }, + { + "epoch": 4.029465095194923, + "grad_norm": 0.12714039236400893, + "learning_rate": 8.83584338288423e-06, + "loss": 0.8702, + "step": 8889 + }, + { + "epoch": 4.0299184043517675, + "grad_norm": 0.12983882117854878, + "learning_rate": 8.827909220448614e-06, + "loss": 0.8537, + "step": 8890 + }, + { + "epoch": 4.030371713508613, + "grad_norm": 0.11042993342696389, + "learning_rate": 8.819978179989489e-06, + "loss": 0.857, + "step": 8891 + }, + { + "epoch": 4.030825022665458, + "grad_norm": 0.13606074856951272, + "learning_rate": 8.812050262301196e-06, + "loss": 0.832, + "step": 8892 + }, + { + "epoch": 4.031278331822302, + "grad_norm": 0.11695525970198914, + "learning_rate": 8.804125468177732e-06, + "loss": 0.8721, + "step": 8893 + }, + { + "epoch": 4.031731640979148, + "grad_norm": 0.14969737000245384, + "learning_rate": 8.796203798412769e-06, + "loss": 0.8581, + "step": 8894 + }, + { + "epoch": 4.032184950135993, + "grad_norm": 0.14749912310927354, + "learning_rate": 8.788285253799715e-06, + "loss": 0.8599, + "step": 8895 + }, + { + "epoch": 4.032638259292837, + "grad_norm": 0.14059766997136147, + "learning_rate": 8.780369835131614e-06, + "loss": 0.8429, + "step": 8896 + }, + { + "epoch": 4.033091568449683, + "grad_norm": 0.1622502914089052, + "learning_rate": 8.772457543201237e-06, + "loss": 0.858, + "step": 8897 + }, + { + "epoch": 4.033544877606528, + "grad_norm": 0.1179471286977934, + "learning_rate": 8.764548378800998e-06, + "loss": 0.852, + "step": 8898 + }, + { + "epoch": 4.033998186763372, + "grad_norm": 0.15065182964677723, + "learning_rate": 8.756642342723056e-06, + "loss": 0.8424, + "step": 8899 + }, + { + "epoch": 4.034451495920218, + "grad_norm": 0.14133594258824772, + "learning_rate": 8.748739435759188e-06, + "loss": 0.8405, + "step": 8900 + }, + { + "epoch": 4.034904805077063, + "grad_norm": 0.1357058644896233, + "learning_rate": 8.740839658700926e-06, + "loss": 0.8618, + "step": 8901 + }, + { + "epoch": 4.035358114233907, + "grad_norm": 0.17580099474672578, + "learning_rate": 8.732943012339432e-06, + "loss": 0.8578, + "step": 8902 + }, + { + "epoch": 4.0358114233907525, + "grad_norm": 0.1480181162118561, + "learning_rate": 8.725049497465593e-06, + "loss": 0.829, + "step": 8903 + }, + { + "epoch": 4.036264732547598, + "grad_norm": 0.16967161252506047, + "learning_rate": 8.717159114869966e-06, + "loss": 0.8318, + "step": 8904 + }, + { + "epoch": 4.036718041704442, + "grad_norm": 0.12758140426923775, + "learning_rate": 8.709271865342783e-06, + "loss": 0.8508, + "step": 8905 + }, + { + "epoch": 4.037171350861287, + "grad_norm": 0.1527136027778194, + "learning_rate": 8.701387749673995e-06, + "loss": 0.8515, + "step": 8906 + }, + { + "epoch": 4.037624660018133, + "grad_norm": 0.12942492460835975, + "learning_rate": 8.693506768653193e-06, + "loss": 0.8696, + "step": 8907 + }, + { + "epoch": 4.038077969174977, + "grad_norm": 0.14439617032723387, + "learning_rate": 8.685628923069713e-06, + "loss": 0.8665, + "step": 8908 + }, + { + "epoch": 4.038531278331822, + "grad_norm": 0.1453252400131079, + "learning_rate": 8.67775421371252e-06, + "loss": 0.8607, + "step": 8909 + }, + { + "epoch": 4.038984587488668, + "grad_norm": 0.12712663399204968, + "learning_rate": 8.669882641370306e-06, + "loss": 0.8486, + "step": 8910 + }, + { + "epoch": 4.039437896645512, + "grad_norm": 0.1685844196287884, + "learning_rate": 8.662014206831406e-06, + "loss": 0.8417, + "step": 8911 + }, + { + "epoch": 4.039891205802357, + "grad_norm": 0.12089877392329519, + "learning_rate": 8.654148910883897e-06, + "loss": 0.8719, + "step": 8912 + }, + { + "epoch": 4.0403445149592025, + "grad_norm": 0.1407580318423928, + "learning_rate": 8.646286754315486e-06, + "loss": 0.8646, + "step": 8913 + }, + { + "epoch": 4.040797824116047, + "grad_norm": 0.14422474361115017, + "learning_rate": 8.638427737913617e-06, + "loss": 0.8485, + "step": 8914 + }, + { + "epoch": 4.041251133272892, + "grad_norm": 0.12870081048315468, + "learning_rate": 8.630571862465365e-06, + "loss": 0.8399, + "step": 8915 + }, + { + "epoch": 4.0417044424297375, + "grad_norm": 0.17069394647921501, + "learning_rate": 8.622719128757548e-06, + "loss": 0.8332, + "step": 8916 + }, + { + "epoch": 4.042157751586582, + "grad_norm": 0.11838045625570642, + "learning_rate": 8.614869537576615e-06, + "loss": 0.8723, + "step": 8917 + }, + { + "epoch": 4.042611060743427, + "grad_norm": 0.13857987741366587, + "learning_rate": 8.607023089708746e-06, + "loss": 0.8526, + "step": 8918 + }, + { + "epoch": 4.043064369900272, + "grad_norm": 0.13166229715499453, + "learning_rate": 8.599179785939778e-06, + "loss": 0.8509, + "step": 8919 + }, + { + "epoch": 4.043517679057117, + "grad_norm": 0.12409665236857076, + "learning_rate": 8.591339627055228e-06, + "loss": 0.8485, + "step": 8920 + }, + { + "epoch": 4.043970988213962, + "grad_norm": 0.12264790907303436, + "learning_rate": 8.583502613840337e-06, + "loss": 0.8556, + "step": 8921 + }, + { + "epoch": 4.044424297370807, + "grad_norm": 0.13266873486416692, + "learning_rate": 8.575668747079979e-06, + "loss": 0.8599, + "step": 8922 + }, + { + "epoch": 4.044877606527652, + "grad_norm": 0.1396171476755367, + "learning_rate": 8.567838027558757e-06, + "loss": 0.8511, + "step": 8923 + }, + { + "epoch": 4.045330915684497, + "grad_norm": 0.13188078125116495, + "learning_rate": 8.560010456060928e-06, + "loss": 0.8351, + "step": 8924 + }, + { + "epoch": 4.045784224841341, + "grad_norm": 0.1353556320115364, + "learning_rate": 8.55218603337046e-06, + "loss": 0.8782, + "step": 8925 + }, + { + "epoch": 4.046237533998187, + "grad_norm": 0.1512107424481331, + "learning_rate": 8.544364760270975e-06, + "loss": 0.8668, + "step": 8926 + }, + { + "epoch": 4.046690843155032, + "grad_norm": 0.12447726531425725, + "learning_rate": 8.536546637545817e-06, + "loss": 0.866, + "step": 8927 + }, + { + "epoch": 4.047144152311876, + "grad_norm": 0.1483334553242719, + "learning_rate": 8.528731665977981e-06, + "loss": 0.8456, + "step": 8928 + }, + { + "epoch": 4.047597461468722, + "grad_norm": 0.12131721280498348, + "learning_rate": 8.52091984635016e-06, + "loss": 0.8609, + "step": 8929 + }, + { + "epoch": 4.048050770625567, + "grad_norm": 0.134748328349641, + "learning_rate": 8.513111179444724e-06, + "loss": 0.8461, + "step": 8930 + }, + { + "epoch": 4.048504079782411, + "grad_norm": 0.14873122398009447, + "learning_rate": 8.50530566604375e-06, + "loss": 0.8525, + "step": 8931 + }, + { + "epoch": 4.0489573889392565, + "grad_norm": 0.11465134319731685, + "learning_rate": 8.497503306928964e-06, + "loss": 0.8615, + "step": 8932 + }, + { + "epoch": 4.049410698096102, + "grad_norm": 0.16791276754561685, + "learning_rate": 8.489704102881816e-06, + "loss": 0.8562, + "step": 8933 + }, + { + "epoch": 4.049864007252946, + "grad_norm": 0.12412029258562343, + "learning_rate": 8.481908054683411e-06, + "loss": 0.8497, + "step": 8934 + }, + { + "epoch": 4.050317316409791, + "grad_norm": 0.14541365711653056, + "learning_rate": 8.474115163114534e-06, + "loss": 0.8312, + "step": 8935 + }, + { + "epoch": 4.050770625566637, + "grad_norm": 0.11532638731606416, + "learning_rate": 8.466325428955686e-06, + "loss": 0.866, + "step": 8936 + }, + { + "epoch": 4.051223934723481, + "grad_norm": 0.1326001194675848, + "learning_rate": 8.45853885298701e-06, + "loss": 0.8573, + "step": 8937 + }, + { + "epoch": 4.051677243880326, + "grad_norm": 0.14865238055358435, + "learning_rate": 8.45075543598838e-06, + "loss": 0.862, + "step": 8938 + }, + { + "epoch": 4.052130553037172, + "grad_norm": 0.1108741257309898, + "learning_rate": 8.4429751787393e-06, + "loss": 0.8512, + "step": 8939 + }, + { + "epoch": 4.052583862194016, + "grad_norm": 0.15561197699010706, + "learning_rate": 8.435198082019016e-06, + "loss": 0.8531, + "step": 8940 + }, + { + "epoch": 4.053037171350861, + "grad_norm": 0.11328742462487901, + "learning_rate": 8.4274241466064e-06, + "loss": 0.8771, + "step": 8941 + }, + { + "epoch": 4.0534904805077066, + "grad_norm": 0.12305460117501513, + "learning_rate": 8.419653373280052e-06, + "loss": 0.8558, + "step": 8942 + }, + { + "epoch": 4.053943789664551, + "grad_norm": 0.14980790093969337, + "learning_rate": 8.411885762818227e-06, + "loss": 0.8608, + "step": 8943 + }, + { + "epoch": 4.054397098821396, + "grad_norm": 0.12952210377674536, + "learning_rate": 8.40412131599888e-06, + "loss": 0.8685, + "step": 8944 + }, + { + "epoch": 4.0548504079782415, + "grad_norm": 0.14034324938322332, + "learning_rate": 8.39636003359965e-06, + "loss": 0.852, + "step": 8945 + }, + { + "epoch": 4.055303717135086, + "grad_norm": 0.13393424398400358, + "learning_rate": 8.388601916397831e-06, + "loss": 0.8478, + "step": 8946 + }, + { + "epoch": 4.055757026291931, + "grad_norm": 0.14639414632853673, + "learning_rate": 8.380846965170444e-06, + "loss": 0.8331, + "step": 8947 + }, + { + "epoch": 4.056210335448776, + "grad_norm": 0.12559595902204782, + "learning_rate": 8.373095180694162e-06, + "loss": 0.8475, + "step": 8948 + }, + { + "epoch": 4.056663644605621, + "grad_norm": 0.12798723900081044, + "learning_rate": 8.365346563745346e-06, + "loss": 0.8359, + "step": 8949 + }, + { + "epoch": 4.057116953762466, + "grad_norm": 0.12989384135936577, + "learning_rate": 8.357601115100036e-06, + "loss": 0.8587, + "step": 8950 + }, + { + "epoch": 4.057570262919311, + "grad_norm": 0.12126930162822108, + "learning_rate": 8.349858835533977e-06, + "loss": 0.8749, + "step": 8951 + }, + { + "epoch": 4.058023572076156, + "grad_norm": 0.12944384676263931, + "learning_rate": 8.342119725822564e-06, + "loss": 0.8679, + "step": 8952 + }, + { + "epoch": 4.058476881233001, + "grad_norm": 0.12473305968350616, + "learning_rate": 8.334383786740915e-06, + "loss": 0.861, + "step": 8953 + }, + { + "epoch": 4.058930190389846, + "grad_norm": 0.13043761346004243, + "learning_rate": 8.326651019063781e-06, + "loss": 0.8384, + "step": 8954 + }, + { + "epoch": 4.059383499546691, + "grad_norm": 0.16778272162239477, + "learning_rate": 8.318921423565642e-06, + "loss": 0.8513, + "step": 8955 + }, + { + "epoch": 4.059836808703536, + "grad_norm": 0.11228051391750292, + "learning_rate": 8.311195001020623e-06, + "loss": 0.8629, + "step": 8956 + }, + { + "epoch": 4.060290117860381, + "grad_norm": 0.12990528572474835, + "learning_rate": 8.303471752202564e-06, + "loss": 0.8546, + "step": 8957 + }, + { + "epoch": 4.060743427017226, + "grad_norm": 0.1297179374421311, + "learning_rate": 8.295751677884954e-06, + "loss": 0.85, + "step": 8958 + }, + { + "epoch": 4.061196736174071, + "grad_norm": 0.1358992292506874, + "learning_rate": 8.288034778841e-06, + "loss": 0.8676, + "step": 8959 + }, + { + "epoch": 4.061650045330916, + "grad_norm": 0.11284338333799473, + "learning_rate": 8.280321055843568e-06, + "loss": 0.8507, + "step": 8960 + }, + { + "epoch": 4.0621033544877605, + "grad_norm": 0.11388282401197465, + "learning_rate": 8.272610509665191e-06, + "loss": 0.847, + "step": 8961 + }, + { + "epoch": 4.062556663644606, + "grad_norm": 0.11243737101663509, + "learning_rate": 8.264903141078124e-06, + "loss": 0.8492, + "step": 8962 + }, + { + "epoch": 4.06300997280145, + "grad_norm": 0.12228355078924327, + "learning_rate": 8.257198950854271e-06, + "loss": 0.8452, + "step": 8963 + }, + { + "epoch": 4.063463281958295, + "grad_norm": 0.10670492258070628, + "learning_rate": 8.249497939765238e-06, + "loss": 0.8499, + "step": 8964 + }, + { + "epoch": 4.063916591115141, + "grad_norm": 0.12066594024853496, + "learning_rate": 8.241800108582292e-06, + "loss": 0.8796, + "step": 8965 + }, + { + "epoch": 4.064369900271985, + "grad_norm": 0.10703009776620899, + "learning_rate": 8.234105458076422e-06, + "loss": 0.8474, + "step": 8966 + }, + { + "epoch": 4.06482320942883, + "grad_norm": 0.11277013263859183, + "learning_rate": 8.226413989018223e-06, + "loss": 0.865, + "step": 8967 + }, + { + "epoch": 4.065276518585676, + "grad_norm": 0.10932866204557966, + "learning_rate": 8.218725702178058e-06, + "loss": 0.8526, + "step": 8968 + }, + { + "epoch": 4.06572982774252, + "grad_norm": 0.11716385456735867, + "learning_rate": 8.21104059832591e-06, + "loss": 0.8773, + "step": 8969 + }, + { + "epoch": 4.066183136899365, + "grad_norm": 0.1098449588896559, + "learning_rate": 8.203358678231477e-06, + "loss": 0.8411, + "step": 8970 + }, + { + "epoch": 4.0666364460562106, + "grad_norm": 0.1468388961903274, + "learning_rate": 8.195679942664112e-06, + "loss": 0.8535, + "step": 8971 + }, + { + "epoch": 4.067089755213055, + "grad_norm": 0.10907941179058932, + "learning_rate": 8.188004392392886e-06, + "loss": 0.8593, + "step": 8972 + }, + { + "epoch": 4.0675430643699, + "grad_norm": 0.12346092113292302, + "learning_rate": 8.180332028186498e-06, + "loss": 0.8531, + "step": 8973 + }, + { + "epoch": 4.0679963735267455, + "grad_norm": 0.12121900196962161, + "learning_rate": 8.172662850813387e-06, + "loss": 0.824, + "step": 8974 + }, + { + "epoch": 4.06844968268359, + "grad_norm": 0.1273420141133991, + "learning_rate": 8.164996861041632e-06, + "loss": 0.8483, + "step": 8975 + }, + { + "epoch": 4.068902991840435, + "grad_norm": 0.14017700085089213, + "learning_rate": 8.157334059638989e-06, + "loss": 0.858, + "step": 8976 + }, + { + "epoch": 4.06935630099728, + "grad_norm": 0.13247510497528156, + "learning_rate": 8.149674447372934e-06, + "loss": 0.8579, + "step": 8977 + }, + { + "epoch": 4.069809610154125, + "grad_norm": 0.13536517824159572, + "learning_rate": 8.142018025010583e-06, + "loss": 0.8422, + "step": 8978 + }, + { + "epoch": 4.07026291931097, + "grad_norm": 0.1063894368992025, + "learning_rate": 8.134364793318762e-06, + "loss": 0.8492, + "step": 8979 + }, + { + "epoch": 4.070716228467815, + "grad_norm": 0.1373833561565556, + "learning_rate": 8.126714753063952e-06, + "loss": 0.8431, + "step": 8980 + }, + { + "epoch": 4.07116953762466, + "grad_norm": 0.12336714916809881, + "learning_rate": 8.119067905012339e-06, + "loss": 0.8671, + "step": 8981 + }, + { + "epoch": 4.071622846781505, + "grad_norm": 0.12527202372829768, + "learning_rate": 8.111424249929762e-06, + "loss": 0.8441, + "step": 8982 + }, + { + "epoch": 4.07207615593835, + "grad_norm": 0.12311558702232768, + "learning_rate": 8.103783788581779e-06, + "loss": 0.8495, + "step": 8983 + }, + { + "epoch": 4.072529465095195, + "grad_norm": 0.11380771843042224, + "learning_rate": 8.096146521733579e-06, + "loss": 0.8463, + "step": 8984 + }, + { + "epoch": 4.07298277425204, + "grad_norm": 0.11912476230242883, + "learning_rate": 8.088512450150076e-06, + "loss": 0.8684, + "step": 8985 + }, + { + "epoch": 4.073436083408885, + "grad_norm": 0.13780013605564864, + "learning_rate": 8.080881574595842e-06, + "loss": 0.8501, + "step": 8986 + }, + { + "epoch": 4.07388939256573, + "grad_norm": 0.11269516914300969, + "learning_rate": 8.073253895835122e-06, + "loss": 0.8555, + "step": 8987 + }, + { + "epoch": 4.074342701722575, + "grad_norm": 0.12288320619092298, + "learning_rate": 8.065629414631847e-06, + "loss": 0.8483, + "step": 8988 + }, + { + "epoch": 4.07479601087942, + "grad_norm": 0.11503098742373352, + "learning_rate": 8.058008131749653e-06, + "loss": 0.8457, + "step": 8989 + }, + { + "epoch": 4.0752493200362645, + "grad_norm": 0.11443739051392744, + "learning_rate": 8.050390047951819e-06, + "loss": 0.8327, + "step": 8990 + }, + { + "epoch": 4.07570262919311, + "grad_norm": 0.11863294631335977, + "learning_rate": 8.042775164001307e-06, + "loss": 0.8749, + "step": 8991 + }, + { + "epoch": 4.076155938349955, + "grad_norm": 0.10906837550351502, + "learning_rate": 8.035163480660792e-06, + "loss": 0.8611, + "step": 8992 + }, + { + "epoch": 4.076609247506799, + "grad_norm": 0.13661900770877455, + "learning_rate": 8.027554998692592e-06, + "loss": 0.8723, + "step": 8993 + }, + { + "epoch": 4.077062556663645, + "grad_norm": 0.11837203864747993, + "learning_rate": 8.019949718858732e-06, + "loss": 0.8366, + "step": 8994 + }, + { + "epoch": 4.07751586582049, + "grad_norm": 0.1290024014487687, + "learning_rate": 8.012347641920883e-06, + "loss": 0.8651, + "step": 8995 + }, + { + "epoch": 4.077969174977334, + "grad_norm": 0.11806784636216891, + "learning_rate": 8.00474876864044e-06, + "loss": 0.8549, + "step": 8996 + }, + { + "epoch": 4.07842248413418, + "grad_norm": 0.12880452235103232, + "learning_rate": 7.99715309977843e-06, + "loss": 0.8534, + "step": 8997 + }, + { + "epoch": 4.078875793291025, + "grad_norm": 0.12260343674985606, + "learning_rate": 7.989560636095603e-06, + "loss": 0.8522, + "step": 8998 + }, + { + "epoch": 4.079329102447869, + "grad_norm": 0.13722634944686107, + "learning_rate": 7.98197137835234e-06, + "loss": 0.8557, + "step": 8999 + }, + { + "epoch": 4.0797824116047146, + "grad_norm": 0.12775628577269776, + "learning_rate": 7.97438532730876e-06, + "loss": 0.8453, + "step": 9000 + }, + { + "epoch": 4.080235720761559, + "grad_norm": 0.13665582571135834, + "learning_rate": 7.966802483724607e-06, + "loss": 0.8327, + "step": 9001 + }, + { + "epoch": 4.080689029918404, + "grad_norm": 0.14048949727883478, + "learning_rate": 7.959222848359318e-06, + "loss": 0.8586, + "step": 9002 + }, + { + "epoch": 4.0811423390752495, + "grad_norm": 0.1271314329417991, + "learning_rate": 7.95164642197204e-06, + "loss": 0.8524, + "step": 9003 + }, + { + "epoch": 4.081595648232094, + "grad_norm": 0.13554354538895486, + "learning_rate": 7.944073205321547e-06, + "loss": 0.8472, + "step": 9004 + }, + { + "epoch": 4.082048957388939, + "grad_norm": 0.11285438770625945, + "learning_rate": 7.936503199166346e-06, + "loss": 0.882, + "step": 9005 + }, + { + "epoch": 4.082502266545784, + "grad_norm": 0.1351688658078771, + "learning_rate": 7.928936404264585e-06, + "loss": 0.8537, + "step": 9006 + }, + { + "epoch": 4.082955575702629, + "grad_norm": 0.1144240133443146, + "learning_rate": 7.921372821374093e-06, + "loss": 0.8761, + "step": 9007 + }, + { + "epoch": 4.083408884859474, + "grad_norm": 0.10973157137024149, + "learning_rate": 7.913812451252383e-06, + "loss": 0.8559, + "step": 9008 + }, + { + "epoch": 4.083862194016319, + "grad_norm": 0.12869290227814775, + "learning_rate": 7.906255294656665e-06, + "loss": 0.859, + "step": 9009 + }, + { + "epoch": 4.084315503173164, + "grad_norm": 0.11858281176770814, + "learning_rate": 7.898701352343794e-06, + "loss": 0.8342, + "step": 9010 + }, + { + "epoch": 4.084768812330009, + "grad_norm": 0.13887567721337266, + "learning_rate": 7.891150625070332e-06, + "loss": 0.8705, + "step": 9011 + }, + { + "epoch": 4.085222121486854, + "grad_norm": 0.12904893406327686, + "learning_rate": 7.88360311359249e-06, + "loss": 0.8284, + "step": 9012 + }, + { + "epoch": 4.085675430643699, + "grad_norm": 0.14690771919170706, + "learning_rate": 7.8760588186662e-06, + "loss": 0.8419, + "step": 9013 + }, + { + "epoch": 4.086128739800544, + "grad_norm": 0.11995078147554958, + "learning_rate": 7.868517741047017e-06, + "loss": 0.8686, + "step": 9014 + }, + { + "epoch": 4.086582048957389, + "grad_norm": 0.14499722662569478, + "learning_rate": 7.860979881490225e-06, + "loss": 0.8451, + "step": 9015 + }, + { + "epoch": 4.087035358114234, + "grad_norm": 0.13783205307314864, + "learning_rate": 7.853445240750751e-06, + "loss": 0.8524, + "step": 9016 + }, + { + "epoch": 4.087488667271079, + "grad_norm": 0.1971416821813805, + "learning_rate": 7.84591381958321e-06, + "loss": 0.8759, + "step": 9017 + }, + { + "epoch": 4.087941976427924, + "grad_norm": 0.12597335983445296, + "learning_rate": 7.838385618741901e-06, + "loss": 0.8621, + "step": 9018 + }, + { + "epoch": 4.0883952855847685, + "grad_norm": 0.13457835163500306, + "learning_rate": 7.830860638980788e-06, + "loss": 0.8419, + "step": 9019 + }, + { + "epoch": 4.088848594741614, + "grad_norm": 0.1373612440507375, + "learning_rate": 7.823338881053533e-06, + "loss": 0.8527, + "step": 9020 + }, + { + "epoch": 4.089301903898459, + "grad_norm": 0.1284656884452861, + "learning_rate": 7.815820345713443e-06, + "loss": 0.8459, + "step": 9021 + }, + { + "epoch": 4.089755213055303, + "grad_norm": 0.1320127556031706, + "learning_rate": 7.808305033713548e-06, + "loss": 0.8613, + "step": 9022 + }, + { + "epoch": 4.090208522212149, + "grad_norm": 0.13659601105470806, + "learning_rate": 7.800792945806499e-06, + "loss": 0.8633, + "step": 9023 + }, + { + "epoch": 4.090661831368994, + "grad_norm": 0.12937386229059605, + "learning_rate": 7.79328408274469e-06, + "loss": 0.8718, + "step": 9024 + }, + { + "epoch": 4.091115140525838, + "grad_norm": 0.1382751583364022, + "learning_rate": 7.785778445280115e-06, + "loss": 0.8499, + "step": 9025 + }, + { + "epoch": 4.091568449682684, + "grad_norm": 0.10914383177855659, + "learning_rate": 7.778276034164514e-06, + "loss": 0.8474, + "step": 9026 + }, + { + "epoch": 4.092021758839529, + "grad_norm": 0.16206575643905757, + "learning_rate": 7.770776850149256e-06, + "loss": 0.8491, + "step": 9027 + }, + { + "epoch": 4.092475067996373, + "grad_norm": 0.13816019527944118, + "learning_rate": 7.763280893985423e-06, + "loss": 0.8578, + "step": 9028 + }, + { + "epoch": 4.0929283771532186, + "grad_norm": 0.15303615177688254, + "learning_rate": 7.755788166423746e-06, + "loss": 0.8869, + "step": 9029 + }, + { + "epoch": 4.093381686310064, + "grad_norm": 0.13903435985061363, + "learning_rate": 7.748298668214654e-06, + "loss": 0.8441, + "step": 9030 + }, + { + "epoch": 4.093834995466908, + "grad_norm": 0.13081249467470416, + "learning_rate": 7.74081240010823e-06, + "loss": 0.8745, + "step": 9031 + }, + { + "epoch": 4.0942883046237535, + "grad_norm": 0.13897458950054764, + "learning_rate": 7.733329362854261e-06, + "loss": 0.8649, + "step": 9032 + }, + { + "epoch": 4.094741613780599, + "grad_norm": 0.15260374310543107, + "learning_rate": 7.725849557202183e-06, + "loss": 0.8583, + "step": 9033 + }, + { + "epoch": 4.095194922937443, + "grad_norm": 0.12710496985341038, + "learning_rate": 7.718372983901114e-06, + "loss": 0.881, + "step": 9034 + }, + { + "epoch": 4.095648232094288, + "grad_norm": 0.15797737317151986, + "learning_rate": 7.710899643699878e-06, + "loss": 0.8396, + "step": 9035 + }, + { + "epoch": 4.096101541251134, + "grad_norm": 0.138491001153014, + "learning_rate": 7.70342953734693e-06, + "loss": 0.8412, + "step": 9036 + }, + { + "epoch": 4.096554850407978, + "grad_norm": 0.14624531170813362, + "learning_rate": 7.695962665590438e-06, + "loss": 0.8326, + "step": 9037 + }, + { + "epoch": 4.097008159564823, + "grad_norm": 0.15740670582685098, + "learning_rate": 7.688499029178213e-06, + "loss": 0.8278, + "step": 9038 + }, + { + "epoch": 4.097461468721669, + "grad_norm": 0.13835488323087203, + "learning_rate": 7.68103862885779e-06, + "loss": 0.8592, + "step": 9039 + }, + { + "epoch": 4.097914777878513, + "grad_norm": 0.11759332490306096, + "learning_rate": 7.673581465376312e-06, + "loss": 0.8371, + "step": 9040 + }, + { + "epoch": 4.098368087035358, + "grad_norm": 0.14014521381621303, + "learning_rate": 7.666127539480674e-06, + "loss": 0.8588, + "step": 9041 + }, + { + "epoch": 4.098821396192203, + "grad_norm": 0.7326475531167587, + "learning_rate": 7.658676851917386e-06, + "loss": 0.8463, + "step": 9042 + }, + { + "epoch": 4.099274705349048, + "grad_norm": 0.17051386560099382, + "learning_rate": 7.651229403432663e-06, + "loss": 0.849, + "step": 9043 + }, + { + "epoch": 4.099728014505893, + "grad_norm": 0.12518931917692847, + "learning_rate": 7.643785194772375e-06, + "loss": 0.8579, + "step": 9044 + }, + { + "epoch": 4.100181323662738, + "grad_norm": 0.13881498760519556, + "learning_rate": 7.636344226682104e-06, + "loss": 0.8522, + "step": 9045 + }, + { + "epoch": 4.100634632819583, + "grad_norm": 0.15479543872159218, + "learning_rate": 7.628906499907063e-06, + "loss": 0.8782, + "step": 9046 + }, + { + "epoch": 4.101087941976428, + "grad_norm": 0.12602846222867695, + "learning_rate": 7.6214720151921795e-06, + "loss": 0.8562, + "step": 9047 + }, + { + "epoch": 4.1015412511332725, + "grad_norm": 0.11570495881362565, + "learning_rate": 7.614040773282036e-06, + "loss": 0.8679, + "step": 9048 + }, + { + "epoch": 4.101994560290118, + "grad_norm": 0.12637143900947062, + "learning_rate": 7.606612774920883e-06, + "loss": 0.8565, + "step": 9049 + }, + { + "epoch": 4.102447869446963, + "grad_norm": 0.12443516788033097, + "learning_rate": 7.599188020852666e-06, + "loss": 0.8572, + "step": 9050 + }, + { + "epoch": 4.102901178603807, + "grad_norm": 0.12804651927303826, + "learning_rate": 7.591766511820986e-06, + "loss": 0.8655, + "step": 9051 + }, + { + "epoch": 4.103354487760653, + "grad_norm": 0.1287770982703611, + "learning_rate": 7.584348248569147e-06, + "loss": 0.8854, + "step": 9052 + }, + { + "epoch": 4.103807796917498, + "grad_norm": 0.12494182439431976, + "learning_rate": 7.576933231840087e-06, + "loss": 0.844, + "step": 9053 + }, + { + "epoch": 4.104261106074342, + "grad_norm": 0.1263292124800653, + "learning_rate": 7.569521462376466e-06, + "loss": 0.8588, + "step": 9054 + }, + { + "epoch": 4.104714415231188, + "grad_norm": 0.12660506865060964, + "learning_rate": 7.56211294092057e-06, + "loss": 0.8557, + "step": 9055 + }, + { + "epoch": 4.105167724388033, + "grad_norm": 0.12615062865924248, + "learning_rate": 7.554707668214405e-06, + "loss": 0.8535, + "step": 9056 + }, + { + "epoch": 4.105621033544877, + "grad_norm": 0.12673547758938541, + "learning_rate": 7.547305644999614e-06, + "loss": 0.8534, + "step": 9057 + }, + { + "epoch": 4.106074342701723, + "grad_norm": 0.1303764280262862, + "learning_rate": 7.5399068720175505e-06, + "loss": 0.8634, + "step": 9058 + }, + { + "epoch": 4.106527651858568, + "grad_norm": 0.12592511269096351, + "learning_rate": 7.532511350009208e-06, + "loss": 0.8555, + "step": 9059 + }, + { + "epoch": 4.106980961015412, + "grad_norm": 0.12920386772078565, + "learning_rate": 7.525119079715271e-06, + "loss": 0.8517, + "step": 9060 + }, + { + "epoch": 4.1074342701722575, + "grad_norm": 0.11931940062570583, + "learning_rate": 7.517730061876105e-06, + "loss": 0.8542, + "step": 9061 + }, + { + "epoch": 4.107887579329103, + "grad_norm": 0.13204866784132874, + "learning_rate": 7.510344297231737e-06, + "loss": 0.865, + "step": 9062 + }, + { + "epoch": 4.108340888485947, + "grad_norm": 0.10737235958347198, + "learning_rate": 7.502961786521874e-06, + "loss": 0.841, + "step": 9063 + }, + { + "epoch": 4.108794197642792, + "grad_norm": 0.13046726266455574, + "learning_rate": 7.495582530485883e-06, + "loss": 0.8325, + "step": 9064 + }, + { + "epoch": 4.109247506799638, + "grad_norm": 0.10901737307426237, + "learning_rate": 7.48820652986284e-06, + "loss": 0.8504, + "step": 9065 + }, + { + "epoch": 4.109700815956482, + "grad_norm": 0.13152771190302343, + "learning_rate": 7.480833785391457e-06, + "loss": 0.833, + "step": 9066 + }, + { + "epoch": 4.110154125113327, + "grad_norm": 0.11708724387648557, + "learning_rate": 7.4734642978101515e-06, + "loss": 0.8489, + "step": 9067 + }, + { + "epoch": 4.110607434270173, + "grad_norm": 0.10152313108839744, + "learning_rate": 7.466098067856977e-06, + "loss": 0.8406, + "step": 9068 + }, + { + "epoch": 4.111060743427017, + "grad_norm": 0.11041290890896108, + "learning_rate": 7.458735096269714e-06, + "loss": 0.8654, + "step": 9069 + }, + { + "epoch": 4.111514052583862, + "grad_norm": 0.1072296113064941, + "learning_rate": 7.451375383785748e-06, + "loss": 0.8426, + "step": 9070 + }, + { + "epoch": 4.1119673617407075, + "grad_norm": 0.11101729677184924, + "learning_rate": 7.444018931142212e-06, + "loss": 0.8576, + "step": 9071 + }, + { + "epoch": 4.112420670897552, + "grad_norm": 0.11827926636591171, + "learning_rate": 7.4366657390758524e-06, + "loss": 0.8577, + "step": 9072 + }, + { + "epoch": 4.112873980054397, + "grad_norm": 0.11313621979129918, + "learning_rate": 7.429315808323125e-06, + "loss": 0.8377, + "step": 9073 + }, + { + "epoch": 4.1133272892112425, + "grad_norm": 0.11292406951488035, + "learning_rate": 7.421969139620149e-06, + "loss": 0.8603, + "step": 9074 + }, + { + "epoch": 4.113780598368087, + "grad_norm": 0.11576759445887198, + "learning_rate": 7.414625733702694e-06, + "loss": 0.8411, + "step": 9075 + }, + { + "epoch": 4.114233907524932, + "grad_norm": 0.135902706899065, + "learning_rate": 7.407285591306248e-06, + "loss": 0.8775, + "step": 9076 + }, + { + "epoch": 4.114687216681777, + "grad_norm": 0.11811175112542013, + "learning_rate": 7.39994871316593e-06, + "loss": 0.8525, + "step": 9077 + }, + { + "epoch": 4.115140525838622, + "grad_norm": 0.12405393316270072, + "learning_rate": 7.392615100016569e-06, + "loss": 0.861, + "step": 9078 + }, + { + "epoch": 4.115593834995467, + "grad_norm": 0.11601142814005486, + "learning_rate": 7.3852847525926274e-06, + "loss": 0.8458, + "step": 9079 + }, + { + "epoch": 4.116047144152311, + "grad_norm": 0.12403802925217497, + "learning_rate": 7.377957671628277e-06, + "loss": 0.8369, + "step": 9080 + }, + { + "epoch": 4.116500453309157, + "grad_norm": 0.11428187080689292, + "learning_rate": 7.370633857857337e-06, + "loss": 0.8463, + "step": 9081 + }, + { + "epoch": 4.116953762466002, + "grad_norm": 0.13654403034197515, + "learning_rate": 7.363313312013325e-06, + "loss": 0.8502, + "step": 9082 + }, + { + "epoch": 4.117407071622846, + "grad_norm": 0.10506747788131507, + "learning_rate": 7.355996034829384e-06, + "loss": 0.8538, + "step": 9083 + }, + { + "epoch": 4.117860380779692, + "grad_norm": 0.1202219852951705, + "learning_rate": 7.3486820270383915e-06, + "loss": 0.8422, + "step": 9084 + }, + { + "epoch": 4.118313689936537, + "grad_norm": 0.1140819701624222, + "learning_rate": 7.341371289372845e-06, + "loss": 0.8486, + "step": 9085 + }, + { + "epoch": 4.118766999093381, + "grad_norm": 0.10065512236322083, + "learning_rate": 7.334063822564954e-06, + "loss": 0.8603, + "step": 9086 + }, + { + "epoch": 4.119220308250227, + "grad_norm": 0.11297997673129163, + "learning_rate": 7.3267596273465645e-06, + "loss": 0.8611, + "step": 9087 + }, + { + "epoch": 4.119673617407072, + "grad_norm": 0.1040671060084105, + "learning_rate": 7.319458704449234e-06, + "loss": 0.8553, + "step": 9088 + }, + { + "epoch": 4.120126926563916, + "grad_norm": 0.11448290811226014, + "learning_rate": 7.3121610546041586e-06, + "loss": 0.8403, + "step": 9089 + }, + { + "epoch": 4.1205802357207615, + "grad_norm": 0.1164970713669212, + "learning_rate": 7.304866678542213e-06, + "loss": 0.8518, + "step": 9090 + }, + { + "epoch": 4.121033544877607, + "grad_norm": 0.1098105048781621, + "learning_rate": 7.2975755769939675e-06, + "loss": 0.8463, + "step": 9091 + }, + { + "epoch": 4.121486854034451, + "grad_norm": 0.12574690917176026, + "learning_rate": 7.290287750689629e-06, + "loss": 0.8276, + "step": 9092 + }, + { + "epoch": 4.121940163191296, + "grad_norm": 0.11461684311177497, + "learning_rate": 7.283003200359116e-06, + "loss": 0.8531, + "step": 9093 + }, + { + "epoch": 4.122393472348142, + "grad_norm": 0.11038071726741833, + "learning_rate": 7.2757219267319735e-06, + "loss": 0.8519, + "step": 9094 + }, + { + "epoch": 4.122846781504986, + "grad_norm": 0.11323196102400976, + "learning_rate": 7.2684439305374674e-06, + "loss": 0.8507, + "step": 9095 + }, + { + "epoch": 4.123300090661831, + "grad_norm": 0.12180430807335434, + "learning_rate": 7.261169212504487e-06, + "loss": 0.8647, + "step": 9096 + }, + { + "epoch": 4.123753399818677, + "grad_norm": 0.11658862461566208, + "learning_rate": 7.253897773361638e-06, + "loss": 0.8489, + "step": 9097 + }, + { + "epoch": 4.124206708975521, + "grad_norm": 0.1255040529193668, + "learning_rate": 7.246629613837153e-06, + "loss": 0.8514, + "step": 9098 + }, + { + "epoch": 4.124660018132366, + "grad_norm": 0.10791350435792045, + "learning_rate": 7.239364734658986e-06, + "loss": 0.8456, + "step": 9099 + }, + { + "epoch": 4.1251133272892115, + "grad_norm": 0.12401283068257704, + "learning_rate": 7.232103136554722e-06, + "loss": 0.8424, + "step": 9100 + }, + { + "epoch": 4.125566636446056, + "grad_norm": 0.1124026202979358, + "learning_rate": 7.224844820251631e-06, + "loss": 0.8271, + "step": 9101 + }, + { + "epoch": 4.126019945602901, + "grad_norm": 0.11178944927715072, + "learning_rate": 7.217589786476647e-06, + "loss": 0.84, + "step": 9102 + }, + { + "epoch": 4.1264732547597465, + "grad_norm": 0.11949087673510204, + "learning_rate": 7.210338035956401e-06, + "loss": 0.8687, + "step": 9103 + }, + { + "epoch": 4.126926563916591, + "grad_norm": 0.11803676886092444, + "learning_rate": 7.203089569417167e-06, + "loss": 0.8684, + "step": 9104 + }, + { + "epoch": 4.127379873073436, + "grad_norm": 0.11756518089664175, + "learning_rate": 7.19584438758489e-06, + "loss": 0.8515, + "step": 9105 + }, + { + "epoch": 4.127833182230281, + "grad_norm": 0.12579089163913584, + "learning_rate": 7.188602491185217e-06, + "loss": 0.882, + "step": 9106 + }, + { + "epoch": 4.128286491387126, + "grad_norm": 0.13782560847658534, + "learning_rate": 7.1813638809434285e-06, + "loss": 0.8679, + "step": 9107 + }, + { + "epoch": 4.128739800543971, + "grad_norm": 0.1437528517837005, + "learning_rate": 7.1741285575845076e-06, + "loss": 0.8394, + "step": 9108 + }, + { + "epoch": 4.129193109700816, + "grad_norm": 0.13481852222844323, + "learning_rate": 7.166896521833076e-06, + "loss": 0.8445, + "step": 9109 + }, + { + "epoch": 4.129646418857661, + "grad_norm": 0.115498567459327, + "learning_rate": 7.1596677744134635e-06, + "loss": 0.8518, + "step": 9110 + }, + { + "epoch": 4.130099728014506, + "grad_norm": 0.12917775331761872, + "learning_rate": 7.152442316049626e-06, + "loss": 0.8812, + "step": 9111 + }, + { + "epoch": 4.130553037171351, + "grad_norm": 0.11640566515282876, + "learning_rate": 7.1452201474652415e-06, + "loss": 0.8356, + "step": 9112 + }, + { + "epoch": 4.131006346328196, + "grad_norm": 0.11243599982182712, + "learning_rate": 7.138001269383603e-06, + "loss": 0.8494, + "step": 9113 + }, + { + "epoch": 4.131459655485041, + "grad_norm": 0.13872077841182642, + "learning_rate": 7.130785682527732e-06, + "loss": 0.8465, + "step": 9114 + }, + { + "epoch": 4.131912964641886, + "grad_norm": 0.1064427186892733, + "learning_rate": 7.123573387620273e-06, + "loss": 0.8536, + "step": 9115 + }, + { + "epoch": 4.132366273798731, + "grad_norm": 0.1359527199112936, + "learning_rate": 7.116364385383549e-06, + "loss": 0.8411, + "step": 9116 + }, + { + "epoch": 4.132819582955576, + "grad_norm": 0.10969678779514005, + "learning_rate": 7.109158676539589e-06, + "loss": 0.8491, + "step": 9117 + }, + { + "epoch": 4.133272892112421, + "grad_norm": 0.13131595641362045, + "learning_rate": 7.101956261810041e-06, + "loss": 0.8489, + "step": 9118 + }, + { + "epoch": 4.1337262012692655, + "grad_norm": 0.11373722424857802, + "learning_rate": 7.0947571419162666e-06, + "loss": 0.8532, + "step": 9119 + }, + { + "epoch": 4.134179510426111, + "grad_norm": 0.11285153578659043, + "learning_rate": 7.087561317579275e-06, + "loss": 0.8604, + "step": 9120 + }, + { + "epoch": 4.134632819582956, + "grad_norm": 0.12887576088691813, + "learning_rate": 7.080368789519743e-06, + "loss": 0.8438, + "step": 9121 + }, + { + "epoch": 4.1350861287398, + "grad_norm": 0.12360425396285449, + "learning_rate": 7.0731795584580145e-06, + "loss": 0.842, + "step": 9122 + }, + { + "epoch": 4.135539437896646, + "grad_norm": 0.1294022293749803, + "learning_rate": 7.065993625114136e-06, + "loss": 0.8665, + "step": 9123 + }, + { + "epoch": 4.13599274705349, + "grad_norm": 0.1181721823722193, + "learning_rate": 7.058810990207776e-06, + "loss": 0.8493, + "step": 9124 + }, + { + "epoch": 4.136446056210335, + "grad_norm": 0.1266110118235886, + "learning_rate": 7.05163165445832e-06, + "loss": 0.838, + "step": 9125 + }, + { + "epoch": 4.136899365367181, + "grad_norm": 0.12076625899936964, + "learning_rate": 7.0444556185847736e-06, + "loss": 0.8463, + "step": 9126 + }, + { + "epoch": 4.137352674524025, + "grad_norm": 0.11730972356090862, + "learning_rate": 7.037282883305865e-06, + "loss": 0.8411, + "step": 9127 + }, + { + "epoch": 4.13780598368087, + "grad_norm": 0.10184597876613248, + "learning_rate": 7.0301134493399395e-06, + "loss": 0.8467, + "step": 9128 + }, + { + "epoch": 4.1382592928377155, + "grad_norm": 0.12756540432106217, + "learning_rate": 7.022947317405058e-06, + "loss": 0.8713, + "step": 9129 + }, + { + "epoch": 4.13871260199456, + "grad_norm": 0.11848571591741076, + "learning_rate": 7.015784488218922e-06, + "loss": 0.8689, + "step": 9130 + }, + { + "epoch": 4.139165911151405, + "grad_norm": 0.14255056472124317, + "learning_rate": 7.0086249624988955e-06, + "loss": 0.8594, + "step": 9131 + }, + { + "epoch": 4.1396192203082505, + "grad_norm": 0.10470342273864525, + "learning_rate": 7.001468740962049e-06, + "loss": 0.846, + "step": 9132 + }, + { + "epoch": 4.140072529465095, + "grad_norm": 0.1384317349040384, + "learning_rate": 6.994315824325082e-06, + "loss": 0.8399, + "step": 9133 + }, + { + "epoch": 4.14052583862194, + "grad_norm": 0.12224664714311477, + "learning_rate": 6.987166213304397e-06, + "loss": 0.8495, + "step": 9134 + }, + { + "epoch": 4.140979147778785, + "grad_norm": 0.13085701686657034, + "learning_rate": 6.980019908616027e-06, + "loss": 0.8512, + "step": 9135 + }, + { + "epoch": 4.14143245693563, + "grad_norm": 0.1253205297768279, + "learning_rate": 6.9728769109757185e-06, + "loss": 0.8672, + "step": 9136 + }, + { + "epoch": 4.141885766092475, + "grad_norm": 0.12024388930127486, + "learning_rate": 6.965737221098838e-06, + "loss": 0.872, + "step": 9137 + }, + { + "epoch": 4.14233907524932, + "grad_norm": 0.1396735681121534, + "learning_rate": 6.958600839700476e-06, + "loss": 0.8345, + "step": 9138 + }, + { + "epoch": 4.142792384406165, + "grad_norm": 0.1393091112246275, + "learning_rate": 6.951467767495343e-06, + "loss": 0.8647, + "step": 9139 + }, + { + "epoch": 4.14324569356301, + "grad_norm": 0.10944710419640542, + "learning_rate": 6.9443380051978435e-06, + "loss": 0.8633, + "step": 9140 + }, + { + "epoch": 4.143699002719855, + "grad_norm": 0.12419968337604462, + "learning_rate": 6.937211553522032e-06, + "loss": 0.8507, + "step": 9141 + }, + { + "epoch": 4.1441523118767, + "grad_norm": 0.1352893860617036, + "learning_rate": 6.930088413181662e-06, + "loss": 0.8543, + "step": 9142 + }, + { + "epoch": 4.144605621033545, + "grad_norm": 0.12330509858584003, + "learning_rate": 6.922968584890117e-06, + "loss": 0.8509, + "step": 9143 + }, + { + "epoch": 4.14505893019039, + "grad_norm": 0.13365765993148881, + "learning_rate": 6.915852069360491e-06, + "loss": 0.8398, + "step": 9144 + }, + { + "epoch": 4.145512239347235, + "grad_norm": 0.11872607856242338, + "learning_rate": 6.9087388673055156e-06, + "loss": 0.8439, + "step": 9145 + }, + { + "epoch": 4.14596554850408, + "grad_norm": 0.14492369392983961, + "learning_rate": 6.90162897943758e-06, + "loss": 0.858, + "step": 9146 + }, + { + "epoch": 4.146418857660925, + "grad_norm": 0.10580569384839393, + "learning_rate": 6.894522406468791e-06, + "loss": 0.8414, + "step": 9147 + }, + { + "epoch": 4.1468721668177695, + "grad_norm": 0.1280314486045261, + "learning_rate": 6.8874191491108675e-06, + "loss": 0.8468, + "step": 9148 + }, + { + "epoch": 4.147325475974615, + "grad_norm": 0.14945879116438382, + "learning_rate": 6.880319208075241e-06, + "loss": 0.8481, + "step": 9149 + }, + { + "epoch": 4.14777878513146, + "grad_norm": 0.13207572095463985, + "learning_rate": 6.873222584072974e-06, + "loss": 0.8731, + "step": 9150 + }, + { + "epoch": 4.148232094288304, + "grad_norm": 0.15568350853054727, + "learning_rate": 6.86612927781483e-06, + "loss": 0.8538, + "step": 9151 + }, + { + "epoch": 4.14868540344515, + "grad_norm": 0.12281181322086547, + "learning_rate": 6.859039290011211e-06, + "loss": 0.8467, + "step": 9152 + }, + { + "epoch": 4.149138712601995, + "grad_norm": 0.16653108258515986, + "learning_rate": 6.851952621372216e-06, + "loss": 0.859, + "step": 9153 + }, + { + "epoch": 4.149592021758839, + "grad_norm": 0.13202859728769115, + "learning_rate": 6.844869272607577e-06, + "loss": 0.8547, + "step": 9154 + }, + { + "epoch": 4.150045330915685, + "grad_norm": 0.15122238003460273, + "learning_rate": 6.837789244426729e-06, + "loss": 0.8776, + "step": 9155 + }, + { + "epoch": 4.15049864007253, + "grad_norm": 0.11974182706114833, + "learning_rate": 6.8307125375387525e-06, + "loss": 0.8713, + "step": 9156 + }, + { + "epoch": 4.150951949229374, + "grad_norm": 0.12513830622457678, + "learning_rate": 6.823639152652384e-06, + "loss": 0.8744, + "step": 9157 + }, + { + "epoch": 4.1514052583862195, + "grad_norm": 0.13485538262721955, + "learning_rate": 6.816569090476073e-06, + "loss": 0.8578, + "step": 9158 + }, + { + "epoch": 4.151858567543064, + "grad_norm": 0.1097133329446329, + "learning_rate": 6.80950235171789e-06, + "loss": 0.8575, + "step": 9159 + }, + { + "epoch": 4.152311876699909, + "grad_norm": 0.13173016964511677, + "learning_rate": 6.802438937085591e-06, + "loss": 0.8602, + "step": 9160 + }, + { + "epoch": 4.1527651858567545, + "grad_norm": 0.12101927551837638, + "learning_rate": 6.795378847286587e-06, + "loss": 0.8619, + "step": 9161 + }, + { + "epoch": 4.153218495013599, + "grad_norm": 0.12419743928262195, + "learning_rate": 6.78832208302799e-06, + "loss": 0.8542, + "step": 9162 + }, + { + "epoch": 4.153671804170444, + "grad_norm": 0.12456302181299876, + "learning_rate": 6.781268645016532e-06, + "loss": 0.8427, + "step": 9163 + }, + { + "epoch": 4.154125113327289, + "grad_norm": 0.12747309399161616, + "learning_rate": 6.774218533958659e-06, + "loss": 0.8593, + "step": 9164 + }, + { + "epoch": 4.154578422484134, + "grad_norm": 0.12603498501795313, + "learning_rate": 6.76717175056044e-06, + "loss": 0.8802, + "step": 9165 + }, + { + "epoch": 4.155031731640979, + "grad_norm": 0.1294832261636481, + "learning_rate": 6.760128295527648e-06, + "loss": 0.8489, + "step": 9166 + }, + { + "epoch": 4.155485040797824, + "grad_norm": 0.11386387760951619, + "learning_rate": 6.7530881695656845e-06, + "loss": 0.8466, + "step": 9167 + }, + { + "epoch": 4.155938349954669, + "grad_norm": 0.13535199133327933, + "learning_rate": 6.746051373379665e-06, + "loss": 0.84, + "step": 9168 + }, + { + "epoch": 4.156391659111514, + "grad_norm": 0.11919087734945294, + "learning_rate": 6.739017907674324e-06, + "loss": 0.8486, + "step": 9169 + }, + { + "epoch": 4.156844968268359, + "grad_norm": 0.12438336419882083, + "learning_rate": 6.731987773154096e-06, + "loss": 0.8735, + "step": 9170 + }, + { + "epoch": 4.157298277425204, + "grad_norm": 0.11136424192970786, + "learning_rate": 6.724960970523069e-06, + "loss": 0.8573, + "step": 9171 + }, + { + "epoch": 4.157751586582049, + "grad_norm": 0.12374924161559134, + "learning_rate": 6.717937500484985e-06, + "loss": 0.8583, + "step": 9172 + }, + { + "epoch": 4.158204895738894, + "grad_norm": 0.10922609024740722, + "learning_rate": 6.710917363743283e-06, + "loss": 0.8619, + "step": 9173 + }, + { + "epoch": 4.158658204895739, + "grad_norm": 0.11217443120303842, + "learning_rate": 6.703900561001031e-06, + "loss": 0.8655, + "step": 9174 + }, + { + "epoch": 4.159111514052584, + "grad_norm": 0.11427556765563991, + "learning_rate": 6.696887092961004e-06, + "loss": 0.8565, + "step": 9175 + }, + { + "epoch": 4.159564823209429, + "grad_norm": 0.1257088707725748, + "learning_rate": 6.689876960325601e-06, + "loss": 0.8663, + "step": 9176 + }, + { + "epoch": 4.1600181323662735, + "grad_norm": 0.12882749919766875, + "learning_rate": 6.682870163796926e-06, + "loss": 0.8627, + "step": 9177 + }, + { + "epoch": 4.160471441523119, + "grad_norm": 0.12217909386870196, + "learning_rate": 6.675866704076725e-06, + "loss": 0.8499, + "step": 9178 + }, + { + "epoch": 4.160924750679964, + "grad_norm": 0.12940448662964354, + "learning_rate": 6.668866581866407e-06, + "loss": 0.8495, + "step": 9179 + }, + { + "epoch": 4.161378059836808, + "grad_norm": 0.12489022251233196, + "learning_rate": 6.6618697978670536e-06, + "loss": 0.8565, + "step": 9180 + }, + { + "epoch": 4.161831368993654, + "grad_norm": 0.11008951446119612, + "learning_rate": 6.654876352779425e-06, + "loss": 0.8652, + "step": 9181 + }, + { + "epoch": 4.162284678150499, + "grad_norm": 0.10893834955125034, + "learning_rate": 6.647886247303921e-06, + "loss": 0.8613, + "step": 9182 + }, + { + "epoch": 4.162737987307343, + "grad_norm": 0.108040906794142, + "learning_rate": 6.6408994821406395e-06, + "loss": 0.8646, + "step": 9183 + }, + { + "epoch": 4.163191296464189, + "grad_norm": 0.22654620948318063, + "learning_rate": 6.633916057989305e-06, + "loss": 0.8474, + "step": 9184 + }, + { + "epoch": 4.163644605621034, + "grad_norm": 0.10178815784755457, + "learning_rate": 6.626935975549345e-06, + "loss": 0.8534, + "step": 9185 + }, + { + "epoch": 4.164097914777878, + "grad_norm": 0.11408899882482317, + "learning_rate": 6.619959235519831e-06, + "loss": 0.8492, + "step": 9186 + }, + { + "epoch": 4.1645512239347235, + "grad_norm": 0.11902245355390856, + "learning_rate": 6.612985838599492e-06, + "loss": 0.8549, + "step": 9187 + }, + { + "epoch": 4.165004533091569, + "grad_norm": 0.12005825187906455, + "learning_rate": 6.606015785486754e-06, + "loss": 0.8592, + "step": 9188 + }, + { + "epoch": 4.165457842248413, + "grad_norm": 0.11719157312492222, + "learning_rate": 6.599049076879671e-06, + "loss": 0.8588, + "step": 9189 + }, + { + "epoch": 4.1659111514052585, + "grad_norm": 0.1039189148169718, + "learning_rate": 6.592085713475995e-06, + "loss": 0.8641, + "step": 9190 + }, + { + "epoch": 4.166364460562104, + "grad_norm": 0.13544964764049489, + "learning_rate": 6.585125695973107e-06, + "loss": 0.852, + "step": 9191 + }, + { + "epoch": 4.166817769718948, + "grad_norm": 0.12806131534667387, + "learning_rate": 6.578169025068098e-06, + "loss": 0.8439, + "step": 9192 + }, + { + "epoch": 4.167271078875793, + "grad_norm": 0.11235061373506369, + "learning_rate": 6.571215701457671e-06, + "loss": 0.8584, + "step": 9193 + }, + { + "epoch": 4.167724388032639, + "grad_norm": 0.13729015587533513, + "learning_rate": 6.56426572583825e-06, + "loss": 0.8725, + "step": 9194 + }, + { + "epoch": 4.168177697189483, + "grad_norm": 0.1155443795108974, + "learning_rate": 6.5573190989058724e-06, + "loss": 0.8627, + "step": 9195 + }, + { + "epoch": 4.168631006346328, + "grad_norm": 0.1457927531216816, + "learning_rate": 6.550375821356283e-06, + "loss": 0.8671, + "step": 9196 + }, + { + "epoch": 4.169084315503174, + "grad_norm": 0.11566688216447041, + "learning_rate": 6.543435893884864e-06, + "loss": 0.8554, + "step": 9197 + }, + { + "epoch": 4.169537624660018, + "grad_norm": 0.10883366040889204, + "learning_rate": 6.536499317186664e-06, + "loss": 0.8384, + "step": 9198 + }, + { + "epoch": 4.169990933816863, + "grad_norm": 0.11620335148897061, + "learning_rate": 6.5295660919563985e-06, + "loss": 0.8447, + "step": 9199 + }, + { + "epoch": 4.1704442429737085, + "grad_norm": 0.11017150585860916, + "learning_rate": 6.522636218888463e-06, + "loss": 0.8634, + "step": 9200 + }, + { + "epoch": 4.170897552130553, + "grad_norm": 0.12036709280578578, + "learning_rate": 6.515709698676889e-06, + "loss": 0.8561, + "step": 9201 + }, + { + "epoch": 4.171350861287398, + "grad_norm": 0.10154010739155289, + "learning_rate": 6.50878653201541e-06, + "loss": 0.8604, + "step": 9202 + }, + { + "epoch": 4.171804170444243, + "grad_norm": 0.11150739956443212, + "learning_rate": 6.501866719597383e-06, + "loss": 0.8454, + "step": 9203 + }, + { + "epoch": 4.172257479601088, + "grad_norm": 0.10827825105281429, + "learning_rate": 6.4949502621158485e-06, + "loss": 0.87, + "step": 9204 + }, + { + "epoch": 4.172710788757933, + "grad_norm": 0.10616230027699501, + "learning_rate": 6.488037160263521e-06, + "loss": 0.8602, + "step": 9205 + }, + { + "epoch": 4.1731640979147775, + "grad_norm": 0.10407896840097525, + "learning_rate": 6.481127414732755e-06, + "loss": 0.8624, + "step": 9206 + }, + { + "epoch": 4.173617407071623, + "grad_norm": 0.10609493246593091, + "learning_rate": 6.474221026215599e-06, + "loss": 0.8303, + "step": 9207 + }, + { + "epoch": 4.174070716228468, + "grad_norm": 0.09904090451361926, + "learning_rate": 6.467317995403726e-06, + "loss": 0.8525, + "step": 9208 + }, + { + "epoch": 4.174524025385312, + "grad_norm": 0.10961427059233535, + "learning_rate": 6.460418322988516e-06, + "loss": 0.8668, + "step": 9209 + }, + { + "epoch": 4.174977334542158, + "grad_norm": 0.10258405662654571, + "learning_rate": 6.453522009660971e-06, + "loss": 0.8349, + "step": 9210 + }, + { + "epoch": 4.175430643699003, + "grad_norm": 0.12276155844241743, + "learning_rate": 6.446629056111797e-06, + "loss": 0.8575, + "step": 9211 + }, + { + "epoch": 4.175883952855847, + "grad_norm": 0.12000748518575019, + "learning_rate": 6.43973946303134e-06, + "loss": 0.8424, + "step": 9212 + }, + { + "epoch": 4.176337262012693, + "grad_norm": 0.13780716470148374, + "learning_rate": 6.432853231109595e-06, + "loss": 0.8595, + "step": 9213 + }, + { + "epoch": 4.176790571169538, + "grad_norm": 0.11927760703677474, + "learning_rate": 6.425970361036258e-06, + "loss": 0.8395, + "step": 9214 + }, + { + "epoch": 4.177243880326382, + "grad_norm": 0.12872653464591013, + "learning_rate": 6.419090853500654e-06, + "loss": 0.8692, + "step": 9215 + }, + { + "epoch": 4.1776971894832275, + "grad_norm": 0.11949111970823156, + "learning_rate": 6.412214709191804e-06, + "loss": 0.8725, + "step": 9216 + }, + { + "epoch": 4.178150498640073, + "grad_norm": 0.12280602735812184, + "learning_rate": 6.405341928798363e-06, + "loss": 0.8594, + "step": 9217 + }, + { + "epoch": 4.178603807796917, + "grad_norm": 0.12160200769371479, + "learning_rate": 6.3984725130086604e-06, + "loss": 0.8548, + "step": 9218 + }, + { + "epoch": 4.1790571169537625, + "grad_norm": 0.1396094277762603, + "learning_rate": 6.391606462510678e-06, + "loss": 0.8667, + "step": 9219 + }, + { + "epoch": 4.179510426110608, + "grad_norm": 0.12127818142119809, + "learning_rate": 6.384743777992093e-06, + "loss": 0.8569, + "step": 9220 + }, + { + "epoch": 4.179963735267452, + "grad_norm": 0.1200469001409995, + "learning_rate": 6.3778844601402e-06, + "loss": 0.8452, + "step": 9221 + }, + { + "epoch": 4.180417044424297, + "grad_norm": 0.15950672761348575, + "learning_rate": 6.371028509642006e-06, + "loss": 0.867, + "step": 9222 + }, + { + "epoch": 4.180870353581143, + "grad_norm": 0.10537705209927452, + "learning_rate": 6.3641759271841285e-06, + "loss": 0.8621, + "step": 9223 + }, + { + "epoch": 4.181323662737987, + "grad_norm": 0.15534321319570465, + "learning_rate": 6.357326713452896e-06, + "loss": 0.8457, + "step": 9224 + }, + { + "epoch": 4.181776971894832, + "grad_norm": 0.11136282390247067, + "learning_rate": 6.350480869134257e-06, + "loss": 0.8555, + "step": 9225 + }, + { + "epoch": 4.182230281051678, + "grad_norm": 0.12247079409346857, + "learning_rate": 6.343638394913858e-06, + "loss": 0.8498, + "step": 9226 + }, + { + "epoch": 4.182683590208522, + "grad_norm": 0.14513548242413435, + "learning_rate": 6.336799291476983e-06, + "loss": 0.8622, + "step": 9227 + }, + { + "epoch": 4.183136899365367, + "grad_norm": 0.13991275560222655, + "learning_rate": 6.329963559508603e-06, + "loss": 0.8649, + "step": 9228 + }, + { + "epoch": 4.1835902085222125, + "grad_norm": 0.12112986690535807, + "learning_rate": 6.323131199693327e-06, + "loss": 0.8477, + "step": 9229 + }, + { + "epoch": 4.184043517679057, + "grad_norm": 0.11795196287438502, + "learning_rate": 6.3163022127154286e-06, + "loss": 0.8503, + "step": 9230 + }, + { + "epoch": 4.184496826835902, + "grad_norm": 0.15020847609014834, + "learning_rate": 6.309476599258863e-06, + "loss": 0.8525, + "step": 9231 + }, + { + "epoch": 4.1849501359927475, + "grad_norm": 0.12347401004001525, + "learning_rate": 6.302654360007223e-06, + "loss": 0.869, + "step": 9232 + }, + { + "epoch": 4.185403445149592, + "grad_norm": 0.14159036794639798, + "learning_rate": 6.295835495643792e-06, + "loss": 0.8816, + "step": 9233 + }, + { + "epoch": 4.185856754306437, + "grad_norm": 0.10464352834341265, + "learning_rate": 6.289020006851481e-06, + "loss": 0.8565, + "step": 9234 + }, + { + "epoch": 4.186310063463282, + "grad_norm": 0.1378766532499929, + "learning_rate": 6.282207894312912e-06, + "loss": 0.8618, + "step": 9235 + }, + { + "epoch": 4.186763372620127, + "grad_norm": 0.11027075725351405, + "learning_rate": 6.275399158710298e-06, + "loss": 0.8563, + "step": 9236 + }, + { + "epoch": 4.187216681776972, + "grad_norm": 0.10966754240955932, + "learning_rate": 6.2685938007255845e-06, + "loss": 0.8436, + "step": 9237 + }, + { + "epoch": 4.187669990933816, + "grad_norm": 0.12581463362235767, + "learning_rate": 6.261791821040324e-06, + "loss": 0.8654, + "step": 9238 + }, + { + "epoch": 4.188123300090662, + "grad_norm": 0.09816889357869087, + "learning_rate": 6.254993220335776e-06, + "loss": 0.863, + "step": 9239 + }, + { + "epoch": 4.188576609247507, + "grad_norm": 0.13426479174832998, + "learning_rate": 6.248197999292824e-06, + "loss": 0.8421, + "step": 9240 + }, + { + "epoch": 4.189029918404351, + "grad_norm": 0.11242692041117115, + "learning_rate": 6.241406158592047e-06, + "loss": 0.8431, + "step": 9241 + }, + { + "epoch": 4.189483227561197, + "grad_norm": 0.11121630728805025, + "learning_rate": 6.234617698913648e-06, + "loss": 0.8616, + "step": 9242 + }, + { + "epoch": 4.189936536718042, + "grad_norm": 0.11953251047677875, + "learning_rate": 6.227832620937531e-06, + "loss": 0.8718, + "step": 9243 + }, + { + "epoch": 4.190389845874886, + "grad_norm": 0.12821574643950714, + "learning_rate": 6.221050925343228e-06, + "loss": 0.8532, + "step": 9244 + }, + { + "epoch": 4.1908431550317315, + "grad_norm": 0.1216296580140616, + "learning_rate": 6.214272612809944e-06, + "loss": 0.8479, + "step": 9245 + }, + { + "epoch": 4.191296464188577, + "grad_norm": 0.12263616838477921, + "learning_rate": 6.207497684016561e-06, + "loss": 0.866, + "step": 9246 + }, + { + "epoch": 4.191749773345421, + "grad_norm": 0.11687717903551002, + "learning_rate": 6.200726139641587e-06, + "loss": 0.8704, + "step": 9247 + }, + { + "epoch": 4.1922030825022665, + "grad_norm": 0.12776236620454687, + "learning_rate": 6.1939579803632365e-06, + "loss": 0.8668, + "step": 9248 + }, + { + "epoch": 4.192656391659112, + "grad_norm": 0.11084323299879052, + "learning_rate": 6.187193206859339e-06, + "loss": 0.8521, + "step": 9249 + }, + { + "epoch": 4.193109700815956, + "grad_norm": 0.10485771808189624, + "learning_rate": 6.180431819807426e-06, + "loss": 0.8565, + "step": 9250 + }, + { + "epoch": 4.193563009972801, + "grad_norm": 0.11174528607948468, + "learning_rate": 6.173673819884647e-06, + "loss": 0.847, + "step": 9251 + }, + { + "epoch": 4.194016319129647, + "grad_norm": 0.12315020296741666, + "learning_rate": 6.166919207767863e-06, + "loss": 0.8684, + "step": 9252 + }, + { + "epoch": 4.194469628286491, + "grad_norm": 0.1083687961048849, + "learning_rate": 6.160167984133538e-06, + "loss": 0.8572, + "step": 9253 + }, + { + "epoch": 4.194922937443336, + "grad_norm": 0.11068850095924941, + "learning_rate": 6.15342014965787e-06, + "loss": 0.8704, + "step": 9254 + }, + { + "epoch": 4.195376246600182, + "grad_norm": 0.2940550927354745, + "learning_rate": 6.146675705016622e-06, + "loss": 0.8427, + "step": 9255 + }, + { + "epoch": 4.195829555757026, + "grad_norm": 0.10503662350849649, + "learning_rate": 6.1399346508853065e-06, + "loss": 0.8728, + "step": 9256 + }, + { + "epoch": 4.196282864913871, + "grad_norm": 0.10465468811838002, + "learning_rate": 6.133196987939039e-06, + "loss": 0.8237, + "step": 9257 + }, + { + "epoch": 4.1967361740707165, + "grad_norm": 0.09820645428101261, + "learning_rate": 6.126462716852635e-06, + "loss": 0.8532, + "step": 9258 + }, + { + "epoch": 4.197189483227561, + "grad_norm": 0.11623559522807124, + "learning_rate": 6.1197318383005465e-06, + "loss": 0.8388, + "step": 9259 + }, + { + "epoch": 4.197642792384406, + "grad_norm": 0.10861694518726926, + "learning_rate": 6.1130043529568705e-06, + "loss": 0.8532, + "step": 9260 + }, + { + "epoch": 4.1980961015412515, + "grad_norm": 0.11092750179991301, + "learning_rate": 6.106280261495414e-06, + "loss": 0.8338, + "step": 9261 + }, + { + "epoch": 4.198549410698096, + "grad_norm": 0.12564921027752182, + "learning_rate": 6.09955956458959e-06, + "loss": 0.8583, + "step": 9262 + }, + { + "epoch": 4.199002719854941, + "grad_norm": 0.10228026499158978, + "learning_rate": 6.092842262912518e-06, + "loss": 0.8537, + "step": 9263 + }, + { + "epoch": 4.199456029011786, + "grad_norm": 0.10665772670833125, + "learning_rate": 6.086128357136938e-06, + "loss": 0.8445, + "step": 9264 + }, + { + "epoch": 4.199909338168631, + "grad_norm": 0.11414856751345905, + "learning_rate": 6.079417847935278e-06, + "loss": 0.8462, + "step": 9265 + }, + { + "epoch": 4.200362647325476, + "grad_norm": 0.10780656560765514, + "learning_rate": 6.0727107359796054e-06, + "loss": 0.8444, + "step": 9266 + }, + { + "epoch": 4.200815956482321, + "grad_norm": 0.10497611053199622, + "learning_rate": 6.066007021941671e-06, + "loss": 0.8426, + "step": 9267 + }, + { + "epoch": 4.201269265639166, + "grad_norm": 0.1148018409807479, + "learning_rate": 6.059306706492854e-06, + "loss": 0.8638, + "step": 9268 + }, + { + "epoch": 4.201722574796011, + "grad_norm": 0.11469442422733421, + "learning_rate": 6.052609790304229e-06, + "loss": 0.8658, + "step": 9269 + }, + { + "epoch": 4.202175883952856, + "grad_norm": 0.116687500916083, + "learning_rate": 6.045916274046506e-06, + "loss": 0.8659, + "step": 9270 + }, + { + "epoch": 4.202629193109701, + "grad_norm": 0.12375491613635249, + "learning_rate": 6.039226158390046e-06, + "loss": 0.843, + "step": 9271 + }, + { + "epoch": 4.203082502266546, + "grad_norm": 0.11451082027538881, + "learning_rate": 6.032539444004903e-06, + "loss": 0.8667, + "step": 9272 + }, + { + "epoch": 4.203535811423391, + "grad_norm": 0.09406511336076767, + "learning_rate": 6.0258561315607565e-06, + "loss": 0.8572, + "step": 9273 + }, + { + "epoch": 4.2039891205802356, + "grad_norm": 0.10680472371845692, + "learning_rate": 6.019176221726981e-06, + "loss": 0.8527, + "step": 9274 + }, + { + "epoch": 4.204442429737081, + "grad_norm": 0.11961799225000169, + "learning_rate": 6.012499715172561e-06, + "loss": 0.8552, + "step": 9275 + }, + { + "epoch": 4.204895738893926, + "grad_norm": 0.11169344370584648, + "learning_rate": 6.00582661256619e-06, + "loss": 0.8553, + "step": 9276 + }, + { + "epoch": 4.2053490480507705, + "grad_norm": 0.1098964622496317, + "learning_rate": 5.999156914576181e-06, + "loss": 0.8522, + "step": 9277 + }, + { + "epoch": 4.205802357207616, + "grad_norm": 0.12310117963609052, + "learning_rate": 5.992490621870541e-06, + "loss": 0.8317, + "step": 9278 + }, + { + "epoch": 4.20625566636446, + "grad_norm": 0.12694137132306302, + "learning_rate": 5.985827735116902e-06, + "loss": 0.8127, + "step": 9279 + }, + { + "epoch": 4.206708975521305, + "grad_norm": 0.12216874922821466, + "learning_rate": 5.979168254982597e-06, + "loss": 0.8608, + "step": 9280 + }, + { + "epoch": 4.207162284678151, + "grad_norm": 0.1256121070914895, + "learning_rate": 5.9725121821345625e-06, + "loss": 0.8452, + "step": 9281 + }, + { + "epoch": 4.207615593834995, + "grad_norm": 0.11023339291645128, + "learning_rate": 5.965859517239447e-06, + "loss": 0.8348, + "step": 9282 + }, + { + "epoch": 4.20806890299184, + "grad_norm": 0.12728774806571236, + "learning_rate": 5.959210260963515e-06, + "loss": 0.8539, + "step": 9283 + }, + { + "epoch": 4.208522212148686, + "grad_norm": 0.12928002477809147, + "learning_rate": 5.95256441397273e-06, + "loss": 0.8575, + "step": 9284 + }, + { + "epoch": 4.20897552130553, + "grad_norm": 0.1068292384771184, + "learning_rate": 5.94592197693268e-06, + "loss": 0.8495, + "step": 9285 + }, + { + "epoch": 4.209428830462375, + "grad_norm": 0.11215006761974186, + "learning_rate": 5.939282950508625e-06, + "loss": 0.8454, + "step": 9286 + }, + { + "epoch": 4.2098821396192205, + "grad_norm": 0.11541798094565697, + "learning_rate": 5.932647335365489e-06, + "loss": 0.877, + "step": 9287 + }, + { + "epoch": 4.210335448776065, + "grad_norm": 0.11229217034809173, + "learning_rate": 5.926015132167835e-06, + "loss": 0.8541, + "step": 9288 + }, + { + "epoch": 4.21078875793291, + "grad_norm": 0.111668940581694, + "learning_rate": 5.919386341579918e-06, + "loss": 0.8538, + "step": 9289 + }, + { + "epoch": 4.2112420670897555, + "grad_norm": 0.10168638124777007, + "learning_rate": 5.912760964265607e-06, + "loss": 0.8745, + "step": 9290 + }, + { + "epoch": 4.2116953762466, + "grad_norm": 0.11852430187845218, + "learning_rate": 5.906139000888483e-06, + "loss": 0.8613, + "step": 9291 + }, + { + "epoch": 4.212148685403445, + "grad_norm": 0.11097473779527817, + "learning_rate": 5.89952045211172e-06, + "loss": 0.8624, + "step": 9292 + }, + { + "epoch": 4.21260199456029, + "grad_norm": 0.11828328801065756, + "learning_rate": 5.8929053185982255e-06, + "loss": 0.8634, + "step": 9293 + }, + { + "epoch": 4.213055303717135, + "grad_norm": 0.11453126256270789, + "learning_rate": 5.886293601010486e-06, + "loss": 0.8509, + "step": 9294 + }, + { + "epoch": 4.21350861287398, + "grad_norm": 0.10707397802600505, + "learning_rate": 5.879685300010707e-06, + "loss": 0.8504, + "step": 9295 + }, + { + "epoch": 4.213961922030825, + "grad_norm": 0.09985719637640858, + "learning_rate": 5.8730804162607166e-06, + "loss": 0.8544, + "step": 9296 + }, + { + "epoch": 4.21441523118767, + "grad_norm": 0.10473680610300148, + "learning_rate": 5.866478950422028e-06, + "loss": 0.8476, + "step": 9297 + }, + { + "epoch": 4.214868540344515, + "grad_norm": 0.11181689392570071, + "learning_rate": 5.8598809031557764e-06, + "loss": 0.8626, + "step": 9298 + }, + { + "epoch": 4.21532184950136, + "grad_norm": 0.10271800578987517, + "learning_rate": 5.8532862751228e-06, + "loss": 0.8537, + "step": 9299 + }, + { + "epoch": 4.215775158658205, + "grad_norm": 0.09895343601559733, + "learning_rate": 5.846695066983557e-06, + "loss": 0.8587, + "step": 9300 + }, + { + "epoch": 4.21622846781505, + "grad_norm": 0.1191855863690266, + "learning_rate": 5.840107279398171e-06, + "loss": 0.8605, + "step": 9301 + }, + { + "epoch": 4.216681776971895, + "grad_norm": 0.10840337046515217, + "learning_rate": 5.833522913026444e-06, + "loss": 0.8616, + "step": 9302 + }, + { + "epoch": 4.2171350861287396, + "grad_norm": 0.11195823497766329, + "learning_rate": 5.8269419685278e-06, + "loss": 0.8528, + "step": 9303 + }, + { + "epoch": 4.217588395285585, + "grad_norm": 0.11205559945722963, + "learning_rate": 5.8203644465613594e-06, + "loss": 0.8586, + "step": 9304 + }, + { + "epoch": 4.21804170444243, + "grad_norm": 0.10553088688181952, + "learning_rate": 5.8137903477858636e-06, + "loss": 0.8534, + "step": 9305 + }, + { + "epoch": 4.2184950135992745, + "grad_norm": 0.11969664891872349, + "learning_rate": 5.8072196728597455e-06, + "loss": 0.8519, + "step": 9306 + }, + { + "epoch": 4.21894832275612, + "grad_norm": 0.10711410363164875, + "learning_rate": 5.80065242244106e-06, + "loss": 0.8465, + "step": 9307 + }, + { + "epoch": 4.219401631912965, + "grad_norm": 0.13720058698852275, + "learning_rate": 5.794088597187553e-06, + "loss": 0.8593, + "step": 9308 + }, + { + "epoch": 4.219854941069809, + "grad_norm": 0.10261779548215634, + "learning_rate": 5.787528197756591e-06, + "loss": 0.8511, + "step": 9309 + }, + { + "epoch": 4.220308250226655, + "grad_norm": 0.10726231200278503, + "learning_rate": 5.780971224805236e-06, + "loss": 0.8537, + "step": 9310 + }, + { + "epoch": 4.2207615593835, + "grad_norm": 0.10128471280005782, + "learning_rate": 5.774417678990185e-06, + "loss": 0.8219, + "step": 9311 + }, + { + "epoch": 4.221214868540344, + "grad_norm": 0.11135366702566031, + "learning_rate": 5.767867560967788e-06, + "loss": 0.8413, + "step": 9312 + }, + { + "epoch": 4.22166817769719, + "grad_norm": 0.11845038686242822, + "learning_rate": 5.7613208713940535e-06, + "loss": 0.8498, + "step": 9313 + }, + { + "epoch": 4.222121486854035, + "grad_norm": 0.10328276111282299, + "learning_rate": 5.754777610924666e-06, + "loss": 0.8455, + "step": 9314 + }, + { + "epoch": 4.222574796010879, + "grad_norm": 0.13381764990139686, + "learning_rate": 5.748237780214947e-06, + "loss": 0.8639, + "step": 9315 + }, + { + "epoch": 4.2230281051677245, + "grad_norm": 0.12586645054288442, + "learning_rate": 5.741701379919873e-06, + "loss": 0.8673, + "step": 9316 + }, + { + "epoch": 4.223481414324569, + "grad_norm": 0.11400364356419372, + "learning_rate": 5.7351684106940945e-06, + "loss": 0.8725, + "step": 9317 + }, + { + "epoch": 4.223934723481414, + "grad_norm": 0.11408820718776674, + "learning_rate": 5.7286388731918915e-06, + "loss": 0.8561, + "step": 9318 + }, + { + "epoch": 4.2243880326382595, + "grad_norm": 0.10873595425334351, + "learning_rate": 5.72211276806724e-06, + "loss": 0.8659, + "step": 9319 + }, + { + "epoch": 4.224841341795104, + "grad_norm": 0.11405418851362532, + "learning_rate": 5.715590095973725e-06, + "loss": 0.8491, + "step": 9320 + }, + { + "epoch": 4.225294650951949, + "grad_norm": 0.09764628703002207, + "learning_rate": 5.709070857564629e-06, + "loss": 0.8649, + "step": 9321 + }, + { + "epoch": 4.225747960108794, + "grad_norm": 0.10078189761392484, + "learning_rate": 5.702555053492859e-06, + "loss": 0.8472, + "step": 9322 + }, + { + "epoch": 4.226201269265639, + "grad_norm": 0.10770302434907847, + "learning_rate": 5.696042684411005e-06, + "loss": 0.8607, + "step": 9323 + }, + { + "epoch": 4.226654578422484, + "grad_norm": 0.10247852312021948, + "learning_rate": 5.68953375097129e-06, + "loss": 0.8592, + "step": 9324 + }, + { + "epoch": 4.227107887579329, + "grad_norm": 0.11186644405723142, + "learning_rate": 5.683028253825611e-06, + "loss": 0.8715, + "step": 9325 + }, + { + "epoch": 4.227561196736174, + "grad_norm": 0.10763191422609084, + "learning_rate": 5.676526193625509e-06, + "loss": 0.8552, + "step": 9326 + }, + { + "epoch": 4.228014505893019, + "grad_norm": 0.10445597748414612, + "learning_rate": 5.6700275710221746e-06, + "loss": 0.8485, + "step": 9327 + }, + { + "epoch": 4.228467815049864, + "grad_norm": 0.11439752979849928, + "learning_rate": 5.6635323866664815e-06, + "loss": 0.8497, + "step": 9328 + }, + { + "epoch": 4.228921124206709, + "grad_norm": 0.10532789658245938, + "learning_rate": 5.657040641208924e-06, + "loss": 0.8458, + "step": 9329 + }, + { + "epoch": 4.229374433363554, + "grad_norm": 0.10910047900912362, + "learning_rate": 5.650552335299688e-06, + "loss": 0.8534, + "step": 9330 + }, + { + "epoch": 4.229827742520399, + "grad_norm": 0.11213321875075723, + "learning_rate": 5.6440674695885875e-06, + "loss": 0.8329, + "step": 9331 + }, + { + "epoch": 4.2302810516772436, + "grad_norm": 0.11244062115429881, + "learning_rate": 5.637586044725098e-06, + "loss": 0.8338, + "step": 9332 + }, + { + "epoch": 4.230734360834089, + "grad_norm": 0.10296174672664316, + "learning_rate": 5.6311080613583505e-06, + "loss": 0.8392, + "step": 9333 + }, + { + "epoch": 4.231187669990934, + "grad_norm": 0.1080102052999728, + "learning_rate": 5.624633520137144e-06, + "loss": 0.8663, + "step": 9334 + }, + { + "epoch": 4.2316409791477785, + "grad_norm": 0.1080181631597641, + "learning_rate": 5.618162421709912e-06, + "loss": 0.8639, + "step": 9335 + }, + { + "epoch": 4.232094288304624, + "grad_norm": 0.10768750057752138, + "learning_rate": 5.611694766724763e-06, + "loss": 0.8353, + "step": 9336 + }, + { + "epoch": 4.232547597461469, + "grad_norm": 0.10112008038050369, + "learning_rate": 5.605230555829444e-06, + "loss": 0.8492, + "step": 9337 + }, + { + "epoch": 4.233000906618313, + "grad_norm": 0.1278606606792553, + "learning_rate": 5.598769789671377e-06, + "loss": 0.841, + "step": 9338 + }, + { + "epoch": 4.233454215775159, + "grad_norm": 0.1146502756630061, + "learning_rate": 5.5923124688976115e-06, + "loss": 0.8587, + "step": 9339 + }, + { + "epoch": 4.233907524932004, + "grad_norm": 0.11604181836172953, + "learning_rate": 5.5858585941548805e-06, + "loss": 0.8556, + "step": 9340 + }, + { + "epoch": 4.234360834088848, + "grad_norm": 0.13779384238999884, + "learning_rate": 5.579408166089555e-06, + "loss": 0.8748, + "step": 9341 + }, + { + "epoch": 4.234814143245694, + "grad_norm": 0.10871932387124182, + "learning_rate": 5.572961185347652e-06, + "loss": 0.8582, + "step": 9342 + }, + { + "epoch": 4.235267452402539, + "grad_norm": 0.13418790269982767, + "learning_rate": 5.566517652574876e-06, + "loss": 0.8414, + "step": 9343 + }, + { + "epoch": 4.235720761559383, + "grad_norm": 0.10358782609726037, + "learning_rate": 5.560077568416544e-06, + "loss": 0.8476, + "step": 9344 + }, + { + "epoch": 4.2361740707162285, + "grad_norm": 0.10755056844639473, + "learning_rate": 5.5536409335176725e-06, + "loss": 0.8655, + "step": 9345 + }, + { + "epoch": 4.236627379873074, + "grad_norm": 0.11979586333788579, + "learning_rate": 5.547207748522887e-06, + "loss": 0.8599, + "step": 9346 + }, + { + "epoch": 4.237080689029918, + "grad_norm": 0.10024369890681302, + "learning_rate": 5.540778014076509e-06, + "loss": 0.8551, + "step": 9347 + }, + { + "epoch": 4.2375339981867635, + "grad_norm": 0.11774862609345126, + "learning_rate": 5.534351730822476e-06, + "loss": 0.8518, + "step": 9348 + }, + { + "epoch": 4.237987307343609, + "grad_norm": 0.11070042723575589, + "learning_rate": 5.52792889940442e-06, + "loss": 0.8602, + "step": 9349 + }, + { + "epoch": 4.238440616500453, + "grad_norm": 0.11020968108368566, + "learning_rate": 5.5215095204656e-06, + "loss": 0.8607, + "step": 9350 + }, + { + "epoch": 4.238893925657298, + "grad_norm": 0.11644236195485354, + "learning_rate": 5.515093594648928e-06, + "loss": 0.8611, + "step": 9351 + }, + { + "epoch": 4.239347234814144, + "grad_norm": 0.11243270734098021, + "learning_rate": 5.50868112259697e-06, + "loss": 0.8658, + "step": 9352 + }, + { + "epoch": 4.239800543970988, + "grad_norm": 0.11156721497527133, + "learning_rate": 5.502272104951982e-06, + "loss": 0.8658, + "step": 9353 + }, + { + "epoch": 4.240253853127833, + "grad_norm": 0.10587751721247415, + "learning_rate": 5.495866542355814e-06, + "loss": 0.8478, + "step": 9354 + }, + { + "epoch": 4.240707162284679, + "grad_norm": 0.11429100450438912, + "learning_rate": 5.489464435450029e-06, + "loss": 0.8499, + "step": 9355 + }, + { + "epoch": 4.241160471441523, + "grad_norm": 0.09881741423187988, + "learning_rate": 5.483065784875807e-06, + "loss": 0.8531, + "step": 9356 + }, + { + "epoch": 4.241613780598368, + "grad_norm": 0.09602830238766888, + "learning_rate": 5.476670591273978e-06, + "loss": 0.8458, + "step": 9357 + }, + { + "epoch": 4.242067089755213, + "grad_norm": 0.12092474678403503, + "learning_rate": 5.470278855285065e-06, + "loss": 0.8684, + "step": 9358 + }, + { + "epoch": 4.242520398912058, + "grad_norm": 0.10648962652748893, + "learning_rate": 5.463890577549191e-06, + "loss": 0.8705, + "step": 9359 + }, + { + "epoch": 4.242973708068903, + "grad_norm": 0.12363095031683162, + "learning_rate": 5.457505758706187e-06, + "loss": 0.8502, + "step": 9360 + }, + { + "epoch": 4.2434270172257476, + "grad_norm": 0.1132577407585533, + "learning_rate": 5.4511243993954935e-06, + "loss": 0.8603, + "step": 9361 + }, + { + "epoch": 4.243880326382593, + "grad_norm": 0.10510825519182798, + "learning_rate": 5.444746500256237e-06, + "loss": 0.8702, + "step": 9362 + }, + { + "epoch": 4.244333635539438, + "grad_norm": 0.12852186152621659, + "learning_rate": 5.438372061927162e-06, + "loss": 0.843, + "step": 9363 + }, + { + "epoch": 4.2447869446962825, + "grad_norm": 0.11384806556870954, + "learning_rate": 5.432001085046712e-06, + "loss": 0.8346, + "step": 9364 + }, + { + "epoch": 4.245240253853128, + "grad_norm": 0.1401611182104351, + "learning_rate": 5.425633570252933e-06, + "loss": 0.8493, + "step": 9365 + }, + { + "epoch": 4.245693563009973, + "grad_norm": 0.11122035975397418, + "learning_rate": 5.419269518183576e-06, + "loss": 0.8444, + "step": 9366 + }, + { + "epoch": 4.246146872166817, + "grad_norm": 0.12132013555667824, + "learning_rate": 5.412908929476009e-06, + "loss": 0.8533, + "step": 9367 + }, + { + "epoch": 4.246600181323663, + "grad_norm": 0.10437153800280212, + "learning_rate": 5.406551804767253e-06, + "loss": 0.8466, + "step": 9368 + }, + { + "epoch": 4.247053490480508, + "grad_norm": 0.09962388775745974, + "learning_rate": 5.4001981446940084e-06, + "loss": 0.8501, + "step": 9369 + }, + { + "epoch": 4.247506799637352, + "grad_norm": 0.12358672726098141, + "learning_rate": 5.393847949892612e-06, + "loss": 0.8647, + "step": 9370 + }, + { + "epoch": 4.247960108794198, + "grad_norm": 0.10868758105750338, + "learning_rate": 5.387501220999043e-06, + "loss": 0.8585, + "step": 9371 + }, + { + "epoch": 4.248413417951043, + "grad_norm": 0.11182190142700867, + "learning_rate": 5.381157958648948e-06, + "loss": 0.8324, + "step": 9372 + }, + { + "epoch": 4.248866727107887, + "grad_norm": 0.12773445089906132, + "learning_rate": 5.374818163477629e-06, + "loss": 0.8726, + "step": 9373 + }, + { + "epoch": 4.2493200362647325, + "grad_norm": 0.12610877268183998, + "learning_rate": 5.3684818361200255e-06, + "loss": 0.8326, + "step": 9374 + }, + { + "epoch": 4.249773345421578, + "grad_norm": 0.10544323773546682, + "learning_rate": 5.3621489772107596e-06, + "loss": 0.8403, + "step": 9375 + }, + { + "epoch": 4.250226654578422, + "grad_norm": 0.1275797042836495, + "learning_rate": 5.355819587384057e-06, + "loss": 0.8435, + "step": 9376 + }, + { + "epoch": 4.2506799637352675, + "grad_norm": 0.12291237752466316, + "learning_rate": 5.349493667273851e-06, + "loss": 0.8531, + "step": 9377 + }, + { + "epoch": 4.251133272892113, + "grad_norm": 0.10621663733203143, + "learning_rate": 5.343171217513683e-06, + "loss": 0.8445, + "step": 9378 + }, + { + "epoch": 4.251586582048957, + "grad_norm": 0.1294563062386802, + "learning_rate": 5.336852238736781e-06, + "loss": 0.8743, + "step": 9379 + }, + { + "epoch": 4.252039891205802, + "grad_norm": 0.12107958258075914, + "learning_rate": 5.330536731575993e-06, + "loss": 0.8446, + "step": 9380 + }, + { + "epoch": 4.252493200362648, + "grad_norm": 0.10702216412940735, + "learning_rate": 5.324224696663849e-06, + "loss": 0.8362, + "step": 9381 + }, + { + "epoch": 4.252946509519492, + "grad_norm": 0.11003674806614124, + "learning_rate": 5.317916134632511e-06, + "loss": 0.8727, + "step": 9382 + }, + { + "epoch": 4.253399818676337, + "grad_norm": 0.12097559631439221, + "learning_rate": 5.311611046113792e-06, + "loss": 0.8443, + "step": 9383 + }, + { + "epoch": 4.253853127833183, + "grad_norm": 0.1318708855911713, + "learning_rate": 5.305309431739188e-06, + "loss": 0.8728, + "step": 9384 + }, + { + "epoch": 4.254306436990027, + "grad_norm": 0.11794700899462372, + "learning_rate": 5.299011292139793e-06, + "loss": 0.8576, + "step": 9385 + }, + { + "epoch": 4.254759746146872, + "grad_norm": 0.1153167315172711, + "learning_rate": 5.292716627946415e-06, + "loss": 0.8794, + "step": 9386 + }, + { + "epoch": 4.2552130553037175, + "grad_norm": 0.14628127259254367, + "learning_rate": 5.286425439789465e-06, + "loss": 0.8506, + "step": 9387 + }, + { + "epoch": 4.255666364460562, + "grad_norm": 0.10080949378266184, + "learning_rate": 5.280137728299028e-06, + "loss": 0.8475, + "step": 9388 + }, + { + "epoch": 4.256119673617407, + "grad_norm": 0.12787132369097434, + "learning_rate": 5.273853494104844e-06, + "loss": 0.8464, + "step": 9389 + }, + { + "epoch": 4.2565729827742524, + "grad_norm": 0.1266338485295343, + "learning_rate": 5.267572737836291e-06, + "loss": 0.8389, + "step": 9390 + }, + { + "epoch": 4.257026291931097, + "grad_norm": 0.11536559437061109, + "learning_rate": 5.261295460122395e-06, + "loss": 0.8584, + "step": 9391 + }, + { + "epoch": 4.257479601087942, + "grad_norm": 0.10459793484821504, + "learning_rate": 5.255021661591859e-06, + "loss": 0.8651, + "step": 9392 + }, + { + "epoch": 4.2579329102447865, + "grad_norm": 0.12975402407813713, + "learning_rate": 5.248751342873011e-06, + "loss": 0.8578, + "step": 9393 + }, + { + "epoch": 4.258386219401632, + "grad_norm": 0.11876545432276511, + "learning_rate": 5.242484504593859e-06, + "loss": 0.8441, + "step": 9394 + }, + { + "epoch": 4.258839528558477, + "grad_norm": 0.11108062706713702, + "learning_rate": 5.236221147382021e-06, + "loss": 0.8592, + "step": 9395 + }, + { + "epoch": 4.259292837715321, + "grad_norm": 0.11861158705986341, + "learning_rate": 5.229961271864819e-06, + "loss": 0.8549, + "step": 9396 + }, + { + "epoch": 4.259746146872167, + "grad_norm": 0.13449783675737628, + "learning_rate": 5.223704878669176e-06, + "loss": 0.8717, + "step": 9397 + }, + { + "epoch": 4.260199456029012, + "grad_norm": 0.11539158073330087, + "learning_rate": 5.217451968421689e-06, + "loss": 0.8599, + "step": 9398 + }, + { + "epoch": 4.260652765185856, + "grad_norm": 0.10248729798643265, + "learning_rate": 5.211202541748624e-06, + "loss": 0.8769, + "step": 9399 + }, + { + "epoch": 4.261106074342702, + "grad_norm": 0.12015860520327405, + "learning_rate": 5.204956599275854e-06, + "loss": 0.8522, + "step": 9400 + }, + { + "epoch": 4.261559383499547, + "grad_norm": 0.11825721636228244, + "learning_rate": 5.1987141416289485e-06, + "loss": 0.8627, + "step": 9401 + }, + { + "epoch": 4.262012692656391, + "grad_norm": 0.12952095633049465, + "learning_rate": 5.1924751694330954e-06, + "loss": 0.8454, + "step": 9402 + }, + { + "epoch": 4.2624660018132365, + "grad_norm": 0.11656610425335003, + "learning_rate": 5.186239683313159e-06, + "loss": 0.8412, + "step": 9403 + }, + { + "epoch": 4.262919310970082, + "grad_norm": 0.12157053783420896, + "learning_rate": 5.180007683893626e-06, + "loss": 0.8392, + "step": 9404 + }, + { + "epoch": 4.263372620126926, + "grad_norm": 0.11870763759007198, + "learning_rate": 5.173779171798665e-06, + "loss": 0.8356, + "step": 9405 + }, + { + "epoch": 4.2638259292837715, + "grad_norm": 0.13273521144816047, + "learning_rate": 5.1675541476520655e-06, + "loss": 0.8674, + "step": 9406 + }, + { + "epoch": 4.264279238440617, + "grad_norm": 0.11846810540247066, + "learning_rate": 5.161332612077297e-06, + "loss": 0.8804, + "step": 9407 + }, + { + "epoch": 4.264732547597461, + "grad_norm": 0.13138106084997933, + "learning_rate": 5.1551145656974564e-06, + "loss": 0.8574, + "step": 9408 + }, + { + "epoch": 4.265185856754306, + "grad_norm": 0.1413546006972058, + "learning_rate": 5.1489000091353e-06, + "loss": 0.8551, + "step": 9409 + }, + { + "epoch": 4.265639165911152, + "grad_norm": 0.1064375540120899, + "learning_rate": 5.142688943013223e-06, + "loss": 0.8509, + "step": 9410 + }, + { + "epoch": 4.266092475067996, + "grad_norm": 0.12306216657933157, + "learning_rate": 5.1364813679533006e-06, + "loss": 0.8499, + "step": 9411 + }, + { + "epoch": 4.266545784224841, + "grad_norm": 0.12350129562299854, + "learning_rate": 5.130277284577223e-06, + "loss": 0.8702, + "step": 9412 + }, + { + "epoch": 4.266999093381687, + "grad_norm": 0.12544466411355604, + "learning_rate": 5.124076693506364e-06, + "loss": 0.8796, + "step": 9413 + }, + { + "epoch": 4.267452402538531, + "grad_norm": 0.1264511763387126, + "learning_rate": 5.117879595361723e-06, + "loss": 0.8562, + "step": 9414 + }, + { + "epoch": 4.267905711695376, + "grad_norm": 0.1129503665919243, + "learning_rate": 5.111685990763948e-06, + "loss": 0.8604, + "step": 9415 + }, + { + "epoch": 4.2683590208522215, + "grad_norm": 0.11690213625384595, + "learning_rate": 5.105495880333364e-06, + "loss": 0.8597, + "step": 9416 + }, + { + "epoch": 4.268812330009066, + "grad_norm": 0.11680984826497783, + "learning_rate": 5.099309264689911e-06, + "loss": 0.8594, + "step": 9417 + }, + { + "epoch": 4.269265639165911, + "grad_norm": 0.11694535576188944, + "learning_rate": 5.093126144453217e-06, + "loss": 0.8475, + "step": 9418 + }, + { + "epoch": 4.2697189483227564, + "grad_norm": 0.1227345302167215, + "learning_rate": 5.086946520242521e-06, + "loss": 0.8752, + "step": 9419 + }, + { + "epoch": 4.270172257479601, + "grad_norm": 0.10333917423686452, + "learning_rate": 5.080770392676746e-06, + "loss": 0.8527, + "step": 9420 + }, + { + "epoch": 4.270625566636446, + "grad_norm": 0.11945181247336459, + "learning_rate": 5.074597762374436e-06, + "loss": 0.8635, + "step": 9421 + }, + { + "epoch": 4.271078875793291, + "grad_norm": 0.10751550453680017, + "learning_rate": 5.068428629953807e-06, + "loss": 0.8476, + "step": 9422 + }, + { + "epoch": 4.271532184950136, + "grad_norm": 0.12189492433701817, + "learning_rate": 5.062262996032718e-06, + "loss": 0.8653, + "step": 9423 + }, + { + "epoch": 4.271985494106981, + "grad_norm": 0.12010508161147208, + "learning_rate": 5.056100861228657e-06, + "loss": 0.8689, + "step": 9424 + }, + { + "epoch": 4.272438803263826, + "grad_norm": 0.10491837628791943, + "learning_rate": 5.049942226158804e-06, + "loss": 0.8592, + "step": 9425 + }, + { + "epoch": 4.272892112420671, + "grad_norm": 0.12073533912330082, + "learning_rate": 5.043787091439947e-06, + "loss": 0.8655, + "step": 9426 + }, + { + "epoch": 4.273345421577516, + "grad_norm": 0.11986440361180911, + "learning_rate": 5.037635457688552e-06, + "loss": 0.886, + "step": 9427 + }, + { + "epoch": 4.273798730734361, + "grad_norm": 0.11633846186557256, + "learning_rate": 5.031487325520718e-06, + "loss": 0.8288, + "step": 9428 + }, + { + "epoch": 4.274252039891206, + "grad_norm": 0.12118982925506452, + "learning_rate": 5.025342695552202e-06, + "loss": 0.8566, + "step": 9429 + }, + { + "epoch": 4.274705349048051, + "grad_norm": 0.11625845310167425, + "learning_rate": 5.019201568398395e-06, + "loss": 0.8387, + "step": 9430 + }, + { + "epoch": 4.275158658204896, + "grad_norm": 0.11102961315599558, + "learning_rate": 5.013063944674365e-06, + "loss": 0.862, + "step": 9431 + }, + { + "epoch": 4.2756119673617405, + "grad_norm": 0.10754574765791569, + "learning_rate": 5.006929824994795e-06, + "loss": 0.8571, + "step": 9432 + }, + { + "epoch": 4.276065276518586, + "grad_norm": 0.11966213783000947, + "learning_rate": 5.000799209974059e-06, + "loss": 0.8351, + "step": 9433 + }, + { + "epoch": 4.276518585675431, + "grad_norm": 0.13116370136713046, + "learning_rate": 4.994672100226132e-06, + "loss": 0.8621, + "step": 9434 + }, + { + "epoch": 4.2769718948322755, + "grad_norm": 0.11798105650787924, + "learning_rate": 4.98854849636468e-06, + "loss": 0.8479, + "step": 9435 + }, + { + "epoch": 4.277425203989121, + "grad_norm": 0.12818982821870487, + "learning_rate": 4.982428399002985e-06, + "loss": 0.8583, + "step": 9436 + }, + { + "epoch": 4.277878513145966, + "grad_norm": 0.10292871168722763, + "learning_rate": 4.976311808754011e-06, + "loss": 0.8528, + "step": 9437 + }, + { + "epoch": 4.27833182230281, + "grad_norm": 0.10949013788284272, + "learning_rate": 4.970198726230333e-06, + "loss": 0.8697, + "step": 9438 + }, + { + "epoch": 4.278785131459656, + "grad_norm": 0.11127892922284129, + "learning_rate": 4.964089152044209e-06, + "loss": 0.8673, + "step": 9439 + }, + { + "epoch": 4.2792384406165, + "grad_norm": 0.11175759097301544, + "learning_rate": 4.957983086807527e-06, + "loss": 0.8587, + "step": 9440 + }, + { + "epoch": 4.279691749773345, + "grad_norm": 0.10566471011532225, + "learning_rate": 4.951880531131817e-06, + "loss": 0.8599, + "step": 9441 + }, + { + "epoch": 4.280145058930191, + "grad_norm": 0.10621058979691513, + "learning_rate": 4.945781485628285e-06, + "loss": 0.8401, + "step": 9442 + }, + { + "epoch": 4.280598368087035, + "grad_norm": 0.10996372232889634, + "learning_rate": 4.939685950907751e-06, + "loss": 0.8549, + "step": 9443 + }, + { + "epoch": 4.28105167724388, + "grad_norm": 0.11668648398081727, + "learning_rate": 4.9335939275807175e-06, + "loss": 0.8367, + "step": 9444 + }, + { + "epoch": 4.2815049864007255, + "grad_norm": 0.11457019608205951, + "learning_rate": 4.927505416257301e-06, + "loss": 0.8409, + "step": 9445 + }, + { + "epoch": 4.28195829555757, + "grad_norm": 0.0973059075421198, + "learning_rate": 4.921420417547311e-06, + "loss": 0.8313, + "step": 9446 + }, + { + "epoch": 4.282411604714415, + "grad_norm": 0.13485336858720948, + "learning_rate": 4.915338932060146e-06, + "loss": 0.8675, + "step": 9447 + }, + { + "epoch": 4.2828649138712604, + "grad_norm": 0.10635642479184554, + "learning_rate": 4.9092609604049025e-06, + "loss": 0.8635, + "step": 9448 + }, + { + "epoch": 4.283318223028105, + "grad_norm": 0.10515579458813351, + "learning_rate": 4.9031865031903e-06, + "loss": 0.8533, + "step": 9449 + }, + { + "epoch": 4.28377153218495, + "grad_norm": 0.11355504414130674, + "learning_rate": 4.897115561024723e-06, + "loss": 0.8709, + "step": 9450 + }, + { + "epoch": 4.284224841341795, + "grad_norm": 0.10694993288052874, + "learning_rate": 4.891048134516178e-06, + "loss": 0.8766, + "step": 9451 + }, + { + "epoch": 4.28467815049864, + "grad_norm": 0.09491488477710823, + "learning_rate": 4.884984224272358e-06, + "loss": 0.8406, + "step": 9452 + }, + { + "epoch": 4.285131459655485, + "grad_norm": 0.10819350652950945, + "learning_rate": 4.878923830900557e-06, + "loss": 0.8476, + "step": 9453 + }, + { + "epoch": 4.28558476881233, + "grad_norm": 0.09942948889551464, + "learning_rate": 4.872866955007762e-06, + "loss": 0.8591, + "step": 9454 + }, + { + "epoch": 4.286038077969175, + "grad_norm": 0.12692701607531626, + "learning_rate": 4.866813597200577e-06, + "loss": 0.8552, + "step": 9455 + }, + { + "epoch": 4.28649138712602, + "grad_norm": 0.08936252426155736, + "learning_rate": 4.860763758085258e-06, + "loss": 0.8507, + "step": 9456 + }, + { + "epoch": 4.286944696282865, + "grad_norm": 0.10392482731186634, + "learning_rate": 4.854717438267731e-06, + "loss": 0.8568, + "step": 9457 + }, + { + "epoch": 4.28739800543971, + "grad_norm": 0.11973129461783441, + "learning_rate": 4.8486746383535324e-06, + "loss": 0.8404, + "step": 9458 + }, + { + "epoch": 4.287851314596555, + "grad_norm": 0.10397866851509746, + "learning_rate": 4.842635358947889e-06, + "loss": 0.8429, + "step": 9459 + }, + { + "epoch": 4.2883046237534, + "grad_norm": 0.09918074967219842, + "learning_rate": 4.8365996006556245e-06, + "loss": 0.8427, + "step": 9460 + }, + { + "epoch": 4.2887579329102445, + "grad_norm": 0.11397775018516902, + "learning_rate": 4.830567364081269e-06, + "loss": 0.8566, + "step": 9461 + }, + { + "epoch": 4.28921124206709, + "grad_norm": 0.11430099408649848, + "learning_rate": 4.824538649828938e-06, + "loss": 0.8483, + "step": 9462 + }, + { + "epoch": 4.289664551223935, + "grad_norm": 0.11077762632305764, + "learning_rate": 4.8185134585024515e-06, + "loss": 0.8495, + "step": 9463 + }, + { + "epoch": 4.2901178603807795, + "grad_norm": 0.09687646417828884, + "learning_rate": 4.81249179070523e-06, + "loss": 0.8342, + "step": 9464 + }, + { + "epoch": 4.290571169537625, + "grad_norm": 0.11638021972546764, + "learning_rate": 4.806473647040379e-06, + "loss": 0.8694, + "step": 9465 + }, + { + "epoch": 4.29102447869447, + "grad_norm": 0.10666850962698245, + "learning_rate": 4.800459028110624e-06, + "loss": 0.8713, + "step": 9466 + }, + { + "epoch": 4.291477787851314, + "grad_norm": 0.1101019335943472, + "learning_rate": 4.794447934518345e-06, + "loss": 0.8616, + "step": 9467 + }, + { + "epoch": 4.29193109700816, + "grad_norm": 0.09973690093261338, + "learning_rate": 4.788440366865565e-06, + "loss": 0.8615, + "step": 9468 + }, + { + "epoch": 4.292384406165005, + "grad_norm": 0.11819783357109524, + "learning_rate": 4.7824363257539784e-06, + "loss": 0.8487, + "step": 9469 + }, + { + "epoch": 4.292837715321849, + "grad_norm": 0.11135519599816826, + "learning_rate": 4.776435811784894e-06, + "loss": 0.8618, + "step": 9470 + }, + { + "epoch": 4.293291024478695, + "grad_norm": 0.10157661324596154, + "learning_rate": 4.770438825559276e-06, + "loss": 0.8599, + "step": 9471 + }, + { + "epoch": 4.293744333635539, + "grad_norm": 0.10831289783796698, + "learning_rate": 4.764445367677755e-06, + "loss": 0.8469, + "step": 9472 + }, + { + "epoch": 4.294197642792384, + "grad_norm": 0.08885808133020377, + "learning_rate": 4.758455438740574e-06, + "loss": 0.8684, + "step": 9473 + }, + { + "epoch": 4.2946509519492295, + "grad_norm": 0.09312774153495476, + "learning_rate": 4.7524690393476645e-06, + "loss": 0.8761, + "step": 9474 + }, + { + "epoch": 4.295104261106074, + "grad_norm": 0.10203992937574392, + "learning_rate": 4.746486170098563e-06, + "loss": 0.8604, + "step": 9475 + }, + { + "epoch": 4.295557570262919, + "grad_norm": 0.10640736984244542, + "learning_rate": 4.740506831592488e-06, + "loss": 0.8524, + "step": 9476 + }, + { + "epoch": 4.2960108794197644, + "grad_norm": 0.0940251229458805, + "learning_rate": 4.734531024428273e-06, + "loss": 0.8415, + "step": 9477 + }, + { + "epoch": 4.296464188576609, + "grad_norm": 0.11216286505999415, + "learning_rate": 4.728558749204428e-06, + "loss": 0.8751, + "step": 9478 + }, + { + "epoch": 4.296917497733454, + "grad_norm": 0.10894941918369268, + "learning_rate": 4.722590006519072e-06, + "loss": 0.8471, + "step": 9479 + }, + { + "epoch": 4.297370806890299, + "grad_norm": 0.10362002590229168, + "learning_rate": 4.71662479697002e-06, + "loss": 0.841, + "step": 9480 + }, + { + "epoch": 4.297824116047144, + "grad_norm": 0.10769408541525174, + "learning_rate": 4.710663121154686e-06, + "loss": 0.8557, + "step": 9481 + }, + { + "epoch": 4.298277425203989, + "grad_norm": 0.10113876077469908, + "learning_rate": 4.704704979670145e-06, + "loss": 0.8745, + "step": 9482 + }, + { + "epoch": 4.298730734360834, + "grad_norm": 0.13468151126631867, + "learning_rate": 4.698750373113141e-06, + "loss": 0.8712, + "step": 9483 + }, + { + "epoch": 4.299184043517679, + "grad_norm": 0.09716229909622653, + "learning_rate": 4.692799302080029e-06, + "loss": 0.8283, + "step": 9484 + }, + { + "epoch": 4.299637352674524, + "grad_norm": 0.09538102293517213, + "learning_rate": 4.686851767166847e-06, + "loss": 0.8667, + "step": 9485 + }, + { + "epoch": 4.300090661831369, + "grad_norm": 0.10450673769975234, + "learning_rate": 4.68090776896923e-06, + "loss": 0.8852, + "step": 9486 + }, + { + "epoch": 4.300543970988214, + "grad_norm": 0.09646254338253286, + "learning_rate": 4.67496730808251e-06, + "loss": 0.8477, + "step": 9487 + }, + { + "epoch": 4.300997280145059, + "grad_norm": 0.09143224224700235, + "learning_rate": 4.66903038510162e-06, + "loss": 0.8576, + "step": 9488 + }, + { + "epoch": 4.301450589301904, + "grad_norm": 0.10729702475695306, + "learning_rate": 4.663097000621184e-06, + "loss": 0.8367, + "step": 9489 + }, + { + "epoch": 4.3019038984587485, + "grad_norm": 0.11027607683346, + "learning_rate": 4.657167155235427e-06, + "loss": 0.8625, + "step": 9490 + }, + { + "epoch": 4.302357207615594, + "grad_norm": 0.11041019796855304, + "learning_rate": 4.651240849538256e-06, + "loss": 0.8652, + "step": 9491 + }, + { + "epoch": 4.302810516772439, + "grad_norm": 0.18008272645554746, + "learning_rate": 4.645318084123199e-06, + "loss": 0.8717, + "step": 9492 + }, + { + "epoch": 4.3032638259292835, + "grad_norm": 0.11757392896138213, + "learning_rate": 4.639398859583444e-06, + "loss": 0.8303, + "step": 9493 + }, + { + "epoch": 4.303717135086129, + "grad_norm": 0.10313646805453407, + "learning_rate": 4.633483176511808e-06, + "loss": 0.8405, + "step": 9494 + }, + { + "epoch": 4.304170444242974, + "grad_norm": 0.13187879173390848, + "learning_rate": 4.627571035500787e-06, + "loss": 0.8708, + "step": 9495 + }, + { + "epoch": 4.304623753399818, + "grad_norm": 0.129388299109953, + "learning_rate": 4.621662437142478e-06, + "loss": 0.8642, + "step": 9496 + }, + { + "epoch": 4.305077062556664, + "grad_norm": 0.13024655901527696, + "learning_rate": 4.615757382028645e-06, + "loss": 0.861, + "step": 9497 + }, + { + "epoch": 4.305530371713509, + "grad_norm": 0.11290096095543842, + "learning_rate": 4.6098558707507125e-06, + "loss": 0.8596, + "step": 9498 + }, + { + "epoch": 4.305983680870353, + "grad_norm": 0.14015179685155119, + "learning_rate": 4.603957903899714e-06, + "loss": 0.8735, + "step": 9499 + }, + { + "epoch": 4.306436990027199, + "grad_norm": 0.12664809981866856, + "learning_rate": 4.598063482066368e-06, + "loss": 0.8488, + "step": 9500 + }, + { + "epoch": 4.306890299184044, + "grad_norm": 0.10697018697240818, + "learning_rate": 4.592172605840999e-06, + "loss": 0.8395, + "step": 9501 + }, + { + "epoch": 4.307343608340888, + "grad_norm": 0.11929995023283059, + "learning_rate": 4.586285275813614e-06, + "loss": 0.8385, + "step": 9502 + }, + { + "epoch": 4.3077969174977335, + "grad_norm": 0.1206629874936286, + "learning_rate": 4.580401492573829e-06, + "loss": 0.8397, + "step": 9503 + }, + { + "epoch": 4.308250226654579, + "grad_norm": 0.13012869207594482, + "learning_rate": 4.574521256710949e-06, + "loss": 0.8485, + "step": 9504 + }, + { + "epoch": 4.308703535811423, + "grad_norm": 0.1187724718721318, + "learning_rate": 4.56864456881386e-06, + "loss": 0.8565, + "step": 9505 + }, + { + "epoch": 4.3091568449682685, + "grad_norm": 0.10201634453835805, + "learning_rate": 4.5627714294711555e-06, + "loss": 0.8593, + "step": 9506 + }, + { + "epoch": 4.309610154125114, + "grad_norm": 0.1209323165969495, + "learning_rate": 4.556901839271035e-06, + "loss": 0.8555, + "step": 9507 + }, + { + "epoch": 4.310063463281958, + "grad_norm": 0.11583291366414358, + "learning_rate": 4.551035798801367e-06, + "loss": 0.8564, + "step": 9508 + }, + { + "epoch": 4.310516772438803, + "grad_norm": 0.10084490061750324, + "learning_rate": 4.54517330864964e-06, + "loss": 0.8466, + "step": 9509 + }, + { + "epoch": 4.310970081595649, + "grad_norm": 0.12083796895995842, + "learning_rate": 4.539314369403011e-06, + "loss": 0.868, + "step": 9510 + }, + { + "epoch": 4.311423390752493, + "grad_norm": 0.10635355391449064, + "learning_rate": 4.5334589816482666e-06, + "loss": 0.8592, + "step": 9511 + }, + { + "epoch": 4.311876699909338, + "grad_norm": 0.10719638644214406, + "learning_rate": 4.527607145971833e-06, + "loss": 0.8395, + "step": 9512 + }, + { + "epoch": 4.312330009066184, + "grad_norm": 0.11952298744750013, + "learning_rate": 4.5217588629598015e-06, + "loss": 0.8443, + "step": 9513 + }, + { + "epoch": 4.312783318223028, + "grad_norm": 0.10978281508174274, + "learning_rate": 4.515914133197883e-06, + "loss": 0.8416, + "step": 9514 + }, + { + "epoch": 4.313236627379873, + "grad_norm": 0.1379165973711206, + "learning_rate": 4.5100729572714566e-06, + "loss": 0.8446, + "step": 9515 + }, + { + "epoch": 4.3136899365367185, + "grad_norm": 0.11718628974891973, + "learning_rate": 4.504235335765517e-06, + "loss": 0.8565, + "step": 9516 + }, + { + "epoch": 4.314143245693563, + "grad_norm": 0.1286911024183255, + "learning_rate": 4.498401269264734e-06, + "loss": 0.865, + "step": 9517 + }, + { + "epoch": 4.314596554850408, + "grad_norm": 0.12531890846946842, + "learning_rate": 4.492570758353401e-06, + "loss": 0.8595, + "step": 9518 + }, + { + "epoch": 4.3150498640072525, + "grad_norm": 0.1350955084839858, + "learning_rate": 4.486743803615463e-06, + "loss": 0.852, + "step": 9519 + }, + { + "epoch": 4.315503173164098, + "grad_norm": 0.11803466838808419, + "learning_rate": 4.480920405634499e-06, + "loss": 0.8547, + "step": 9520 + }, + { + "epoch": 4.315956482320943, + "grad_norm": 0.13201918682227295, + "learning_rate": 4.475100564993749e-06, + "loss": 0.8714, + "step": 9521 + }, + { + "epoch": 4.3164097914777875, + "grad_norm": 0.1351907031901681, + "learning_rate": 4.469284282276087e-06, + "loss": 0.85, + "step": 9522 + }, + { + "epoch": 4.316863100634633, + "grad_norm": 0.10303937708593833, + "learning_rate": 4.463471558064027e-06, + "loss": 0.8653, + "step": 9523 + }, + { + "epoch": 4.317316409791478, + "grad_norm": 0.10580180261451035, + "learning_rate": 4.457662392939721e-06, + "loss": 0.8465, + "step": 9524 + }, + { + "epoch": 4.317769718948322, + "grad_norm": 0.11928699503020132, + "learning_rate": 4.451856787484991e-06, + "loss": 0.8606, + "step": 9525 + }, + { + "epoch": 4.318223028105168, + "grad_norm": 0.09684698896068986, + "learning_rate": 4.44605474228128e-06, + "loss": 0.8543, + "step": 9526 + }, + { + "epoch": 4.318676337262013, + "grad_norm": 0.10387017638729303, + "learning_rate": 4.44025625790967e-06, + "loss": 0.8628, + "step": 9527 + }, + { + "epoch": 4.319129646418857, + "grad_norm": 0.12343032104651704, + "learning_rate": 4.434461334950908e-06, + "loss": 0.8507, + "step": 9528 + }, + { + "epoch": 4.319582955575703, + "grad_norm": 0.10057760162913464, + "learning_rate": 4.428669973985363e-06, + "loss": 0.848, + "step": 9529 + }, + { + "epoch": 4.320036264732548, + "grad_norm": 0.10193084768628718, + "learning_rate": 4.42288217559307e-06, + "loss": 0.8686, + "step": 9530 + }, + { + "epoch": 4.320489573889392, + "grad_norm": 0.11610512973770121, + "learning_rate": 4.4170979403536805e-06, + "loss": 0.8617, + "step": 9531 + }, + { + "epoch": 4.3209428830462375, + "grad_norm": 0.1011754098509756, + "learning_rate": 4.411317268846511e-06, + "loss": 0.826, + "step": 9532 + }, + { + "epoch": 4.321396192203083, + "grad_norm": 0.11062080047292835, + "learning_rate": 4.405540161650508e-06, + "loss": 0.8413, + "step": 9533 + }, + { + "epoch": 4.321849501359927, + "grad_norm": 0.11288024863839327, + "learning_rate": 4.399766619344275e-06, + "loss": 0.8557, + "step": 9534 + }, + { + "epoch": 4.3223028105167725, + "grad_norm": 0.10392984135756181, + "learning_rate": 4.393996642506033e-06, + "loss": 0.8418, + "step": 9535 + }, + { + "epoch": 4.322756119673618, + "grad_norm": 0.10221004906965721, + "learning_rate": 4.388230231713677e-06, + "loss": 0.8365, + "step": 9536 + }, + { + "epoch": 4.323209428830462, + "grad_norm": 0.1184698123799368, + "learning_rate": 4.382467387544726e-06, + "loss": 0.8508, + "step": 9537 + }, + { + "epoch": 4.323662737987307, + "grad_norm": 0.11139404145218706, + "learning_rate": 4.376708110576337e-06, + "loss": 0.8283, + "step": 9538 + }, + { + "epoch": 4.324116047144153, + "grad_norm": 0.1293280305129736, + "learning_rate": 4.370952401385334e-06, + "loss": 0.8463, + "step": 9539 + }, + { + "epoch": 4.324569356300997, + "grad_norm": 0.13130020864885236, + "learning_rate": 4.36520026054815e-06, + "loss": 0.8483, + "step": 9540 + }, + { + "epoch": 4.325022665457842, + "grad_norm": 0.10265416736527337, + "learning_rate": 4.359451688640901e-06, + "loss": 0.8491, + "step": 9541 + }, + { + "epoch": 4.325475974614688, + "grad_norm": 0.13911198165887353, + "learning_rate": 4.353706686239307e-06, + "loss": 0.8458, + "step": 9542 + }, + { + "epoch": 4.325929283771532, + "grad_norm": 0.13972188142753109, + "learning_rate": 4.347965253918749e-06, + "loss": 0.8518, + "step": 9543 + }, + { + "epoch": 4.326382592928377, + "grad_norm": 0.11062773221908778, + "learning_rate": 4.342227392254245e-06, + "loss": 0.8645, + "step": 9544 + }, + { + "epoch": 4.3268359020852225, + "grad_norm": 0.13825099088677043, + "learning_rate": 4.336493101820476e-06, + "loss": 0.85, + "step": 9545 + }, + { + "epoch": 4.327289211242067, + "grad_norm": 0.1152814828233211, + "learning_rate": 4.330762383191722e-06, + "loss": 0.8508, + "step": 9546 + }, + { + "epoch": 4.327742520398912, + "grad_norm": 0.1102453421761806, + "learning_rate": 4.325035236941957e-06, + "loss": 0.8524, + "step": 9547 + }, + { + "epoch": 4.328195829555757, + "grad_norm": 0.12125070409473931, + "learning_rate": 4.319311663644747e-06, + "loss": 0.8595, + "step": 9548 + }, + { + "epoch": 4.328649138712602, + "grad_norm": 0.11944307738516023, + "learning_rate": 4.313591663873351e-06, + "loss": 0.8603, + "step": 9549 + }, + { + "epoch": 4.329102447869447, + "grad_norm": 0.10231059499214536, + "learning_rate": 4.307875238200616e-06, + "loss": 0.8539, + "step": 9550 + }, + { + "epoch": 4.3295557570262915, + "grad_norm": 0.09899868849238445, + "learning_rate": 4.302162387199089e-06, + "loss": 0.8578, + "step": 9551 + }, + { + "epoch": 4.330009066183137, + "grad_norm": 0.10592381411424368, + "learning_rate": 4.296453111440908e-06, + "loss": 0.8566, + "step": 9552 + }, + { + "epoch": 4.330462375339982, + "grad_norm": 0.0999442419289302, + "learning_rate": 4.290747411497869e-06, + "loss": 0.8569, + "step": 9553 + }, + { + "epoch": 4.330915684496826, + "grad_norm": 0.10777718098666025, + "learning_rate": 4.285045287941434e-06, + "loss": 0.8589, + "step": 9554 + }, + { + "epoch": 4.331368993653672, + "grad_norm": 0.10130485030246168, + "learning_rate": 4.279346741342671e-06, + "loss": 0.8676, + "step": 9555 + }, + { + "epoch": 4.331822302810517, + "grad_norm": 0.10224806496764145, + "learning_rate": 4.2736517722723205e-06, + "loss": 0.8502, + "step": 9556 + }, + { + "epoch": 4.332275611967361, + "grad_norm": 0.09471942149170741, + "learning_rate": 4.267960381300737e-06, + "loss": 0.8575, + "step": 9557 + }, + { + "epoch": 4.332728921124207, + "grad_norm": 0.10738944572648115, + "learning_rate": 4.262272568997938e-06, + "loss": 0.8548, + "step": 9558 + }, + { + "epoch": 4.333182230281052, + "grad_norm": 0.11145823393522777, + "learning_rate": 4.256588335933569e-06, + "loss": 0.8317, + "step": 9559 + }, + { + "epoch": 4.333635539437896, + "grad_norm": 0.10349457590701949, + "learning_rate": 4.250907682676934e-06, + "loss": 0.8552, + "step": 9560 + }, + { + "epoch": 4.3340888485947415, + "grad_norm": 0.10424759426309722, + "learning_rate": 4.245230609796949e-06, + "loss": 0.8687, + "step": 9561 + }, + { + "epoch": 4.334542157751587, + "grad_norm": 0.11571571072291263, + "learning_rate": 4.239557117862219e-06, + "loss": 0.8526, + "step": 9562 + }, + { + "epoch": 4.334995466908431, + "grad_norm": 0.1014483877695179, + "learning_rate": 4.233887207440926e-06, + "loss": 0.8468, + "step": 9563 + }, + { + "epoch": 4.3354487760652765, + "grad_norm": 0.09928653867961919, + "learning_rate": 4.228220879100948e-06, + "loss": 0.8384, + "step": 9564 + }, + { + "epoch": 4.335902085222122, + "grad_norm": 0.11070834729873168, + "learning_rate": 4.222558133409771e-06, + "loss": 0.8616, + "step": 9565 + }, + { + "epoch": 4.336355394378966, + "grad_norm": 0.11669036920271618, + "learning_rate": 4.216898970934557e-06, + "loss": 0.8524, + "step": 9566 + }, + { + "epoch": 4.336808703535811, + "grad_norm": 0.1029470893360332, + "learning_rate": 4.211243392242072e-06, + "loss": 0.8375, + "step": 9567 + }, + { + "epoch": 4.337262012692657, + "grad_norm": 0.11296557078979254, + "learning_rate": 4.2055913978987384e-06, + "loss": 0.8558, + "step": 9568 + }, + { + "epoch": 4.337715321849501, + "grad_norm": 0.09849117977114359, + "learning_rate": 4.199942988470631e-06, + "loss": 0.8572, + "step": 9569 + }, + { + "epoch": 4.338168631006346, + "grad_norm": 0.10650613882222244, + "learning_rate": 4.1942981645234404e-06, + "loss": 0.8403, + "step": 9570 + }, + { + "epoch": 4.338621940163192, + "grad_norm": 0.09972857620497053, + "learning_rate": 4.188656926622528e-06, + "loss": 0.8647, + "step": 9571 + }, + { + "epoch": 4.339075249320036, + "grad_norm": 0.10398467325695201, + "learning_rate": 4.183019275332862e-06, + "loss": 0.862, + "step": 9572 + }, + { + "epoch": 4.339528558476881, + "grad_norm": 0.10165620157159033, + "learning_rate": 4.1773852112190875e-06, + "loss": 0.8531, + "step": 9573 + }, + { + "epoch": 4.3399818676337265, + "grad_norm": 0.09603270807290708, + "learning_rate": 4.1717547348454566e-06, + "loss": 0.8449, + "step": 9574 + }, + { + "epoch": 4.340435176790571, + "grad_norm": 0.10599935832801761, + "learning_rate": 4.166127846775893e-06, + "loss": 0.8703, + "step": 9575 + }, + { + "epoch": 4.340888485947416, + "grad_norm": 0.10404175868037398, + "learning_rate": 4.160504547573938e-06, + "loss": 0.8571, + "step": 9576 + }, + { + "epoch": 4.341341795104261, + "grad_norm": 0.09893859593761839, + "learning_rate": 4.154884837802784e-06, + "loss": 0.8461, + "step": 9577 + }, + { + "epoch": 4.341795104261106, + "grad_norm": 0.10600443402056736, + "learning_rate": 4.149268718025261e-06, + "loss": 0.8505, + "step": 9578 + }, + { + "epoch": 4.342248413417951, + "grad_norm": 0.10093812164602242, + "learning_rate": 4.143656188803835e-06, + "loss": 0.8509, + "step": 9579 + }, + { + "epoch": 4.342701722574796, + "grad_norm": 0.10688816955129887, + "learning_rate": 4.138047250700629e-06, + "loss": 0.864, + "step": 9580 + }, + { + "epoch": 4.343155031731641, + "grad_norm": 0.10858618024878669, + "learning_rate": 4.132441904277382e-06, + "loss": 0.8471, + "step": 9581 + }, + { + "epoch": 4.343608340888486, + "grad_norm": 0.10059703250552245, + "learning_rate": 4.126840150095488e-06, + "loss": 0.8402, + "step": 9582 + }, + { + "epoch": 4.344061650045331, + "grad_norm": 0.09986628807270491, + "learning_rate": 4.121241988715992e-06, + "loss": 0.8384, + "step": 9583 + }, + { + "epoch": 4.344514959202176, + "grad_norm": 0.10508639722175796, + "learning_rate": 4.1156474206995515e-06, + "loss": 0.8431, + "step": 9584 + }, + { + "epoch": 4.344968268359021, + "grad_norm": 0.091319701351002, + "learning_rate": 4.110056446606479e-06, + "loss": 0.8488, + "step": 9585 + }, + { + "epoch": 4.345421577515866, + "grad_norm": 0.1159813804809012, + "learning_rate": 4.104469066996739e-06, + "loss": 0.8435, + "step": 9586 + }, + { + "epoch": 4.345874886672711, + "grad_norm": 0.11410085399347059, + "learning_rate": 4.098885282429908e-06, + "loss": 0.8665, + "step": 9587 + }, + { + "epoch": 4.346328195829556, + "grad_norm": 0.0939155509126875, + "learning_rate": 4.093305093465239e-06, + "loss": 0.8405, + "step": 9588 + }, + { + "epoch": 4.346781504986401, + "grad_norm": 0.11683616940080482, + "learning_rate": 4.087728500661579e-06, + "loss": 0.8537, + "step": 9589 + }, + { + "epoch": 4.3472348141432455, + "grad_norm": 0.31087960444833435, + "learning_rate": 4.082155504577467e-06, + "loss": 0.8786, + "step": 9590 + }, + { + "epoch": 4.347688123300091, + "grad_norm": 0.10919587748557662, + "learning_rate": 4.076586105771032e-06, + "loss": 0.8479, + "step": 9591 + }, + { + "epoch": 4.348141432456936, + "grad_norm": 0.11173245953328728, + "learning_rate": 4.071020304800084e-06, + "loss": 0.851, + "step": 9592 + }, + { + "epoch": 4.3485947416137805, + "grad_norm": 0.11307306524317325, + "learning_rate": 4.065458102222044e-06, + "loss": 0.8488, + "step": 9593 + }, + { + "epoch": 4.349048050770626, + "grad_norm": 0.3602448715999201, + "learning_rate": 4.059899498593978e-06, + "loss": 0.862, + "step": 9594 + }, + { + "epoch": 4.349501359927471, + "grad_norm": 0.09847950734987641, + "learning_rate": 4.054344494472608e-06, + "loss": 0.858, + "step": 9595 + }, + { + "epoch": 4.349954669084315, + "grad_norm": 0.1141584765014094, + "learning_rate": 4.048793090414278e-06, + "loss": 0.853, + "step": 9596 + }, + { + "epoch": 4.350407978241161, + "grad_norm": 0.10687667359826115, + "learning_rate": 4.043245286974981e-06, + "loss": 0.8466, + "step": 9597 + }, + { + "epoch": 4.350861287398005, + "grad_norm": 0.0990372389453144, + "learning_rate": 4.037701084710337e-06, + "loss": 0.865, + "step": 9598 + }, + { + "epoch": 4.35131459655485, + "grad_norm": 0.11640275434954254, + "learning_rate": 4.032160484175629e-06, + "loss": 0.8639, + "step": 9599 + }, + { + "epoch": 4.351767905711696, + "grad_norm": 0.10415461419773817, + "learning_rate": 4.0266234859257554e-06, + "loss": 0.8606, + "step": 9600 + }, + { + "epoch": 4.35222121486854, + "grad_norm": 0.12893834862469303, + "learning_rate": 4.021090090515265e-06, + "loss": 0.8531, + "step": 9601 + }, + { + "epoch": 4.352674524025385, + "grad_norm": 0.10611031172750578, + "learning_rate": 4.015560298498331e-06, + "loss": 0.8334, + "step": 9602 + }, + { + "epoch": 4.3531278331822305, + "grad_norm": 0.11382673361626583, + "learning_rate": 4.0100341104288e-06, + "loss": 0.847, + "step": 9603 + }, + { + "epoch": 4.353581142339075, + "grad_norm": 0.12020468017152526, + "learning_rate": 4.004511526860118e-06, + "loss": 0.8548, + "step": 9604 + }, + { + "epoch": 4.35403445149592, + "grad_norm": 0.0935731091970512, + "learning_rate": 3.998992548345402e-06, + "loss": 0.8363, + "step": 9605 + }, + { + "epoch": 4.354487760652765, + "grad_norm": 0.09844588795308024, + "learning_rate": 3.993477175437379e-06, + "loss": 0.8575, + "step": 9606 + }, + { + "epoch": 4.35494106980961, + "grad_norm": 0.11928770929863536, + "learning_rate": 3.987965408688448e-06, + "loss": 0.8651, + "step": 9607 + }, + { + "epoch": 4.355394378966455, + "grad_norm": 0.12285158059445621, + "learning_rate": 3.9824572486506106e-06, + "loss": 0.8545, + "step": 9608 + }, + { + "epoch": 4.3558476881233, + "grad_norm": 0.10257203853975304, + "learning_rate": 3.9769526958755375e-06, + "loss": 0.845, + "step": 9609 + }, + { + "epoch": 4.356300997280145, + "grad_norm": 0.10556115142264431, + "learning_rate": 3.971451750914526e-06, + "loss": 0.8905, + "step": 9610 + }, + { + "epoch": 4.35675430643699, + "grad_norm": 0.10613225750579079, + "learning_rate": 3.965954414318498e-06, + "loss": 0.8483, + "step": 9611 + }, + { + "epoch": 4.357207615593835, + "grad_norm": 0.10571964983793551, + "learning_rate": 3.960460686638046e-06, + "loss": 0.8678, + "step": 9612 + }, + { + "epoch": 4.35766092475068, + "grad_norm": 0.10357316862065369, + "learning_rate": 3.954970568423364e-06, + "loss": 0.8564, + "step": 9613 + }, + { + "epoch": 4.358114233907525, + "grad_norm": 0.10098229263870728, + "learning_rate": 3.949484060224324e-06, + "loss": 0.8619, + "step": 9614 + }, + { + "epoch": 4.35856754306437, + "grad_norm": 0.09223865882708672, + "learning_rate": 3.9440011625903985e-06, + "loss": 0.8621, + "step": 9615 + }, + { + "epoch": 4.359020852221215, + "grad_norm": 0.0949731744289951, + "learning_rate": 3.938521876070729e-06, + "loss": 0.8454, + "step": 9616 + }, + { + "epoch": 4.35947416137806, + "grad_norm": 0.10529520079764994, + "learning_rate": 3.933046201214073e-06, + "loss": 0.8684, + "step": 9617 + }, + { + "epoch": 4.359927470534905, + "grad_norm": 0.1013921295258887, + "learning_rate": 3.927574138568844e-06, + "loss": 0.8487, + "step": 9618 + }, + { + "epoch": 4.3603807796917495, + "grad_norm": 0.10512189383464561, + "learning_rate": 3.92210568868308e-06, + "loss": 0.8641, + "step": 9619 + }, + { + "epoch": 4.360834088848595, + "grad_norm": 0.10144885221817355, + "learning_rate": 3.916640852104459e-06, + "loss": 0.8692, + "step": 9620 + }, + { + "epoch": 4.36128739800544, + "grad_norm": 0.10399656601674749, + "learning_rate": 3.911179629380297e-06, + "loss": 0.8575, + "step": 9621 + }, + { + "epoch": 4.3617407071622845, + "grad_norm": 0.10221345684263244, + "learning_rate": 3.905722021057568e-06, + "loss": 0.8584, + "step": 9622 + }, + { + "epoch": 4.36219401631913, + "grad_norm": 0.10571118812991671, + "learning_rate": 3.90026802768285e-06, + "loss": 0.8385, + "step": 9623 + }, + { + "epoch": 4.362647325475975, + "grad_norm": 0.09533912604211935, + "learning_rate": 3.894817649802391e-06, + "loss": 0.8608, + "step": 9624 + }, + { + "epoch": 4.363100634632819, + "grad_norm": 0.09564212435108743, + "learning_rate": 3.889370887962059e-06, + "loss": 0.8561, + "step": 9625 + }, + { + "epoch": 4.363553943789665, + "grad_norm": 0.10121211777560803, + "learning_rate": 3.883927742707347e-06, + "loss": 0.8378, + "step": 9626 + }, + { + "epoch": 4.36400725294651, + "grad_norm": 0.10278931339176604, + "learning_rate": 3.878488214583427e-06, + "loss": 0.8451, + "step": 9627 + }, + { + "epoch": 4.364460562103354, + "grad_norm": 0.10225162012214147, + "learning_rate": 3.873052304135061e-06, + "loss": 0.854, + "step": 9628 + }, + { + "epoch": 4.3649138712602, + "grad_norm": 0.10042802612294256, + "learning_rate": 3.867620011906689e-06, + "loss": 0.8665, + "step": 9629 + }, + { + "epoch": 4.365367180417044, + "grad_norm": 0.1170436312884319, + "learning_rate": 3.862191338442353e-06, + "loss": 0.8559, + "step": 9630 + }, + { + "epoch": 4.365820489573889, + "grad_norm": 0.10029886969138437, + "learning_rate": 3.856766284285778e-06, + "loss": 0.8466, + "step": 9631 + }, + { + "epoch": 4.3662737987307345, + "grad_norm": 0.10301208871129991, + "learning_rate": 3.85134484998027e-06, + "loss": 0.8558, + "step": 9632 + }, + { + "epoch": 4.366727107887579, + "grad_norm": 0.0972354530097949, + "learning_rate": 3.8459270360688216e-06, + "loss": 0.8304, + "step": 9633 + }, + { + "epoch": 4.367180417044424, + "grad_norm": 0.11576614932779043, + "learning_rate": 3.840512843094027e-06, + "loss": 0.8655, + "step": 9634 + }, + { + "epoch": 4.367633726201269, + "grad_norm": 0.10820969747774425, + "learning_rate": 3.835102271598152e-06, + "loss": 0.8358, + "step": 9635 + }, + { + "epoch": 4.368087035358114, + "grad_norm": 0.11014685634309787, + "learning_rate": 3.8296953221230685e-06, + "loss": 0.8502, + "step": 9636 + }, + { + "epoch": 4.368540344514959, + "grad_norm": 0.10277250349913092, + "learning_rate": 3.824291995210296e-06, + "loss": 0.8393, + "step": 9637 + }, + { + "epoch": 4.368993653671804, + "grad_norm": 0.09626402422812319, + "learning_rate": 3.818892291401004e-06, + "loss": 0.8625, + "step": 9638 + }, + { + "epoch": 4.369446962828649, + "grad_norm": 0.1014550351989499, + "learning_rate": 3.8134962112359854e-06, + "loss": 0.8503, + "step": 9639 + }, + { + "epoch": 4.369900271985494, + "grad_norm": 0.09762871315857073, + "learning_rate": 3.80810375525567e-06, + "loss": 0.838, + "step": 9640 + }, + { + "epoch": 4.370353581142339, + "grad_norm": 0.09763424013515468, + "learning_rate": 3.8027149240001194e-06, + "loss": 0.8594, + "step": 9641 + }, + { + "epoch": 4.370806890299184, + "grad_norm": 0.09957803124016645, + "learning_rate": 3.7973297180090616e-06, + "loss": 0.8461, + "step": 9642 + }, + { + "epoch": 4.371260199456029, + "grad_norm": 0.10900531023102537, + "learning_rate": 3.7919481378218258e-06, + "loss": 0.8686, + "step": 9643 + }, + { + "epoch": 4.371713508612874, + "grad_norm": 0.09429192682357267, + "learning_rate": 3.7865701839773983e-06, + "loss": 0.8433, + "step": 9644 + }, + { + "epoch": 4.372166817769719, + "grad_norm": 0.09938614097060985, + "learning_rate": 3.7811958570143924e-06, + "loss": 0.8854, + "step": 9645 + }, + { + "epoch": 4.372620126926564, + "grad_norm": 0.10078764935592188, + "learning_rate": 3.7758251574710757e-06, + "loss": 0.8561, + "step": 9646 + }, + { + "epoch": 4.373073436083409, + "grad_norm": 0.09299997983013367, + "learning_rate": 3.77045808588532e-06, + "loss": 0.8712, + "step": 9647 + }, + { + "epoch": 4.3735267452402535, + "grad_norm": 0.09253436876123078, + "learning_rate": 3.765094642794673e-06, + "loss": 0.8429, + "step": 9648 + }, + { + "epoch": 4.373980054397099, + "grad_norm": 0.09596303193399927, + "learning_rate": 3.7597348287362833e-06, + "loss": 0.8325, + "step": 9649 + }, + { + "epoch": 4.374433363553944, + "grad_norm": 0.10563443289604134, + "learning_rate": 3.754378644246961e-06, + "loss": 0.8385, + "step": 9650 + }, + { + "epoch": 4.3748866727107885, + "grad_norm": 0.12004756884244323, + "learning_rate": 3.7490260898631483e-06, + "loss": 0.8412, + "step": 9651 + }, + { + "epoch": 4.375339981867634, + "grad_norm": 0.1145040499860451, + "learning_rate": 3.7436771661209e-06, + "loss": 0.8451, + "step": 9652 + }, + { + "epoch": 4.375793291024479, + "grad_norm": 0.10029331501191613, + "learning_rate": 3.7383318735559494e-06, + "loss": 0.8492, + "step": 9653 + }, + { + "epoch": 4.376246600181323, + "grad_norm": 0.09695323710686157, + "learning_rate": 3.732990212703622e-06, + "loss": 0.8724, + "step": 9654 + }, + { + "epoch": 4.376699909338169, + "grad_norm": 0.12367949002089236, + "learning_rate": 3.727652184098922e-06, + "loss": 0.8695, + "step": 9655 + }, + { + "epoch": 4.377153218495014, + "grad_norm": 0.10839472585054065, + "learning_rate": 3.7223177882764483e-06, + "loss": 0.8455, + "step": 9656 + }, + { + "epoch": 4.377606527651858, + "grad_norm": 0.09725569739440772, + "learning_rate": 3.7169870257704756e-06, + "loss": 0.8499, + "step": 9657 + }, + { + "epoch": 4.378059836808704, + "grad_norm": 0.09301171824587998, + "learning_rate": 3.711659897114883e-06, + "loss": 0.853, + "step": 9658 + }, + { + "epoch": 4.378513145965549, + "grad_norm": 0.10158812375584579, + "learning_rate": 3.706336402843205e-06, + "loss": 0.8554, + "step": 9659 + }, + { + "epoch": 4.378966455122393, + "grad_norm": 0.1022033921262482, + "learning_rate": 3.7010165434885913e-06, + "loss": 0.855, + "step": 9660 + }, + { + "epoch": 4.3794197642792385, + "grad_norm": 0.09176198435582744, + "learning_rate": 3.6957003195838615e-06, + "loss": 0.8545, + "step": 9661 + }, + { + "epoch": 4.379873073436084, + "grad_norm": 0.09236450894310463, + "learning_rate": 3.690387731661429e-06, + "loss": 0.8408, + "step": 9662 + }, + { + "epoch": 4.380326382592928, + "grad_norm": 0.10110102225560902, + "learning_rate": 3.6850787802533884e-06, + "loss": 0.8629, + "step": 9663 + }, + { + "epoch": 4.380779691749773, + "grad_norm": 0.10505161601451171, + "learning_rate": 3.67977346589143e-06, + "loss": 0.8527, + "step": 9664 + }, + { + "epoch": 4.381233000906619, + "grad_norm": 0.09366175611523732, + "learning_rate": 3.674471789106906e-06, + "loss": 0.8642, + "step": 9665 + }, + { + "epoch": 4.381686310063463, + "grad_norm": 0.1049154822384733, + "learning_rate": 3.6691737504307923e-06, + "loss": 0.8555, + "step": 9666 + }, + { + "epoch": 4.382139619220308, + "grad_norm": 0.10503417843245928, + "learning_rate": 3.6638793503936954e-06, + "loss": 0.8513, + "step": 9667 + }, + { + "epoch": 4.382592928377154, + "grad_norm": 0.0972081448856853, + "learning_rate": 3.6585885895258753e-06, + "loss": 0.853, + "step": 9668 + }, + { + "epoch": 4.383046237533998, + "grad_norm": 0.10266601308681904, + "learning_rate": 3.6533014683572066e-06, + "loss": 0.8418, + "step": 9669 + }, + { + "epoch": 4.383499546690843, + "grad_norm": 0.11038921270927661, + "learning_rate": 3.6480179874172296e-06, + "loss": 0.8609, + "step": 9670 + }, + { + "epoch": 4.383952855847689, + "grad_norm": 0.10551611496433301, + "learning_rate": 3.6427381472350765e-06, + "loss": 0.8633, + "step": 9671 + }, + { + "epoch": 4.384406165004533, + "grad_norm": 0.10219895890471832, + "learning_rate": 3.6374619483395555e-06, + "loss": 0.8617, + "step": 9672 + }, + { + "epoch": 4.384859474161378, + "grad_norm": 0.11124512844034248, + "learning_rate": 3.6321893912590844e-06, + "loss": 0.8558, + "step": 9673 + }, + { + "epoch": 4.3853127833182235, + "grad_norm": 0.10714297920888102, + "learning_rate": 3.6269204765217334e-06, + "loss": 0.8595, + "step": 9674 + }, + { + "epoch": 4.385766092475068, + "grad_norm": 0.10227365861525334, + "learning_rate": 3.6216552046551877e-06, + "loss": 0.8633, + "step": 9675 + }, + { + "epoch": 4.386219401631913, + "grad_norm": 0.10477502909275378, + "learning_rate": 3.6163935761867943e-06, + "loss": 0.847, + "step": 9676 + }, + { + "epoch": 4.3866727107887575, + "grad_norm": 0.10654321595803819, + "learning_rate": 3.6111355916435177e-06, + "loss": 0.8494, + "step": 9677 + }, + { + "epoch": 4.387126019945603, + "grad_norm": 0.10146837981936041, + "learning_rate": 3.6058812515519503e-06, + "loss": 0.8564, + "step": 9678 + }, + { + "epoch": 4.387579329102448, + "grad_norm": 0.10184226806034816, + "learning_rate": 3.600630556438334e-06, + "loss": 0.8515, + "step": 9679 + }, + { + "epoch": 4.3880326382592925, + "grad_norm": 0.10407372481035414, + "learning_rate": 3.595383506828549e-06, + "loss": 0.8634, + "step": 9680 + }, + { + "epoch": 4.388485947416138, + "grad_norm": 0.09397569392939616, + "learning_rate": 3.5901401032480964e-06, + "loss": 0.8619, + "step": 9681 + }, + { + "epoch": 4.388939256572983, + "grad_norm": 0.11944355783128183, + "learning_rate": 3.584900346222111e-06, + "loss": 0.8519, + "step": 9682 + }, + { + "epoch": 4.389392565729827, + "grad_norm": 0.09765326460821437, + "learning_rate": 3.5796642362753866e-06, + "loss": 0.8467, + "step": 9683 + }, + { + "epoch": 4.389845874886673, + "grad_norm": 0.10466545452358128, + "learning_rate": 3.5744317739323164e-06, + "loss": 0.8657, + "step": 9684 + }, + { + "epoch": 4.390299184043518, + "grad_norm": 0.11032922103090496, + "learning_rate": 3.5692029597169663e-06, + "loss": 0.8576, + "step": 9685 + }, + { + "epoch": 4.390752493200362, + "grad_norm": 0.10521645889673213, + "learning_rate": 3.563977794153002e-06, + "loss": 0.8476, + "step": 9686 + }, + { + "epoch": 4.391205802357208, + "grad_norm": 0.10290647419921253, + "learning_rate": 3.5587562777637508e-06, + "loss": 0.8362, + "step": 9687 + }, + { + "epoch": 4.391659111514053, + "grad_norm": 0.11708808943113638, + "learning_rate": 3.5535384110721503e-06, + "loss": 0.8488, + "step": 9688 + }, + { + "epoch": 4.392112420670897, + "grad_norm": 0.1096964468349136, + "learning_rate": 3.5483241946008008e-06, + "loss": 0.8459, + "step": 9689 + }, + { + "epoch": 4.3925657298277425, + "grad_norm": 0.10557024135905096, + "learning_rate": 3.5431136288719013e-06, + "loss": 0.8522, + "step": 9690 + }, + { + "epoch": 4.393019038984588, + "grad_norm": 0.10496572702982866, + "learning_rate": 3.537906714407329e-06, + "loss": 0.8399, + "step": 9691 + }, + { + "epoch": 4.393472348141432, + "grad_norm": 0.10540550528575668, + "learning_rate": 3.532703451728554e-06, + "loss": 0.8543, + "step": 9692 + }, + { + "epoch": 4.393925657298277, + "grad_norm": 0.10534878257029277, + "learning_rate": 3.5275038413566987e-06, + "loss": 0.8385, + "step": 9693 + }, + { + "epoch": 4.394378966455123, + "grad_norm": 0.10677609099396294, + "learning_rate": 3.5223078838125325e-06, + "loss": 0.8458, + "step": 9694 + }, + { + "epoch": 4.394832275611967, + "grad_norm": 0.10950861727377062, + "learning_rate": 3.5171155796164304e-06, + "loss": 0.8367, + "step": 9695 + }, + { + "epoch": 4.395285584768812, + "grad_norm": 0.10253953394513723, + "learning_rate": 3.5119269292884384e-06, + "loss": 0.8832, + "step": 9696 + }, + { + "epoch": 4.395738893925658, + "grad_norm": 0.16120492342864598, + "learning_rate": 3.5067419333481854e-06, + "loss": 0.8561, + "step": 9697 + }, + { + "epoch": 4.396192203082502, + "grad_norm": 0.13383650953453813, + "learning_rate": 3.50156059231499e-06, + "loss": 0.8461, + "step": 9698 + }, + { + "epoch": 4.396645512239347, + "grad_norm": 0.11407210493642476, + "learning_rate": 3.4963829067077605e-06, + "loss": 0.8707, + "step": 9699 + }, + { + "epoch": 4.397098821396193, + "grad_norm": 0.1090810604457818, + "learning_rate": 3.491208877045069e-06, + "loss": 0.857, + "step": 9700 + }, + { + "epoch": 4.397552130553037, + "grad_norm": 0.11070273732108292, + "learning_rate": 3.4860385038451015e-06, + "loss": 0.8561, + "step": 9701 + }, + { + "epoch": 4.398005439709882, + "grad_norm": 0.11323941356089022, + "learning_rate": 3.4808717876256925e-06, + "loss": 0.8367, + "step": 9702 + }, + { + "epoch": 4.3984587488667275, + "grad_norm": 0.12183104151503595, + "learning_rate": 3.475708728904299e-06, + "loss": 0.8483, + "step": 9703 + }, + { + "epoch": 4.398912058023572, + "grad_norm": 0.10232224326616429, + "learning_rate": 3.4705493281980273e-06, + "loss": 0.8565, + "step": 9704 + }, + { + "epoch": 4.399365367180417, + "grad_norm": 0.11340582586219225, + "learning_rate": 3.4653935860235887e-06, + "loss": 0.847, + "step": 9705 + }, + { + "epoch": 4.399818676337262, + "grad_norm": 0.13421251813075455, + "learning_rate": 3.4602415028973657e-06, + "loss": 0.8526, + "step": 9706 + }, + { + "epoch": 4.400271985494107, + "grad_norm": 0.09827160036045486, + "learning_rate": 3.455093079335341e-06, + "loss": 0.8517, + "step": 9707 + }, + { + "epoch": 4.400725294650952, + "grad_norm": 0.10082612529278165, + "learning_rate": 3.449948315853142e-06, + "loss": 0.8634, + "step": 9708 + }, + { + "epoch": 4.4011786038077965, + "grad_norm": 0.10212867439306983, + "learning_rate": 3.444807212966046e-06, + "loss": 0.8498, + "step": 9709 + }, + { + "epoch": 4.401631912964642, + "grad_norm": 0.10347318213558855, + "learning_rate": 3.4396697711889337e-06, + "loss": 0.8617, + "step": 9710 + }, + { + "epoch": 4.402085222121487, + "grad_norm": 0.0969533238072967, + "learning_rate": 3.4345359910363497e-06, + "loss": 0.8553, + "step": 9711 + }, + { + "epoch": 4.402538531278331, + "grad_norm": 0.10458744504304496, + "learning_rate": 3.4294058730224423e-06, + "loss": 0.8547, + "step": 9712 + }, + { + "epoch": 4.402991840435177, + "grad_norm": 0.10655371484797438, + "learning_rate": 3.424279417661023e-06, + "loss": 0.8604, + "step": 9713 + }, + { + "epoch": 4.403445149592022, + "grad_norm": 0.09760931084188945, + "learning_rate": 3.419156625465507e-06, + "loss": 0.8607, + "step": 9714 + }, + { + "epoch": 4.403898458748866, + "grad_norm": 0.10307216854059376, + "learning_rate": 3.4140374969489788e-06, + "loss": 0.8519, + "step": 9715 + }, + { + "epoch": 4.404351767905712, + "grad_norm": 0.10300644759534751, + "learning_rate": 3.408922032624107e-06, + "loss": 0.8664, + "step": 9716 + }, + { + "epoch": 4.404805077062557, + "grad_norm": 0.1021089686946012, + "learning_rate": 3.4038102330032374e-06, + "loss": 0.8614, + "step": 9717 + }, + { + "epoch": 4.405258386219401, + "grad_norm": 0.09219312569995125, + "learning_rate": 3.398702098598321e-06, + "loss": 0.8676, + "step": 9718 + }, + { + "epoch": 4.4057116953762465, + "grad_norm": 0.10840620595136531, + "learning_rate": 3.3935976299209617e-06, + "loss": 0.8682, + "step": 9719 + }, + { + "epoch": 4.406165004533092, + "grad_norm": 0.09915586977050005, + "learning_rate": 3.3884968274823813e-06, + "loss": 0.8586, + "step": 9720 + }, + { + "epoch": 4.406618313689936, + "grad_norm": 0.08805013416910085, + "learning_rate": 3.383399691793452e-06, + "loss": 0.8434, + "step": 9721 + }, + { + "epoch": 4.4070716228467814, + "grad_norm": 0.09550273455073427, + "learning_rate": 3.3783062233646533e-06, + "loss": 0.8768, + "step": 9722 + }, + { + "epoch": 4.407524932003627, + "grad_norm": 0.11032916196539841, + "learning_rate": 3.3732164227061115e-06, + "loss": 0.8569, + "step": 9723 + }, + { + "epoch": 4.407978241160471, + "grad_norm": 0.10387542047906377, + "learning_rate": 3.3681302903275912e-06, + "loss": 0.8563, + "step": 9724 + }, + { + "epoch": 4.408431550317316, + "grad_norm": 0.09655508271421018, + "learning_rate": 3.3630478267384813e-06, + "loss": 0.8527, + "step": 9725 + }, + { + "epoch": 4.408884859474162, + "grad_norm": 0.09399914194021225, + "learning_rate": 3.35796903244781e-06, + "loss": 0.8686, + "step": 9726 + }, + { + "epoch": 4.409338168631006, + "grad_norm": 0.09354279900280596, + "learning_rate": 3.352893907964219e-06, + "loss": 0.8439, + "step": 9727 + }, + { + "epoch": 4.409791477787851, + "grad_norm": 0.110312309089862, + "learning_rate": 3.347822453796017e-06, + "loss": 0.8585, + "step": 9728 + }, + { + "epoch": 4.410244786944697, + "grad_norm": 0.11171176520397486, + "learning_rate": 3.342754670451105e-06, + "loss": 0.8548, + "step": 9729 + }, + { + "epoch": 4.410698096101541, + "grad_norm": 0.09355483348094079, + "learning_rate": 3.337690558437059e-06, + "loss": 0.8546, + "step": 9730 + }, + { + "epoch": 4.411151405258386, + "grad_norm": 0.09966377054999442, + "learning_rate": 3.3326301182610376e-06, + "loss": 0.8335, + "step": 9731 + }, + { + "epoch": 4.4116047144152315, + "grad_norm": 0.10065239198130362, + "learning_rate": 3.3275733504298847e-06, + "loss": 0.8632, + "step": 9732 + }, + { + "epoch": 4.412058023572076, + "grad_norm": 0.09492132128863065, + "learning_rate": 3.322520255450039e-06, + "loss": 0.8469, + "step": 9733 + }, + { + "epoch": 4.412511332728921, + "grad_norm": 0.09624011061718196, + "learning_rate": 3.317470833827585e-06, + "loss": 0.8716, + "step": 9734 + }, + { + "epoch": 4.412964641885766, + "grad_norm": 0.10879379030073634, + "learning_rate": 3.312425086068225e-06, + "loss": 0.8664, + "step": 9735 + }, + { + "epoch": 4.413417951042611, + "grad_norm": 0.09849740995088774, + "learning_rate": 3.307383012677323e-06, + "loss": 0.8378, + "step": 9736 + }, + { + "epoch": 4.413871260199456, + "grad_norm": 0.09259670797151053, + "learning_rate": 3.302344614159849e-06, + "loss": 0.8387, + "step": 9737 + }, + { + "epoch": 4.414324569356301, + "grad_norm": 0.09882535145272149, + "learning_rate": 3.297309891020408e-06, + "loss": 0.84, + "step": 9738 + }, + { + "epoch": 4.414777878513146, + "grad_norm": 0.10338550187133277, + "learning_rate": 3.292278843763255e-06, + "loss": 0.8582, + "step": 9739 + }, + { + "epoch": 4.415231187669991, + "grad_norm": 0.10665813664268832, + "learning_rate": 3.2872514728922522e-06, + "loss": 0.868, + "step": 9740 + }, + { + "epoch": 4.415684496826836, + "grad_norm": 0.09340841740520173, + "learning_rate": 3.282227778910918e-06, + "loss": 0.8381, + "step": 9741 + }, + { + "epoch": 4.416137805983681, + "grad_norm": 0.09705487098008145, + "learning_rate": 3.277207762322374e-06, + "loss": 0.8719, + "step": 9742 + }, + { + "epoch": 4.416591115140526, + "grad_norm": 0.11035354932311331, + "learning_rate": 3.2721914236294095e-06, + "loss": 0.8736, + "step": 9743 + }, + { + "epoch": 4.417044424297371, + "grad_norm": 0.09273209438526024, + "learning_rate": 3.2671787633344043e-06, + "loss": 0.8698, + "step": 9744 + }, + { + "epoch": 4.417497733454216, + "grad_norm": 0.09290992494930833, + "learning_rate": 3.2621697819394106e-06, + "loss": 0.8467, + "step": 9745 + }, + { + "epoch": 4.417951042611061, + "grad_norm": 0.10322854575396648, + "learning_rate": 3.2571644799460756e-06, + "loss": 0.863, + "step": 9746 + }, + { + "epoch": 4.418404351767906, + "grad_norm": 0.10560106991347797, + "learning_rate": 3.252162857855714e-06, + "loss": 0.847, + "step": 9747 + }, + { + "epoch": 4.4188576609247505, + "grad_norm": 0.08893353610797623, + "learning_rate": 3.2471649161692366e-06, + "loss": 0.8649, + "step": 9748 + }, + { + "epoch": 4.419310970081596, + "grad_norm": 0.09435768705610957, + "learning_rate": 3.2421706553872067e-06, + "loss": 0.8564, + "step": 9749 + }, + { + "epoch": 4.419764279238441, + "grad_norm": 0.09767015891797486, + "learning_rate": 3.237180076009816e-06, + "loss": 0.8598, + "step": 9750 + }, + { + "epoch": 4.4202175883952854, + "grad_norm": 0.09739244122989335, + "learning_rate": 3.2321931785368823e-06, + "loss": 0.8463, + "step": 9751 + }, + { + "epoch": 4.420670897552131, + "grad_norm": 0.09403201410654198, + "learning_rate": 3.2272099634678633e-06, + "loss": 0.8408, + "step": 9752 + }, + { + "epoch": 4.421124206708976, + "grad_norm": 0.08746916890168699, + "learning_rate": 3.222230431301836e-06, + "loss": 0.8414, + "step": 9753 + }, + { + "epoch": 4.42157751586582, + "grad_norm": 0.1003914419937456, + "learning_rate": 3.21725458253753e-06, + "loss": 0.8574, + "step": 9754 + }, + { + "epoch": 4.422030825022666, + "grad_norm": 0.09302141802108978, + "learning_rate": 3.212282417673267e-06, + "loss": 0.8263, + "step": 9755 + }, + { + "epoch": 4.42248413417951, + "grad_norm": 0.09921741361103302, + "learning_rate": 3.207313937207044e-06, + "loss": 0.846, + "step": 9756 + }, + { + "epoch": 4.422937443336355, + "grad_norm": 0.10279309221297517, + "learning_rate": 3.202349141636454e-06, + "loss": 0.8717, + "step": 9757 + }, + { + "epoch": 4.423390752493201, + "grad_norm": 0.09081468215181218, + "learning_rate": 3.1973880314587526e-06, + "loss": 0.8458, + "step": 9758 + }, + { + "epoch": 4.423844061650045, + "grad_norm": 0.10504504427488492, + "learning_rate": 3.1924306071707956e-06, + "loss": 0.8455, + "step": 9759 + }, + { + "epoch": 4.42429737080689, + "grad_norm": 0.09960677439342813, + "learning_rate": 3.187476869269093e-06, + "loss": 0.8452, + "step": 9760 + }, + { + "epoch": 4.4247506799637355, + "grad_norm": 0.09460657129411111, + "learning_rate": 3.1825268182497626e-06, + "loss": 0.8297, + "step": 9761 + }, + { + "epoch": 4.42520398912058, + "grad_norm": 0.09794971111919408, + "learning_rate": 3.1775804546085863e-06, + "loss": 0.8328, + "step": 9762 + }, + { + "epoch": 4.425657298277425, + "grad_norm": 0.08995589342561698, + "learning_rate": 3.1726377788409457e-06, + "loss": 0.8461, + "step": 9763 + }, + { + "epoch": 4.42611060743427, + "grad_norm": 0.11561130514557028, + "learning_rate": 3.167698791441858e-06, + "loss": 0.8531, + "step": 9764 + }, + { + "epoch": 4.426563916591115, + "grad_norm": 0.10464441456630647, + "learning_rate": 3.162763492905994e-06, + "loss": 0.8425, + "step": 9765 + }, + { + "epoch": 4.42701722574796, + "grad_norm": 0.09827920232204179, + "learning_rate": 3.1578318837276155e-06, + "loss": 0.8464, + "step": 9766 + }, + { + "epoch": 4.427470534904805, + "grad_norm": 0.08641858399968314, + "learning_rate": 3.152903964400662e-06, + "loss": 0.832, + "step": 9767 + }, + { + "epoch": 4.42792384406165, + "grad_norm": 0.10023147677261487, + "learning_rate": 3.1479797354186626e-06, + "loss": 0.8515, + "step": 9768 + }, + { + "epoch": 4.428377153218495, + "grad_norm": 0.11113042510729931, + "learning_rate": 3.1430591972748047e-06, + "loss": 0.8595, + "step": 9769 + }, + { + "epoch": 4.42883046237534, + "grad_norm": 0.10227864284096486, + "learning_rate": 3.1381423504618813e-06, + "loss": 0.8396, + "step": 9770 + }, + { + "epoch": 4.429283771532185, + "grad_norm": 0.09691220505620973, + "learning_rate": 3.1332291954723425e-06, + "loss": 0.8507, + "step": 9771 + }, + { + "epoch": 4.42973708068903, + "grad_norm": 0.09680666034953203, + "learning_rate": 3.128319732798244e-06, + "loss": 0.8477, + "step": 9772 + }, + { + "epoch": 4.430190389845875, + "grad_norm": 0.11388904269806488, + "learning_rate": 3.123413962931299e-06, + "loss": 0.8602, + "step": 9773 + }, + { + "epoch": 4.43064369900272, + "grad_norm": 0.09267255139185787, + "learning_rate": 3.118511886362816e-06, + "loss": 0.845, + "step": 9774 + }, + { + "epoch": 4.431097008159565, + "grad_norm": 0.10039317777803003, + "learning_rate": 3.1136135035837633e-06, + "loss": 0.853, + "step": 9775 + }, + { + "epoch": 4.43155031731641, + "grad_norm": 0.10915904053079255, + "learning_rate": 3.108718815084717e-06, + "loss": 0.8685, + "step": 9776 + }, + { + "epoch": 4.4320036264732545, + "grad_norm": 0.09065160859662667, + "learning_rate": 3.1038278213559112e-06, + "loss": 0.8589, + "step": 9777 + }, + { + "epoch": 4.4324569356301, + "grad_norm": 0.09912757695575959, + "learning_rate": 3.098940522887177e-06, + "loss": 0.8451, + "step": 9778 + }, + { + "epoch": 4.432910244786945, + "grad_norm": 0.09891097031211332, + "learning_rate": 3.094056920168007e-06, + "loss": 0.8691, + "step": 9779 + }, + { + "epoch": 4.4333635539437894, + "grad_norm": 0.11061656197581696, + "learning_rate": 3.089177013687503e-06, + "loss": 0.8562, + "step": 9780 + }, + { + "epoch": 4.433816863100635, + "grad_norm": 0.09998224714576705, + "learning_rate": 3.0843008039343903e-06, + "loss": 0.8595, + "step": 9781 + }, + { + "epoch": 4.43427017225748, + "grad_norm": 0.10558042933120089, + "learning_rate": 3.079428291397055e-06, + "loss": 0.8661, + "step": 9782 + }, + { + "epoch": 4.434723481414324, + "grad_norm": 0.10673004339557234, + "learning_rate": 3.074559476563477e-06, + "loss": 0.853, + "step": 9783 + }, + { + "epoch": 4.43517679057117, + "grad_norm": 0.09457700042252078, + "learning_rate": 3.0696943599212915e-06, + "loss": 0.8478, + "step": 9784 + }, + { + "epoch": 4.435630099728015, + "grad_norm": 0.09915679646504516, + "learning_rate": 3.0648329419577493e-06, + "loss": 0.8679, + "step": 9785 + }, + { + "epoch": 4.436083408884859, + "grad_norm": 0.09681904705126486, + "learning_rate": 3.0599752231597457e-06, + "loss": 0.8697, + "step": 9786 + }, + { + "epoch": 4.436536718041705, + "grad_norm": 0.08577150301262598, + "learning_rate": 3.05512120401378e-06, + "loss": 0.8567, + "step": 9787 + }, + { + "epoch": 4.436990027198549, + "grad_norm": 0.09131097826555531, + "learning_rate": 3.05027088500601e-06, + "loss": 0.8507, + "step": 9788 + }, + { + "epoch": 4.437443336355394, + "grad_norm": 0.09191315719819511, + "learning_rate": 3.0454242666222078e-06, + "loss": 0.8468, + "step": 9789 + }, + { + "epoch": 4.4378966455122395, + "grad_norm": 0.0978098289450232, + "learning_rate": 3.0405813493477622e-06, + "loss": 0.8503, + "step": 9790 + }, + { + "epoch": 4.438349954669084, + "grad_norm": 0.08466808310152325, + "learning_rate": 3.0357421336677252e-06, + "loss": 0.8719, + "step": 9791 + }, + { + "epoch": 4.438803263825929, + "grad_norm": 0.0939283662030847, + "learning_rate": 3.030906620066749e-06, + "loss": 0.844, + "step": 9792 + }, + { + "epoch": 4.439256572982774, + "grad_norm": 0.10196304543846288, + "learning_rate": 3.026074809029118e-06, + "loss": 0.8702, + "step": 9793 + }, + { + "epoch": 4.439709882139619, + "grad_norm": 0.09694075454483896, + "learning_rate": 3.0212467010387648e-06, + "loss": 0.8425, + "step": 9794 + }, + { + "epoch": 4.440163191296464, + "grad_norm": 0.10642795376050981, + "learning_rate": 3.0164222965792355e-06, + "loss": 0.8447, + "step": 9795 + }, + { + "epoch": 4.440616500453309, + "grad_norm": 0.08949875139694335, + "learning_rate": 3.0116015961337e-06, + "loss": 0.8588, + "step": 9796 + }, + { + "epoch": 4.441069809610154, + "grad_norm": 0.10195312274270375, + "learning_rate": 3.0067846001849753e-06, + "loss": 0.8406, + "step": 9797 + }, + { + "epoch": 4.441523118766999, + "grad_norm": 0.11080134584942648, + "learning_rate": 3.001971309215486e-06, + "loss": 0.8543, + "step": 9798 + }, + { + "epoch": 4.441976427923844, + "grad_norm": 0.09599996223468339, + "learning_rate": 2.9971617237073157e-06, + "loss": 0.8321, + "step": 9799 + }, + { + "epoch": 4.442429737080689, + "grad_norm": 0.09492513023360381, + "learning_rate": 2.9923558441421383e-06, + "loss": 0.8577, + "step": 9800 + }, + { + "epoch": 4.442883046237534, + "grad_norm": 0.09673875243133613, + "learning_rate": 2.9875536710012974e-06, + "loss": 0.8512, + "step": 9801 + }, + { + "epoch": 4.443336355394379, + "grad_norm": 0.09972727950810681, + "learning_rate": 2.982755204765724e-06, + "loss": 0.8576, + "step": 9802 + }, + { + "epoch": 4.443789664551224, + "grad_norm": 0.09887202990138233, + "learning_rate": 2.977960445916015e-06, + "loss": 0.8402, + "step": 9803 + }, + { + "epoch": 4.444242973708069, + "grad_norm": 0.0986590407127255, + "learning_rate": 2.9731693949323693e-06, + "loss": 0.8586, + "step": 9804 + }, + { + "epoch": 4.444696282864914, + "grad_norm": 0.09699533073641962, + "learning_rate": 2.968382052294634e-06, + "loss": 0.8407, + "step": 9805 + }, + { + "epoch": 4.4451495920217585, + "grad_norm": 0.10219219659671194, + "learning_rate": 2.9635984184822695e-06, + "loss": 0.8583, + "step": 9806 + }, + { + "epoch": 4.445602901178604, + "grad_norm": 0.09461433014932304, + "learning_rate": 2.958818493974365e-06, + "loss": 0.8797, + "step": 9807 + }, + { + "epoch": 4.446056210335449, + "grad_norm": 0.13382920392572473, + "learning_rate": 2.954042279249656e-06, + "loss": 0.8689, + "step": 9808 + }, + { + "epoch": 4.4465095194922934, + "grad_norm": 0.11394606667448955, + "learning_rate": 2.949269774786481e-06, + "loss": 0.8757, + "step": 9809 + }, + { + "epoch": 4.446962828649139, + "grad_norm": 0.0987973530497954, + "learning_rate": 2.9445009810628346e-06, + "loss": 0.8541, + "step": 9810 + }, + { + "epoch": 4.447416137805984, + "grad_norm": 0.10120029784699906, + "learning_rate": 2.939735898556322e-06, + "loss": 0.847, + "step": 9811 + }, + { + "epoch": 4.447869446962828, + "grad_norm": 0.09993735294221158, + "learning_rate": 2.934974527744179e-06, + "loss": 0.8474, + "step": 9812 + }, + { + "epoch": 4.448322756119674, + "grad_norm": 0.10204804276275686, + "learning_rate": 2.930216869103255e-06, + "loss": 0.8559, + "step": 9813 + }, + { + "epoch": 4.448776065276519, + "grad_norm": 0.08842197328634625, + "learning_rate": 2.9254629231100716e-06, + "loss": 0.8471, + "step": 9814 + }, + { + "epoch": 4.449229374433363, + "grad_norm": 0.11073744533902806, + "learning_rate": 2.920712690240728e-06, + "loss": 0.846, + "step": 9815 + }, + { + "epoch": 4.449682683590209, + "grad_norm": 0.10336390413990273, + "learning_rate": 2.9159661709709852e-06, + "loss": 0.8497, + "step": 9816 + }, + { + "epoch": 4.450135992747054, + "grad_norm": 0.09173579637942562, + "learning_rate": 2.9112233657762148e-06, + "loss": 0.8522, + "step": 9817 + }, + { + "epoch": 4.450589301903898, + "grad_norm": 0.09489741668768972, + "learning_rate": 2.906484275131436e-06, + "loss": 0.8614, + "step": 9818 + }, + { + "epoch": 4.4510426110607435, + "grad_norm": 0.08413940431918063, + "learning_rate": 2.9017488995112608e-06, + "loss": 0.8427, + "step": 9819 + }, + { + "epoch": 4.451495920217589, + "grad_norm": 0.09864943872877066, + "learning_rate": 2.897017239389972e-06, + "loss": 0.8614, + "step": 9820 + }, + { + "epoch": 4.451949229374433, + "grad_norm": 0.09791187662433061, + "learning_rate": 2.8922892952414527e-06, + "loss": 0.8628, + "step": 9821 + }, + { + "epoch": 4.452402538531278, + "grad_norm": 0.09145485326710591, + "learning_rate": 2.8875650675392086e-06, + "loss": 0.8583, + "step": 9822 + }, + { + "epoch": 4.452855847688124, + "grad_norm": 0.09325625247205645, + "learning_rate": 2.882844556756399e-06, + "loss": 0.8719, + "step": 9823 + }, + { + "epoch": 4.453309156844968, + "grad_norm": 0.09209951856638655, + "learning_rate": 2.878127763365788e-06, + "loss": 0.8556, + "step": 9824 + }, + { + "epoch": 4.453762466001813, + "grad_norm": 0.10331513465910006, + "learning_rate": 2.873414687839788e-06, + "loss": 0.8521, + "step": 9825 + }, + { + "epoch": 4.454215775158659, + "grad_norm": 0.09181353142720594, + "learning_rate": 2.8687053306504144e-06, + "loss": 0.8283, + "step": 9826 + }, + { + "epoch": 4.454669084315503, + "grad_norm": 0.10194446831682298, + "learning_rate": 2.8639996922693325e-06, + "loss": 0.8512, + "step": 9827 + }, + { + "epoch": 4.455122393472348, + "grad_norm": 0.0921352359608796, + "learning_rate": 2.8592977731678194e-06, + "loss": 0.8722, + "step": 9828 + }, + { + "epoch": 4.455575702629194, + "grad_norm": 0.08748221355222349, + "learning_rate": 2.854599573816792e-06, + "loss": 0.8617, + "step": 9829 + }, + { + "epoch": 4.456029011786038, + "grad_norm": 0.09301592731580965, + "learning_rate": 2.8499050946867846e-06, + "loss": 0.8355, + "step": 9830 + }, + { + "epoch": 4.456482320942883, + "grad_norm": 0.10778828863927568, + "learning_rate": 2.845214336247968e-06, + "loss": 0.8322, + "step": 9831 + }, + { + "epoch": 4.4569356300997285, + "grad_norm": 0.0976486261967195, + "learning_rate": 2.8405272989701215e-06, + "loss": 0.8695, + "step": 9832 + }, + { + "epoch": 4.457388939256573, + "grad_norm": 0.09409332844956186, + "learning_rate": 2.8358439833226836e-06, + "loss": 0.8536, + "step": 9833 + }, + { + "epoch": 4.457842248413418, + "grad_norm": 0.09737098838867723, + "learning_rate": 2.831164389774688e-06, + "loss": 0.8517, + "step": 9834 + }, + { + "epoch": 4.4582955575702625, + "grad_norm": 0.09700614669816326, + "learning_rate": 2.826488518794821e-06, + "loss": 0.8674, + "step": 9835 + }, + { + "epoch": 4.458748866727108, + "grad_norm": 0.09005934951493812, + "learning_rate": 2.8218163708513847e-06, + "loss": 0.839, + "step": 9836 + }, + { + "epoch": 4.459202175883953, + "grad_norm": 0.09715632305855748, + "learning_rate": 2.817147946412293e-06, + "loss": 0.8529, + "step": 9837 + }, + { + "epoch": 4.4596554850407975, + "grad_norm": 0.09645047169232633, + "learning_rate": 2.8124832459451192e-06, + "loss": 0.8468, + "step": 9838 + }, + { + "epoch": 4.460108794197643, + "grad_norm": 0.09504894051272196, + "learning_rate": 2.807822269917031e-06, + "loss": 0.8637, + "step": 9839 + }, + { + "epoch": 4.460562103354488, + "grad_norm": 0.09094650600043051, + "learning_rate": 2.80316501879486e-06, + "loss": 0.8533, + "step": 9840 + }, + { + "epoch": 4.461015412511332, + "grad_norm": 0.08723174425740271, + "learning_rate": 2.798511493045024e-06, + "loss": 0.8435, + "step": 9841 + }, + { + "epoch": 4.461468721668178, + "grad_norm": 0.09301923135505631, + "learning_rate": 2.7938616931335994e-06, + "loss": 0.8518, + "step": 9842 + }, + { + "epoch": 4.461922030825023, + "grad_norm": 0.09175170041253297, + "learning_rate": 2.789215619526271e-06, + "loss": 0.8597, + "step": 9843 + }, + { + "epoch": 4.462375339981867, + "grad_norm": 0.09470159133265758, + "learning_rate": 2.78457327268836e-06, + "loss": 0.8528, + "step": 9844 + }, + { + "epoch": 4.462828649138713, + "grad_norm": 0.10393960949662646, + "learning_rate": 2.779934653084806e-06, + "loss": 0.8611, + "step": 9845 + }, + { + "epoch": 4.463281958295558, + "grad_norm": 0.09341483855892176, + "learning_rate": 2.7752997611801925e-06, + "loss": 0.8522, + "step": 9846 + }, + { + "epoch": 4.463735267452402, + "grad_norm": 0.09743672675439227, + "learning_rate": 2.7706685974387083e-06, + "loss": 0.8539, + "step": 9847 + }, + { + "epoch": 4.4641885766092475, + "grad_norm": 0.10104872190646559, + "learning_rate": 2.766041162324169e-06, + "loss": 0.8465, + "step": 9848 + }, + { + "epoch": 4.464641885766093, + "grad_norm": 0.08911639449185851, + "learning_rate": 2.761417456300044e-06, + "loss": 0.8452, + "step": 9849 + }, + { + "epoch": 4.465095194922937, + "grad_norm": 0.09487655424798644, + "learning_rate": 2.7567974798294073e-06, + "loss": 0.8567, + "step": 9850 + }, + { + "epoch": 4.465548504079782, + "grad_norm": 0.12033869308057313, + "learning_rate": 2.7521812333749553e-06, + "loss": 0.8645, + "step": 9851 + }, + { + "epoch": 4.466001813236628, + "grad_norm": 0.10950466556691843, + "learning_rate": 2.747568717399012e-06, + "loss": 0.8309, + "step": 9852 + }, + { + "epoch": 4.466455122393472, + "grad_norm": 0.10410776322763514, + "learning_rate": 2.742959932363558e-06, + "loss": 0.8494, + "step": 9853 + }, + { + "epoch": 4.466908431550317, + "grad_norm": 0.12036445692177834, + "learning_rate": 2.7383548787301537e-06, + "loss": 0.8578, + "step": 9854 + }, + { + "epoch": 4.467361740707163, + "grad_norm": 0.10548359809209708, + "learning_rate": 2.7337535569600217e-06, + "loss": 0.8568, + "step": 9855 + }, + { + "epoch": 4.467815049864007, + "grad_norm": 0.10796419262645278, + "learning_rate": 2.7291559675139924e-06, + "loss": 0.8419, + "step": 9856 + }, + { + "epoch": 4.468268359020852, + "grad_norm": 0.09817977484298335, + "learning_rate": 2.7245621108525333e-06, + "loss": 0.8471, + "step": 9857 + }, + { + "epoch": 4.468721668177698, + "grad_norm": 0.12217156224877203, + "learning_rate": 2.7199719874357255e-06, + "loss": 0.8465, + "step": 9858 + }, + { + "epoch": 4.469174977334542, + "grad_norm": 0.09327830857157184, + "learning_rate": 2.71538559772329e-06, + "loss": 0.8699, + "step": 9859 + }, + { + "epoch": 4.469628286491387, + "grad_norm": 0.10432114586739148, + "learning_rate": 2.710802942174562e-06, + "loss": 0.8512, + "step": 9860 + }, + { + "epoch": 4.4700815956482325, + "grad_norm": 0.1123484154964966, + "learning_rate": 2.706224021248516e-06, + "loss": 0.8745, + "step": 9861 + }, + { + "epoch": 4.470534904805077, + "grad_norm": 0.1007055357944021, + "learning_rate": 2.701648835403736e-06, + "loss": 0.8619, + "step": 9862 + }, + { + "epoch": 4.470988213961922, + "grad_norm": 0.10860850297377181, + "learning_rate": 2.6970773850984388e-06, + "loss": 0.8605, + "step": 9863 + }, + { + "epoch": 4.471441523118767, + "grad_norm": 0.12852550898867424, + "learning_rate": 2.6925096707904797e-06, + "loss": 0.8441, + "step": 9864 + }, + { + "epoch": 4.471894832275612, + "grad_norm": 0.09900551249733269, + "learning_rate": 2.6879456929373104e-06, + "loss": 0.8616, + "step": 9865 + }, + { + "epoch": 4.472348141432457, + "grad_norm": 0.09769833998997382, + "learning_rate": 2.68338545199605e-06, + "loss": 0.8599, + "step": 9866 + }, + { + "epoch": 4.4728014505893015, + "grad_norm": 0.10821005491546797, + "learning_rate": 2.6788289484233996e-06, + "loss": 0.8658, + "step": 9867 + }, + { + "epoch": 4.473254759746147, + "grad_norm": 0.10845038914179912, + "learning_rate": 2.6742761826757237e-06, + "loss": 0.8563, + "step": 9868 + }, + { + "epoch": 4.473708068902992, + "grad_norm": 0.0909588525949302, + "learning_rate": 2.6697271552089808e-06, + "loss": 0.862, + "step": 9869 + }, + { + "epoch": 4.474161378059836, + "grad_norm": 0.10055485778311472, + "learning_rate": 2.665181866478781e-06, + "loss": 0.8378, + "step": 9870 + }, + { + "epoch": 4.474614687216682, + "grad_norm": 0.11033577376521939, + "learning_rate": 2.660640316940333e-06, + "loss": 0.8621, + "step": 9871 + }, + { + "epoch": 4.475067996373527, + "grad_norm": 0.09420682705616118, + "learning_rate": 2.6561025070485034e-06, + "loss": 0.864, + "step": 9872 + }, + { + "epoch": 4.475521305530371, + "grad_norm": 0.08742469922703536, + "learning_rate": 2.651568437257752e-06, + "loss": 0.8553, + "step": 9873 + }, + { + "epoch": 4.475974614687217, + "grad_norm": 0.09545635905546843, + "learning_rate": 2.6470381080221906e-06, + "loss": 0.8339, + "step": 9874 + }, + { + "epoch": 4.476427923844062, + "grad_norm": 0.10551080412470897, + "learning_rate": 2.6425115197955364e-06, + "loss": 0.8518, + "step": 9875 + }, + { + "epoch": 4.476881233000906, + "grad_norm": 0.09970782005370193, + "learning_rate": 2.637988673031151e-06, + "loss": 0.8432, + "step": 9876 + }, + { + "epoch": 4.4773345421577515, + "grad_norm": 0.10220917921876188, + "learning_rate": 2.63346956818201e-06, + "loss": 0.8532, + "step": 9877 + }, + { + "epoch": 4.477787851314597, + "grad_norm": 0.09690762174001893, + "learning_rate": 2.6289542057006978e-06, + "loss": 0.8402, + "step": 9878 + }, + { + "epoch": 4.478241160471441, + "grad_norm": 0.09130974840282878, + "learning_rate": 2.624442586039462e-06, + "loss": 0.8396, + "step": 9879 + }, + { + "epoch": 4.478694469628286, + "grad_norm": 0.10212495680719759, + "learning_rate": 2.619934709650136e-06, + "loss": 0.8462, + "step": 9880 + }, + { + "epoch": 4.479147778785132, + "grad_norm": 0.09698468965358502, + "learning_rate": 2.615430576984217e-06, + "loss": 0.8522, + "step": 9881 + }, + { + "epoch": 4.479601087941976, + "grad_norm": 0.086270683408742, + "learning_rate": 2.610930188492793e-06, + "loss": 0.8534, + "step": 9882 + }, + { + "epoch": 4.480054397098821, + "grad_norm": 0.13516961956447665, + "learning_rate": 2.6064335446265963e-06, + "loss": 0.858, + "step": 9883 + }, + { + "epoch": 4.480507706255667, + "grad_norm": 0.09716536694040769, + "learning_rate": 2.601940645835974e-06, + "loss": 0.8597, + "step": 9884 + }, + { + "epoch": 4.480961015412511, + "grad_norm": 0.08416996369115035, + "learning_rate": 2.597451492570913e-06, + "loss": 0.8522, + "step": 9885 + }, + { + "epoch": 4.481414324569356, + "grad_norm": 0.10467763504432315, + "learning_rate": 2.592966085280999e-06, + "loss": 0.8744, + "step": 9886 + }, + { + "epoch": 4.481867633726202, + "grad_norm": 0.0913359581555134, + "learning_rate": 2.5884844244154782e-06, + "loss": 0.861, + "step": 9887 + }, + { + "epoch": 4.482320942883046, + "grad_norm": 0.08902607024167221, + "learning_rate": 2.584006510423196e-06, + "loss": 0.8781, + "step": 9888 + }, + { + "epoch": 4.482774252039891, + "grad_norm": 0.08545980871469581, + "learning_rate": 2.57953234375262e-06, + "loss": 0.8448, + "step": 9889 + }, + { + "epoch": 4.4832275611967365, + "grad_norm": 0.09074576136895415, + "learning_rate": 2.575061924851854e-06, + "loss": 0.8613, + "step": 9890 + }, + { + "epoch": 4.483680870353581, + "grad_norm": 0.11049792945962843, + "learning_rate": 2.5705952541686285e-06, + "loss": 0.8611, + "step": 9891 + }, + { + "epoch": 4.484134179510426, + "grad_norm": 0.10374092658908003, + "learning_rate": 2.5661323321502974e-06, + "loss": 0.8832, + "step": 9892 + }, + { + "epoch": 4.484587488667271, + "grad_norm": 0.10470161825312851, + "learning_rate": 2.561673159243818e-06, + "loss": 0.8662, + "step": 9893 + }, + { + "epoch": 4.485040797824116, + "grad_norm": 0.09868204955145479, + "learning_rate": 2.557217735895807e-06, + "loss": 0.8714, + "step": 9894 + }, + { + "epoch": 4.485494106980961, + "grad_norm": 0.091389212637639, + "learning_rate": 2.5527660625524763e-06, + "loss": 0.8682, + "step": 9895 + }, + { + "epoch": 4.485947416137806, + "grad_norm": 0.09403046669028392, + "learning_rate": 2.5483181396596914e-06, + "loss": 0.8612, + "step": 9896 + }, + { + "epoch": 4.486400725294651, + "grad_norm": 0.09295118669632577, + "learning_rate": 2.5438739676629e-06, + "loss": 0.8591, + "step": 9897 + }, + { + "epoch": 4.486854034451496, + "grad_norm": 0.09612049079107882, + "learning_rate": 2.539433547007222e-06, + "loss": 0.8521, + "step": 9898 + }, + { + "epoch": 4.487307343608341, + "grad_norm": 0.09124633155527934, + "learning_rate": 2.534996878137359e-06, + "loss": 0.8492, + "step": 9899 + }, + { + "epoch": 4.487760652765186, + "grad_norm": 0.09444878346326233, + "learning_rate": 2.5305639614976763e-06, + "loss": 0.8435, + "step": 9900 + }, + { + "epoch": 4.488213961922031, + "grad_norm": 0.09369261654700178, + "learning_rate": 2.526134797532129e-06, + "loss": 0.8497, + "step": 9901 + }, + { + "epoch": 4.488667271078876, + "grad_norm": 0.09182813055426814, + "learning_rate": 2.521709386684319e-06, + "loss": 0.8682, + "step": 9902 + }, + { + "epoch": 4.489120580235721, + "grad_norm": 0.09295029718742555, + "learning_rate": 2.5172877293974595e-06, + "loss": 0.8546, + "step": 9903 + }, + { + "epoch": 4.489573889392566, + "grad_norm": 0.0881049400763621, + "learning_rate": 2.512869826114388e-06, + "loss": 0.8637, + "step": 9904 + }, + { + "epoch": 4.490027198549411, + "grad_norm": 0.09666240659718033, + "learning_rate": 2.5084556772775814e-06, + "loss": 0.8658, + "step": 9905 + }, + { + "epoch": 4.4904805077062555, + "grad_norm": 0.09799390410225377, + "learning_rate": 2.5040452833291173e-06, + "loss": 0.8723, + "step": 9906 + }, + { + "epoch": 4.490933816863101, + "grad_norm": 0.09677587612851023, + "learning_rate": 2.499638644710731e-06, + "loss": 0.8809, + "step": 9907 + }, + { + "epoch": 4.491387126019946, + "grad_norm": 0.09469674932783308, + "learning_rate": 2.4952357618637367e-06, + "loss": 0.8606, + "step": 9908 + }, + { + "epoch": 4.49184043517679, + "grad_norm": 0.09026139241666141, + "learning_rate": 2.490836635229106e-06, + "loss": 0.8467, + "step": 9909 + }, + { + "epoch": 4.492293744333636, + "grad_norm": 0.09608141435963435, + "learning_rate": 2.486441265247419e-06, + "loss": 0.8544, + "step": 9910 + }, + { + "epoch": 4.49274705349048, + "grad_norm": 0.08835866793107541, + "learning_rate": 2.482049652358898e-06, + "loss": 0.871, + "step": 9911 + }, + { + "epoch": 4.493200362647325, + "grad_norm": 0.09384829781616197, + "learning_rate": 2.477661797003359e-06, + "loss": 0.8575, + "step": 9912 + }, + { + "epoch": 4.493653671804171, + "grad_norm": 0.09412320182175023, + "learning_rate": 2.4732776996202778e-06, + "loss": 0.8463, + "step": 9913 + }, + { + "epoch": 4.494106980961015, + "grad_norm": 0.10869650878505681, + "learning_rate": 2.468897360648712e-06, + "loss": 0.8553, + "step": 9914 + }, + { + "epoch": 4.49456029011786, + "grad_norm": 0.09537393102872385, + "learning_rate": 2.4645207805273863e-06, + "loss": 0.8684, + "step": 9915 + }, + { + "epoch": 4.495013599274706, + "grad_norm": 0.08643480611068434, + "learning_rate": 2.460147959694612e-06, + "loss": 0.8752, + "step": 9916 + }, + { + "epoch": 4.49546690843155, + "grad_norm": 0.08745585491739184, + "learning_rate": 2.4557788985883548e-06, + "loss": 0.8525, + "step": 9917 + }, + { + "epoch": 4.495920217588395, + "grad_norm": 0.0959486386950686, + "learning_rate": 2.4514135976461794e-06, + "loss": 0.8505, + "step": 9918 + }, + { + "epoch": 4.4963735267452405, + "grad_norm": 0.10100671487114553, + "learning_rate": 2.447052057305279e-06, + "loss": 0.8652, + "step": 9919 + }, + { + "epoch": 4.496826835902085, + "grad_norm": 0.08504505514048064, + "learning_rate": 2.442694278002491e-06, + "loss": 0.8694, + "step": 9920 + }, + { + "epoch": 4.49728014505893, + "grad_norm": 0.09543964269934767, + "learning_rate": 2.438340260174239e-06, + "loss": 0.8517, + "step": 9921 + }, + { + "epoch": 4.497733454215775, + "grad_norm": 0.09572817545821964, + "learning_rate": 2.43399000425661e-06, + "loss": 0.8686, + "step": 9922 + }, + { + "epoch": 4.49818676337262, + "grad_norm": 0.08866545211690545, + "learning_rate": 2.4296435106852823e-06, + "loss": 0.8615, + "step": 9923 + }, + { + "epoch": 4.498640072529465, + "grad_norm": 0.0866491316544971, + "learning_rate": 2.4253007798955784e-06, + "loss": 0.8589, + "step": 9924 + }, + { + "epoch": 4.49909338168631, + "grad_norm": 0.09587632316596524, + "learning_rate": 2.4209618123224266e-06, + "loss": 0.8348, + "step": 9925 + }, + { + "epoch": 4.499546690843155, + "grad_norm": 0.09603532226773209, + "learning_rate": 2.416626608400403e-06, + "loss": 0.8753, + "step": 9926 + }, + { + "epoch": 4.5, + "grad_norm": 0.08992806107144233, + "learning_rate": 2.4122951685636674e-06, + "loss": 0.8645, + "step": 9927 + }, + { + "epoch": 4.500453309156845, + "grad_norm": 0.09104356410255758, + "learning_rate": 2.4079674932460463e-06, + "loss": 0.845, + "step": 9928 + }, + { + "epoch": 4.50090661831369, + "grad_norm": 0.09568643891992576, + "learning_rate": 2.4036435828809525e-06, + "loss": 0.8529, + "step": 9929 + }, + { + "epoch": 4.501359927470535, + "grad_norm": 0.09337038710458347, + "learning_rate": 2.399323437901457e-06, + "loss": 0.8799, + "step": 9930 + }, + { + "epoch": 4.50181323662738, + "grad_norm": 0.09246855404604477, + "learning_rate": 2.3950070587402195e-06, + "loss": 0.8551, + "step": 9931 + }, + { + "epoch": 4.502266545784225, + "grad_norm": 0.09333024606969682, + "learning_rate": 2.3906944458295467e-06, + "loss": 0.8377, + "step": 9932 + }, + { + "epoch": 4.50271985494107, + "grad_norm": 0.08856359895620843, + "learning_rate": 2.3863855996013596e-06, + "loss": 0.8678, + "step": 9933 + }, + { + "epoch": 4.503173164097915, + "grad_norm": 0.08984919319188935, + "learning_rate": 2.382080520487193e-06, + "loss": 0.8415, + "step": 9934 + }, + { + "epoch": 4.5036264732547595, + "grad_norm": 0.10759497780168473, + "learning_rate": 2.3777792089182272e-06, + "loss": 0.8497, + "step": 9935 + }, + { + "epoch": 4.504079782411605, + "grad_norm": 0.09177803189275212, + "learning_rate": 2.3734816653252324e-06, + "loss": 0.8543, + "step": 9936 + }, + { + "epoch": 4.50453309156845, + "grad_norm": 0.09256211977823034, + "learning_rate": 2.3691878901386424e-06, + "loss": 0.8413, + "step": 9937 + }, + { + "epoch": 4.504986400725294, + "grad_norm": 0.08910620754482523, + "learning_rate": 2.3648978837884683e-06, + "loss": 0.8578, + "step": 9938 + }, + { + "epoch": 4.50543970988214, + "grad_norm": 0.09285606223680866, + "learning_rate": 2.3606116467043892e-06, + "loss": 0.847, + "step": 9939 + }, + { + "epoch": 4.505893019038984, + "grad_norm": 0.09498543629576794, + "learning_rate": 2.356329179315666e-06, + "loss": 0.8611, + "step": 9940 + }, + { + "epoch": 4.506346328195829, + "grad_norm": 0.1024025056486236, + "learning_rate": 2.3520504820512134e-06, + "loss": 0.8537, + "step": 9941 + }, + { + "epoch": 4.506799637352675, + "grad_norm": 0.09668901432024933, + "learning_rate": 2.3477755553395464e-06, + "loss": 0.847, + "step": 9942 + }, + { + "epoch": 4.507252946509519, + "grad_norm": 0.09909065593428633, + "learning_rate": 2.3435043996088202e-06, + "loss": 0.8362, + "step": 9943 + }, + { + "epoch": 4.507706255666364, + "grad_norm": 0.11090588664770572, + "learning_rate": 2.3392370152867995e-06, + "loss": 0.8447, + "step": 9944 + }, + { + "epoch": 4.50815956482321, + "grad_norm": 0.10118775541915277, + "learning_rate": 2.334973402800871e-06, + "loss": 0.8425, + "step": 9945 + }, + { + "epoch": 4.508612873980054, + "grad_norm": 0.08994266451504437, + "learning_rate": 2.3307135625780532e-06, + "loss": 0.8602, + "step": 9946 + }, + { + "epoch": 4.509066183136899, + "grad_norm": 0.08355942047635014, + "learning_rate": 2.3264574950449827e-06, + "loss": 0.842, + "step": 9947 + }, + { + "epoch": 4.5095194922937445, + "grad_norm": 0.08918165677133595, + "learning_rate": 2.3222052006279138e-06, + "loss": 0.8715, + "step": 9948 + }, + { + "epoch": 4.509972801450589, + "grad_norm": 0.08529393024316442, + "learning_rate": 2.31795667975272e-06, + "loss": 0.8519, + "step": 9949 + }, + { + "epoch": 4.510426110607434, + "grad_norm": 0.09471859972149724, + "learning_rate": 2.3137119328449175e-06, + "loss": 0.8721, + "step": 9950 + }, + { + "epoch": 4.510879419764279, + "grad_norm": 0.09385411234033919, + "learning_rate": 2.3094709603296163e-06, + "loss": 0.8414, + "step": 9951 + }, + { + "epoch": 4.511332728921124, + "grad_norm": 0.09060961897853108, + "learning_rate": 2.3052337626315734e-06, + "loss": 0.8626, + "step": 9952 + }, + { + "epoch": 4.511786038077969, + "grad_norm": 0.08988474266028963, + "learning_rate": 2.301000340175148e-06, + "loss": 0.8503, + "step": 9953 + }, + { + "epoch": 4.512239347234814, + "grad_norm": 0.11719209443495736, + "learning_rate": 2.2967706933843425e-06, + "loss": 0.8569, + "step": 9954 + }, + { + "epoch": 4.512692656391659, + "grad_norm": 0.09136732342815328, + "learning_rate": 2.2925448226827473e-06, + "loss": 0.8373, + "step": 9955 + }, + { + "epoch": 4.513145965548504, + "grad_norm": 0.0908556276830977, + "learning_rate": 2.2883227284936195e-06, + "loss": 0.8497, + "step": 9956 + }, + { + "epoch": 4.513599274705349, + "grad_norm": 0.09466763982366468, + "learning_rate": 2.2841044112397936e-06, + "loss": 0.8626, + "step": 9957 + }, + { + "epoch": 4.514052583862194, + "grad_norm": 0.08727694116885013, + "learning_rate": 2.2798898713437634e-06, + "loss": 0.8422, + "step": 9958 + }, + { + "epoch": 4.514505893019039, + "grad_norm": 0.08699802081633767, + "learning_rate": 2.275679109227622e-06, + "loss": 0.8561, + "step": 9959 + }, + { + "epoch": 4.514959202175884, + "grad_norm": 0.08760163650280263, + "learning_rate": 2.2714721253130814e-06, + "loss": 0.8566, + "step": 9960 + }, + { + "epoch": 4.515412511332729, + "grad_norm": 0.0831144893555214, + "learning_rate": 2.2672689200214924e-06, + "loss": 0.8764, + "step": 9961 + }, + { + "epoch": 4.515865820489574, + "grad_norm": 0.08675983906366062, + "learning_rate": 2.263069493773813e-06, + "loss": 0.8599, + "step": 9962 + }, + { + "epoch": 4.516319129646419, + "grad_norm": 0.08698924138084026, + "learning_rate": 2.2588738469906303e-06, + "loss": 0.8535, + "step": 9963 + }, + { + "epoch": 4.5167724388032635, + "grad_norm": 0.08971310041935335, + "learning_rate": 2.2546819800921505e-06, + "loss": 0.838, + "step": 9964 + }, + { + "epoch": 4.517225747960109, + "grad_norm": 0.1054884326272803, + "learning_rate": 2.2504938934982113e-06, + "loss": 0.8497, + "step": 9965 + }, + { + "epoch": 4.517679057116954, + "grad_norm": 0.08688767090743625, + "learning_rate": 2.2463095876282415e-06, + "loss": 0.8557, + "step": 9966 + }, + { + "epoch": 4.518132366273798, + "grad_norm": 0.08647185991539887, + "learning_rate": 2.2421290629013327e-06, + "loss": 0.8382, + "step": 9967 + }, + { + "epoch": 4.518585675430644, + "grad_norm": 0.1215642324301731, + "learning_rate": 2.237952319736154e-06, + "loss": 0.8537, + "step": 9968 + }, + { + "epoch": 4.519038984587489, + "grad_norm": 0.10213479968781866, + "learning_rate": 2.233779358551038e-06, + "loss": 0.8809, + "step": 9969 + }, + { + "epoch": 4.519492293744333, + "grad_norm": 0.09865996918530962, + "learning_rate": 2.229610179763908e-06, + "loss": 0.8358, + "step": 9970 + }, + { + "epoch": 4.519945602901179, + "grad_norm": 0.09150301714369372, + "learning_rate": 2.2254447837923322e-06, + "loss": 0.8475, + "step": 9971 + }, + { + "epoch": 4.520398912058024, + "grad_norm": 0.1051984843142184, + "learning_rate": 2.22128317105347e-06, + "loss": 0.8542, + "step": 9972 + }, + { + "epoch": 4.520852221214868, + "grad_norm": 0.09587756334259054, + "learning_rate": 2.2171253419641304e-06, + "loss": 0.8521, + "step": 9973 + }, + { + "epoch": 4.521305530371714, + "grad_norm": 0.09638633114021006, + "learning_rate": 2.212971296940736e-06, + "loss": 0.8575, + "step": 9974 + }, + { + "epoch": 4.521758839528559, + "grad_norm": 0.09318272163464963, + "learning_rate": 2.208821036399309e-06, + "loss": 0.8876, + "step": 9975 + }, + { + "epoch": 4.522212148685403, + "grad_norm": 0.08879226504097133, + "learning_rate": 2.204674560755531e-06, + "loss": 0.8669, + "step": 9976 + }, + { + "epoch": 4.5226654578422485, + "grad_norm": 0.1052145685132351, + "learning_rate": 2.2005318704246648e-06, + "loss": 0.8615, + "step": 9977 + }, + { + "epoch": 4.523118766999094, + "grad_norm": 0.10634789938206603, + "learning_rate": 2.196392965821632e-06, + "loss": 0.8637, + "step": 9978 + }, + { + "epoch": 4.523572076155938, + "grad_norm": 0.08288722567366846, + "learning_rate": 2.1922578473609367e-06, + "loss": 0.8433, + "step": 9979 + }, + { + "epoch": 4.524025385312783, + "grad_norm": 0.08679611448613803, + "learning_rate": 2.188126515456741e-06, + "loss": 0.8471, + "step": 9980 + }, + { + "epoch": 4.524478694469629, + "grad_norm": 0.09442166604677114, + "learning_rate": 2.183998970522798e-06, + "loss": 0.8636, + "step": 9981 + }, + { + "epoch": 4.524932003626473, + "grad_norm": 0.0927174157241285, + "learning_rate": 2.179875212972502e-06, + "loss": 0.8636, + "step": 9982 + }, + { + "epoch": 4.525385312783318, + "grad_norm": 0.08371483705479638, + "learning_rate": 2.1757552432188466e-06, + "loss": 0.8596, + "step": 9983 + }, + { + "epoch": 4.525838621940164, + "grad_norm": 0.08952303288146139, + "learning_rate": 2.17163906167448e-06, + "loss": 0.8332, + "step": 9984 + }, + { + "epoch": 4.526291931097008, + "grad_norm": 0.08808130104040439, + "learning_rate": 2.167526668751623e-06, + "loss": 0.8536, + "step": 9985 + }, + { + "epoch": 4.526745240253853, + "grad_norm": 0.0932044013746318, + "learning_rate": 2.1634180648621684e-06, + "loss": 0.8548, + "step": 9986 + }, + { + "epoch": 4.5271985494106985, + "grad_norm": 0.08879657688042748, + "learning_rate": 2.159313250417583e-06, + "loss": 0.8542, + "step": 9987 + }, + { + "epoch": 4.527651858567543, + "grad_norm": 0.08839936791411411, + "learning_rate": 2.1552122258289953e-06, + "loss": 0.8385, + "step": 9988 + }, + { + "epoch": 4.528105167724388, + "grad_norm": 0.08801926254362433, + "learning_rate": 2.1511149915071215e-06, + "loss": 0.8507, + "step": 9989 + }, + { + "epoch": 4.5285584768812335, + "grad_norm": 0.08874390628897406, + "learning_rate": 2.1470215478623226e-06, + "loss": 0.8612, + "step": 9990 + }, + { + "epoch": 4.529011786038078, + "grad_norm": 0.08181295104347748, + "learning_rate": 2.142931895304563e-06, + "loss": 0.8684, + "step": 9991 + }, + { + "epoch": 4.529465095194923, + "grad_norm": 0.09117087565204406, + "learning_rate": 2.1388460342434314e-06, + "loss": 0.8174, + "step": 9992 + }, + { + "epoch": 4.529918404351768, + "grad_norm": 0.09536637671104295, + "learning_rate": 2.1347639650881426e-06, + "loss": 0.8449, + "step": 9993 + }, + { + "epoch": 4.530371713508613, + "grad_norm": 0.08574773148419682, + "learning_rate": 2.1306856882475246e-06, + "loss": 0.8584, + "step": 9994 + }, + { + "epoch": 4.530825022665458, + "grad_norm": 0.08748878813887043, + "learning_rate": 2.1266112041300336e-06, + "loss": 0.8662, + "step": 9995 + }, + { + "epoch": 4.531278331822302, + "grad_norm": 0.09162737152609456, + "learning_rate": 2.1225405131437338e-06, + "loss": 0.8456, + "step": 9996 + }, + { + "epoch": 4.531731640979148, + "grad_norm": 0.1046608898728702, + "learning_rate": 2.1184736156963304e-06, + "loss": 0.8684, + "step": 9997 + }, + { + "epoch": 4.532184950135993, + "grad_norm": 0.09497950067611681, + "learning_rate": 2.1144105121951154e-06, + "loss": 0.8439, + "step": 9998 + }, + { + "epoch": 4.532638259292837, + "grad_norm": 0.09666101363613044, + "learning_rate": 2.1103512030470386e-06, + "loss": 0.8621, + "step": 9999 + }, + { + "epoch": 4.533091568449683, + "grad_norm": 0.09711722282693266, + "learning_rate": 2.106295688658646e-06, + "loss": 0.8488, + "step": 10000 + }, + { + "epoch": 4.533544877606528, + "grad_norm": 0.08729333434659382, + "learning_rate": 2.102243969436102e-06, + "loss": 0.848, + "step": 10001 + }, + { + "epoch": 4.533998186763372, + "grad_norm": 0.08423367374174692, + "learning_rate": 2.098196045785206e-06, + "loss": 0.8504, + "step": 10002 + }, + { + "epoch": 4.534451495920218, + "grad_norm": 0.0948082207878607, + "learning_rate": 2.0941519181113714e-06, + "loss": 0.8514, + "step": 10003 + }, + { + "epoch": 4.534904805077063, + "grad_norm": 0.09267509094849999, + "learning_rate": 2.0901115868196166e-06, + "loss": 0.8396, + "step": 10004 + }, + { + "epoch": 4.535358114233907, + "grad_norm": 0.09100993406029587, + "learning_rate": 2.0860750523146087e-06, + "loss": 0.8422, + "step": 10005 + }, + { + "epoch": 4.5358114233907525, + "grad_norm": 0.08721043185057882, + "learning_rate": 2.082042315000612e-06, + "loss": 0.8394, + "step": 10006 + }, + { + "epoch": 4.536264732547598, + "grad_norm": 0.10310248664001152, + "learning_rate": 2.078013375281507e-06, + "loss": 0.8604, + "step": 10007 + }, + { + "epoch": 4.536718041704442, + "grad_norm": 0.10123246635499342, + "learning_rate": 2.0739882335608243e-06, + "loss": 0.8407, + "step": 10008 + }, + { + "epoch": 4.537171350861287, + "grad_norm": 0.08994879252690524, + "learning_rate": 2.0699668902416725e-06, + "loss": 0.8586, + "step": 10009 + }, + { + "epoch": 4.537624660018133, + "grad_norm": 0.09033664999777427, + "learning_rate": 2.065949345726819e-06, + "loss": 0.8626, + "step": 10010 + }, + { + "epoch": 4.538077969174977, + "grad_norm": 0.09175349004721876, + "learning_rate": 2.0619356004186165e-06, + "loss": 0.849, + "step": 10011 + }, + { + "epoch": 4.538531278331822, + "grad_norm": 0.09668073970427295, + "learning_rate": 2.0579256547190686e-06, + "loss": 0.8507, + "step": 10012 + }, + { + "epoch": 4.538984587488668, + "grad_norm": 0.09605379655310729, + "learning_rate": 2.0539195090297735e-06, + "loss": 0.8299, + "step": 10013 + }, + { + "epoch": 4.539437896645512, + "grad_norm": 0.09898948404113457, + "learning_rate": 2.049917163751962e-06, + "loss": 0.8589, + "step": 10014 + }, + { + "epoch": 4.539891205802357, + "grad_norm": 0.08449537492662608, + "learning_rate": 2.0459186192864776e-06, + "loss": 0.8631, + "step": 10015 + }, + { + "epoch": 4.5403445149592025, + "grad_norm": 0.08892992519225416, + "learning_rate": 2.041923876033791e-06, + "loss": 0.8559, + "step": 10016 + }, + { + "epoch": 4.540797824116047, + "grad_norm": 0.08716083029207239, + "learning_rate": 2.037932934393987e-06, + "loss": 0.8584, + "step": 10017 + }, + { + "epoch": 4.541251133272892, + "grad_norm": 0.09815506997304142, + "learning_rate": 2.033945794766763e-06, + "loss": 0.8545, + "step": 10018 + }, + { + "epoch": 4.541704442429737, + "grad_norm": 0.09608227115812298, + "learning_rate": 2.0299624575514486e-06, + "loss": 0.8454, + "step": 10019 + }, + { + "epoch": 4.542157751586582, + "grad_norm": 0.0810676374555769, + "learning_rate": 2.0259829231469875e-06, + "loss": 0.8529, + "step": 10020 + }, + { + "epoch": 4.542611060743427, + "grad_norm": 0.09429113060246071, + "learning_rate": 2.0220071919519403e-06, + "loss": 0.8478, + "step": 10021 + }, + { + "epoch": 4.5430643699002715, + "grad_norm": 0.09541133295996454, + "learning_rate": 2.0180352643644906e-06, + "loss": 0.854, + "step": 10022 + }, + { + "epoch": 4.543517679057117, + "grad_norm": 0.09878392461958695, + "learning_rate": 2.0140671407824362e-06, + "loss": 0.8588, + "step": 10023 + }, + { + "epoch": 4.543970988213962, + "grad_norm": 0.09896209069979557, + "learning_rate": 2.0101028216031883e-06, + "loss": 0.846, + "step": 10024 + }, + { + "epoch": 4.544424297370806, + "grad_norm": 0.10766380490447991, + "learning_rate": 2.0061423072238018e-06, + "loss": 0.8676, + "step": 10025 + }, + { + "epoch": 4.544877606527652, + "grad_norm": 0.08763399857434481, + "learning_rate": 2.00218559804092e-06, + "loss": 0.8582, + "step": 10026 + }, + { + "epoch": 4.545330915684497, + "grad_norm": 0.08605705601052795, + "learning_rate": 1.9982326944508257e-06, + "loss": 0.8621, + "step": 10027 + }, + { + "epoch": 4.545784224841341, + "grad_norm": 0.09771019211573952, + "learning_rate": 1.9942835968494113e-06, + "loss": 0.8555, + "step": 10028 + }, + { + "epoch": 4.546237533998187, + "grad_norm": 0.098994017180664, + "learning_rate": 1.9903383056321913e-06, + "loss": 0.8329, + "step": 10029 + }, + { + "epoch": 4.546690843155032, + "grad_norm": 0.08300346128114218, + "learning_rate": 1.9863968211942987e-06, + "loss": 0.8481, + "step": 10030 + }, + { + "epoch": 4.547144152311876, + "grad_norm": 0.09147442481897661, + "learning_rate": 1.982459143930484e-06, + "loss": 0.8565, + "step": 10031 + }, + { + "epoch": 4.547597461468722, + "grad_norm": 0.09723039897449261, + "learning_rate": 1.9785252742351213e-06, + "loss": 0.8325, + "step": 10032 + }, + { + "epoch": 4.548050770625567, + "grad_norm": 0.08746890677234832, + "learning_rate": 1.974595212502188e-06, + "loss": 0.8519, + "step": 10033 + }, + { + "epoch": 4.548504079782411, + "grad_norm": 0.0942570750133531, + "learning_rate": 1.970668959125308e-06, + "loss": 0.846, + "step": 10034 + }, + { + "epoch": 4.5489573889392565, + "grad_norm": 0.08749974771803762, + "learning_rate": 1.9667465144976904e-06, + "loss": 0.8651, + "step": 10035 + }, + { + "epoch": 4.549410698096102, + "grad_norm": 0.08654193299348799, + "learning_rate": 1.96282787901219e-06, + "loss": 0.8727, + "step": 10036 + }, + { + "epoch": 4.549864007252946, + "grad_norm": 0.08609272126673374, + "learning_rate": 1.9589130530612623e-06, + "loss": 0.8656, + "step": 10037 + }, + { + "epoch": 4.550317316409791, + "grad_norm": 0.08822739912684695, + "learning_rate": 1.9550020370370014e-06, + "loss": 0.8379, + "step": 10038 + }, + { + "epoch": 4.550770625566637, + "grad_norm": 0.08045521843600137, + "learning_rate": 1.9510948313310906e-06, + "loss": 0.855, + "step": 10039 + }, + { + "epoch": 4.551223934723481, + "grad_norm": 0.08936604544437218, + "learning_rate": 1.947191436334861e-06, + "loss": 0.8739, + "step": 10040 + }, + { + "epoch": 4.551677243880326, + "grad_norm": 0.08191426043471312, + "learning_rate": 1.9432918524392397e-06, + "loss": 0.8379, + "step": 10041 + }, + { + "epoch": 4.552130553037172, + "grad_norm": 0.10046705940835544, + "learning_rate": 1.939396080034799e-06, + "loss": 0.848, + "step": 10042 + }, + { + "epoch": 4.552583862194016, + "grad_norm": 0.09197975275096781, + "learning_rate": 1.9355041195116843e-06, + "loss": 0.8426, + "step": 10043 + }, + { + "epoch": 4.553037171350861, + "grad_norm": 0.08649895844694015, + "learning_rate": 1.931615971259713e-06, + "loss": 0.8426, + "step": 10044 + }, + { + "epoch": 4.5534904805077066, + "grad_norm": 0.0811151696066973, + "learning_rate": 1.9277316356682707e-06, + "loss": 0.8479, + "step": 10045 + }, + { + "epoch": 4.553943789664551, + "grad_norm": 0.09160743040303453, + "learning_rate": 1.923851113126407e-06, + "loss": 0.8516, + "step": 10046 + }, + { + "epoch": 4.554397098821396, + "grad_norm": 0.09671221710650431, + "learning_rate": 1.9199744040227574e-06, + "loss": 0.8731, + "step": 10047 + }, + { + "epoch": 4.5548504079782415, + "grad_norm": 0.08623192626812172, + "learning_rate": 1.91610150874558e-06, + "loss": 0.8536, + "step": 10048 + }, + { + "epoch": 4.555303717135086, + "grad_norm": 0.0986984442199136, + "learning_rate": 1.9122324276827654e-06, + "loss": 0.849, + "step": 10049 + }, + { + "epoch": 4.555757026291931, + "grad_norm": 0.0987865478280844, + "learning_rate": 1.9083671612218113e-06, + "loss": 0.8348, + "step": 10050 + }, + { + "epoch": 4.556210335448776, + "grad_norm": 0.07597161001056688, + "learning_rate": 1.90450570974984e-06, + "loss": 0.8516, + "step": 10051 + }, + { + "epoch": 4.556663644605621, + "grad_norm": 0.08254336160866085, + "learning_rate": 1.9006480736535726e-06, + "loss": 0.8534, + "step": 10052 + }, + { + "epoch": 4.557116953762466, + "grad_norm": 0.08562130208300668, + "learning_rate": 1.8967942533193806e-06, + "loss": 0.8581, + "step": 10053 + }, + { + "epoch": 4.557570262919311, + "grad_norm": 0.09401949511519495, + "learning_rate": 1.8929442491332218e-06, + "loss": 0.8516, + "step": 10054 + }, + { + "epoch": 4.558023572076156, + "grad_norm": 0.08915024427272739, + "learning_rate": 1.8890980614806987e-06, + "loss": 0.8358, + "step": 10055 + }, + { + "epoch": 4.558476881233001, + "grad_norm": 0.08455998279956016, + "learning_rate": 1.885255690747001e-06, + "loss": 0.8417, + "step": 10056 + }, + { + "epoch": 4.558930190389846, + "grad_norm": 0.07884297759907953, + "learning_rate": 1.8814171373169721e-06, + "loss": 0.8645, + "step": 10057 + }, + { + "epoch": 4.559383499546691, + "grad_norm": 0.09181474996538126, + "learning_rate": 1.877582401575042e-06, + "loss": 0.8502, + "step": 10058 + }, + { + "epoch": 4.559836808703536, + "grad_norm": 0.09481354838487428, + "learning_rate": 1.8737514839052729e-06, + "loss": 0.8492, + "step": 10059 + }, + { + "epoch": 4.560290117860381, + "grad_norm": 0.09632425569048796, + "learning_rate": 1.8699243846913439e-06, + "loss": 0.8451, + "step": 10060 + }, + { + "epoch": 4.560743427017226, + "grad_norm": 0.08566952458605781, + "learning_rate": 1.8661011043165534e-06, + "loss": 0.8476, + "step": 10061 + }, + { + "epoch": 4.561196736174071, + "grad_norm": 0.09014172874216969, + "learning_rate": 1.8622816431638125e-06, + "loss": 0.8556, + "step": 10062 + }, + { + "epoch": 4.561650045330916, + "grad_norm": 0.09091758258950092, + "learning_rate": 1.8584660016156419e-06, + "loss": 0.8829, + "step": 10063 + }, + { + "epoch": 4.5621033544877605, + "grad_norm": 0.08250126649811937, + "learning_rate": 1.854654180054203e-06, + "loss": 0.8554, + "step": 10064 + }, + { + "epoch": 4.562556663644606, + "grad_norm": 0.08290545206589592, + "learning_rate": 1.8508461788612476e-06, + "loss": 0.85, + "step": 10065 + }, + { + "epoch": 4.563009972801451, + "grad_norm": 0.08758652329012856, + "learning_rate": 1.8470419984181731e-06, + "loss": 0.8595, + "step": 10066 + }, + { + "epoch": 4.563463281958295, + "grad_norm": 0.08670210337555638, + "learning_rate": 1.8432416391059638e-06, + "loss": 0.8437, + "step": 10067 + }, + { + "epoch": 4.563916591115141, + "grad_norm": 0.08965439341837557, + "learning_rate": 1.8394451013052528e-06, + "loss": 0.835, + "step": 10068 + }, + { + "epoch": 4.564369900271986, + "grad_norm": 0.0849515404269043, + "learning_rate": 1.8356523853962605e-06, + "loss": 0.8457, + "step": 10069 + }, + { + "epoch": 4.56482320942883, + "grad_norm": 0.09177284768935207, + "learning_rate": 1.8318634917588517e-06, + "loss": 0.8474, + "step": 10070 + }, + { + "epoch": 4.565276518585676, + "grad_norm": 0.0880819873486341, + "learning_rate": 1.8280784207724832e-06, + "loss": 0.8865, + "step": 10071 + }, + { + "epoch": 4.565729827742521, + "grad_norm": 0.08474404411627372, + "learning_rate": 1.8242971728162517e-06, + "loss": 0.8723, + "step": 10072 + }, + { + "epoch": 4.566183136899365, + "grad_norm": 0.09073404449517328, + "learning_rate": 1.8205197482688498e-06, + "loss": 0.8456, + "step": 10073 + }, + { + "epoch": 4.5666364460562106, + "grad_norm": 0.08358540785382085, + "learning_rate": 1.8167461475086013e-06, + "loss": 0.8528, + "step": 10074 + }, + { + "epoch": 4.567089755213055, + "grad_norm": 0.09108436083524668, + "learning_rate": 1.8129763709134485e-06, + "loss": 0.865, + "step": 10075 + }, + { + "epoch": 4.5675430643699, + "grad_norm": 0.09487157429627974, + "learning_rate": 1.8092104188609383e-06, + "loss": 0.8566, + "step": 10076 + }, + { + "epoch": 4.5679963735267455, + "grad_norm": 0.0896281896892985, + "learning_rate": 1.805448291728249e-06, + "loss": 0.8684, + "step": 10077 + }, + { + "epoch": 4.56844968268359, + "grad_norm": 0.0805207056496783, + "learning_rate": 1.8016899898921635e-06, + "loss": 0.8689, + "step": 10078 + }, + { + "epoch": 4.568902991840435, + "grad_norm": 0.08078270999477344, + "learning_rate": 1.7979355137290878e-06, + "loss": 0.8567, + "step": 10079 + }, + { + "epoch": 4.56935630099728, + "grad_norm": 0.08404218474833304, + "learning_rate": 1.7941848636150494e-06, + "loss": 0.8461, + "step": 10080 + }, + { + "epoch": 4.569809610154125, + "grad_norm": 0.08680068599145771, + "learning_rate": 1.790438039925677e-06, + "loss": 0.8711, + "step": 10081 + }, + { + "epoch": 4.57026291931097, + "grad_norm": 0.0849247108642712, + "learning_rate": 1.7866950430362262e-06, + "loss": 0.8571, + "step": 10082 + }, + { + "epoch": 4.570716228467815, + "grad_norm": 0.0911398503411376, + "learning_rate": 1.7829558733215835e-06, + "loss": 0.8468, + "step": 10083 + }, + { + "epoch": 4.57116953762466, + "grad_norm": 0.08068895171265865, + "learning_rate": 1.7792205311562184e-06, + "loss": 0.8389, + "step": 10084 + }, + { + "epoch": 4.571622846781505, + "grad_norm": 0.08775265432596205, + "learning_rate": 1.7754890169142492e-06, + "loss": 0.8475, + "step": 10085 + }, + { + "epoch": 4.57207615593835, + "grad_norm": 0.08511024262202724, + "learning_rate": 1.771761330969395e-06, + "loss": 0.8465, + "step": 10086 + }, + { + "epoch": 4.572529465095195, + "grad_norm": 0.16991907997346517, + "learning_rate": 1.7680374736949968e-06, + "loss": 0.842, + "step": 10087 + }, + { + "epoch": 4.57298277425204, + "grad_norm": 0.0865733527388611, + "learning_rate": 1.7643174454640056e-06, + "loss": 0.8497, + "step": 10088 + }, + { + "epoch": 4.573436083408885, + "grad_norm": 0.08493591274617879, + "learning_rate": 1.760601246648994e-06, + "loss": 0.8368, + "step": 10089 + }, + { + "epoch": 4.57388939256573, + "grad_norm": 0.09432449467838346, + "learning_rate": 1.7568888776221538e-06, + "loss": 0.8554, + "step": 10090 + }, + { + "epoch": 4.574342701722575, + "grad_norm": 0.08771871175188142, + "learning_rate": 1.7531803387552805e-06, + "loss": 0.8423, + "step": 10091 + }, + { + "epoch": 4.57479601087942, + "grad_norm": 0.08434088989473915, + "learning_rate": 1.7494756304198057e-06, + "loss": 0.8641, + "step": 10092 + }, + { + "epoch": 4.5752493200362645, + "grad_norm": 0.08373335018987697, + "learning_rate": 1.7457747529867618e-06, + "loss": 0.8601, + "step": 10093 + }, + { + "epoch": 4.57570262919311, + "grad_norm": 0.08794850383075574, + "learning_rate": 1.7420777068268035e-06, + "loss": 0.8781, + "step": 10094 + }, + { + "epoch": 4.576155938349955, + "grad_norm": 0.08368563664395856, + "learning_rate": 1.738384492310199e-06, + "loss": 0.8599, + "step": 10095 + }, + { + "epoch": 4.576609247506799, + "grad_norm": 0.07894593581179006, + "learning_rate": 1.7346951098068432e-06, + "loss": 0.8659, + "step": 10096 + }, + { + "epoch": 4.577062556663645, + "grad_norm": 0.08783714545670218, + "learning_rate": 1.7310095596862232e-06, + "loss": 0.8502, + "step": 10097 + }, + { + "epoch": 4.577515865820489, + "grad_norm": 0.0976900677073141, + "learning_rate": 1.7273278423174745e-06, + "loss": 0.8455, + "step": 10098 + }, + { + "epoch": 4.577969174977334, + "grad_norm": 0.086406729896526, + "learning_rate": 1.72364995806932e-06, + "loss": 0.8557, + "step": 10099 + }, + { + "epoch": 4.57842248413418, + "grad_norm": 0.08134342011742009, + "learning_rate": 1.7199759073101185e-06, + "loss": 0.8598, + "step": 10100 + }, + { + "epoch": 4.578875793291024, + "grad_norm": 0.09055474095891457, + "learning_rate": 1.7163056904078246e-06, + "loss": 0.8751, + "step": 10101 + }, + { + "epoch": 4.579329102447869, + "grad_norm": 0.08960067633896408, + "learning_rate": 1.7126393077300375e-06, + "loss": 0.8427, + "step": 10102 + }, + { + "epoch": 4.5797824116047146, + "grad_norm": 0.08864174208363182, + "learning_rate": 1.7089767596439432e-06, + "loss": 0.8423, + "step": 10103 + }, + { + "epoch": 4.580235720761559, + "grad_norm": 0.09371150738067711, + "learning_rate": 1.70531804651636e-06, + "loss": 0.86, + "step": 10104 + }, + { + "epoch": 4.580689029918404, + "grad_norm": 0.09923982126415586, + "learning_rate": 1.7016631687137275e-06, + "loss": 0.8388, + "step": 10105 + }, + { + "epoch": 4.5811423390752495, + "grad_norm": 0.08314817414545482, + "learning_rate": 1.6980121266020776e-06, + "loss": 0.8685, + "step": 10106 + }, + { + "epoch": 4.581595648232094, + "grad_norm": 0.08252084303937449, + "learning_rate": 1.694364920547087e-06, + "loss": 0.8637, + "step": 10107 + }, + { + "epoch": 4.582048957388939, + "grad_norm": 0.09293267728311086, + "learning_rate": 1.6907215509140184e-06, + "loss": 0.8346, + "step": 10108 + }, + { + "epoch": 4.582502266545784, + "grad_norm": 0.08962752502203557, + "learning_rate": 1.687082018067785e-06, + "loss": 0.8423, + "step": 10109 + }, + { + "epoch": 4.582955575702629, + "grad_norm": 0.08069570775579914, + "learning_rate": 1.6834463223728814e-06, + "loss": 0.8498, + "step": 10110 + }, + { + "epoch": 4.583408884859474, + "grad_norm": 0.08588481616825189, + "learning_rate": 1.6798144641934432e-06, + "loss": 0.8579, + "step": 10111 + }, + { + "epoch": 4.583862194016319, + "grad_norm": 0.08723838507735372, + "learning_rate": 1.6761864438932017e-06, + "loss": 0.8556, + "step": 10112 + }, + { + "epoch": 4.584315503173164, + "grad_norm": 0.08709585809332979, + "learning_rate": 1.6725622618355286e-06, + "loss": 0.8619, + "step": 10113 + }, + { + "epoch": 4.584768812330009, + "grad_norm": 0.08625549600891659, + "learning_rate": 1.6689419183833822e-06, + "loss": 0.857, + "step": 10114 + }, + { + "epoch": 4.585222121486854, + "grad_norm": 0.07928119922505866, + "learning_rate": 1.6653254138993524e-06, + "loss": 0.8468, + "step": 10115 + }, + { + "epoch": 4.585675430643699, + "grad_norm": 0.08037077851899313, + "learning_rate": 1.661712748745652e-06, + "loss": 0.858, + "step": 10116 + }, + { + "epoch": 4.586128739800544, + "grad_norm": 0.0837954529469153, + "learning_rate": 1.6581039232840845e-06, + "loss": 0.852, + "step": 10117 + }, + { + "epoch": 4.586582048957389, + "grad_norm": 0.07981697526189449, + "learning_rate": 1.6544989378761078e-06, + "loss": 0.836, + "step": 10118 + }, + { + "epoch": 4.587035358114234, + "grad_norm": 0.08253990572974215, + "learning_rate": 1.6508977928827485e-06, + "loss": 0.871, + "step": 10119 + }, + { + "epoch": 4.587488667271079, + "grad_norm": 0.16650704870293173, + "learning_rate": 1.6473004886646826e-06, + "loss": 0.8891, + "step": 10120 + }, + { + "epoch": 4.587941976427924, + "grad_norm": 0.09382258710736553, + "learning_rate": 1.6437070255821863e-06, + "loss": 0.8628, + "step": 10121 + }, + { + "epoch": 4.5883952855847685, + "grad_norm": 0.09057227807446606, + "learning_rate": 1.6401174039951673e-06, + "loss": 0.861, + "step": 10122 + }, + { + "epoch": 4.588848594741614, + "grad_norm": 0.08727470198047381, + "learning_rate": 1.6365316242631158e-06, + "loss": 0.8416, + "step": 10123 + }, + { + "epoch": 4.589301903898459, + "grad_norm": 0.10115273962782441, + "learning_rate": 1.6329496867451799e-06, + "loss": 0.8583, + "step": 10124 + }, + { + "epoch": 4.589755213055303, + "grad_norm": 0.13619021960920705, + "learning_rate": 1.6293715918000863e-06, + "loss": 0.8652, + "step": 10125 + }, + { + "epoch": 4.590208522212149, + "grad_norm": 0.08488891751173376, + "learning_rate": 1.6257973397862015e-06, + "loss": 0.8676, + "step": 10126 + }, + { + "epoch": 4.590661831368994, + "grad_norm": 0.08785483767797686, + "learning_rate": 1.622226931061488e-06, + "loss": 0.8528, + "step": 10127 + }, + { + "epoch": 4.591115140525838, + "grad_norm": 0.09143006587668524, + "learning_rate": 1.61866036598354e-06, + "loss": 0.8551, + "step": 10128 + }, + { + "epoch": 4.591568449682684, + "grad_norm": 0.08863896594443875, + "learning_rate": 1.6150976449095602e-06, + "loss": 0.8607, + "step": 10129 + }, + { + "epoch": 4.592021758839529, + "grad_norm": 0.08837053328255794, + "learning_rate": 1.6115387681963568e-06, + "loss": 0.8432, + "step": 10130 + }, + { + "epoch": 4.592475067996373, + "grad_norm": 0.08222512663126724, + "learning_rate": 1.6079837362003692e-06, + "loss": 0.871, + "step": 10131 + }, + { + "epoch": 4.5929283771532186, + "grad_norm": 0.10021980374667976, + "learning_rate": 1.6044325492776414e-06, + "loss": 0.8597, + "step": 10132 + }, + { + "epoch": 4.593381686310064, + "grad_norm": 0.08631536707418888, + "learning_rate": 1.6008852077838399e-06, + "loss": 0.8427, + "step": 10133 + }, + { + "epoch": 4.593834995466908, + "grad_norm": 0.08986238623325982, + "learning_rate": 1.597341712074232e-06, + "loss": 0.8644, + "step": 10134 + }, + { + "epoch": 4.5942883046237535, + "grad_norm": 0.08343185216139067, + "learning_rate": 1.5938020625037199e-06, + "loss": 0.8433, + "step": 10135 + }, + { + "epoch": 4.594741613780599, + "grad_norm": 0.0800531218459978, + "learning_rate": 1.5902662594268025e-06, + "loss": 0.8374, + "step": 10136 + }, + { + "epoch": 4.595194922937443, + "grad_norm": 0.08088676928654276, + "learning_rate": 1.5867343031976056e-06, + "loss": 0.8515, + "step": 10137 + }, + { + "epoch": 4.595648232094288, + "grad_norm": 0.08964816472144767, + "learning_rate": 1.583206194169864e-06, + "loss": 0.8567, + "step": 10138 + }, + { + "epoch": 4.596101541251134, + "grad_norm": 0.09513886294366695, + "learning_rate": 1.579681932696926e-06, + "loss": 0.8573, + "step": 10139 + }, + { + "epoch": 4.596554850407978, + "grad_norm": 0.09336993567957788, + "learning_rate": 1.5761615191317535e-06, + "loss": 0.8582, + "step": 10140 + }, + { + "epoch": 4.597008159564823, + "grad_norm": 0.07908583759358426, + "learning_rate": 1.5726449538269361e-06, + "loss": 0.8554, + "step": 10141 + }, + { + "epoch": 4.597461468721669, + "grad_norm": 0.08232588773787851, + "learning_rate": 1.569132237134654e-06, + "loss": 0.8474, + "step": 10142 + }, + { + "epoch": 4.597914777878513, + "grad_norm": 0.08539993728002826, + "learning_rate": 1.5656233694067324e-06, + "loss": 0.8537, + "step": 10143 + }, + { + "epoch": 4.598368087035358, + "grad_norm": 0.08664278306774448, + "learning_rate": 1.5621183509945881e-06, + "loss": 0.8441, + "step": 10144 + }, + { + "epoch": 4.5988213961922035, + "grad_norm": 0.08087459996033926, + "learning_rate": 1.5586171822492514e-06, + "loss": 0.8536, + "step": 10145 + }, + { + "epoch": 4.599274705349048, + "grad_norm": 0.08851788747425218, + "learning_rate": 1.5551198635213838e-06, + "loss": 0.8445, + "step": 10146 + }, + { + "epoch": 4.599728014505893, + "grad_norm": 0.10297628608157389, + "learning_rate": 1.5516263951612476e-06, + "loss": 0.8652, + "step": 10147 + }, + { + "epoch": 4.6001813236627385, + "grad_norm": 0.08869810213845455, + "learning_rate": 1.5481367775187316e-06, + "loss": 0.8654, + "step": 10148 + }, + { + "epoch": 4.600634632819583, + "grad_norm": 0.08502556436242542, + "learning_rate": 1.5446510109433166e-06, + "loss": 0.8564, + "step": 10149 + }, + { + "epoch": 4.601087941976428, + "grad_norm": 0.08263295475507819, + "learning_rate": 1.5411690957841274e-06, + "loss": 0.8723, + "step": 10150 + }, + { + "epoch": 4.601541251133273, + "grad_norm": 0.08070726384899608, + "learning_rate": 1.5376910323898763e-06, + "loss": 0.8563, + "step": 10151 + }, + { + "epoch": 4.601994560290118, + "grad_norm": 0.08545237146160854, + "learning_rate": 1.5342168211089115e-06, + "loss": 0.8514, + "step": 10152 + }, + { + "epoch": 4.602447869446963, + "grad_norm": 0.09199236240200255, + "learning_rate": 1.5307464622891766e-06, + "loss": 0.8399, + "step": 10153 + }, + { + "epoch": 4.602901178603807, + "grad_norm": 0.08195092597209687, + "learning_rate": 1.5272799562782515e-06, + "loss": 0.855, + "step": 10154 + }, + { + "epoch": 4.603354487760653, + "grad_norm": 0.08106032907885387, + "learning_rate": 1.5238173034233027e-06, + "loss": 0.8487, + "step": 10155 + }, + { + "epoch": 4.603807796917498, + "grad_norm": 0.0887777504501776, + "learning_rate": 1.5203585040711289e-06, + "loss": 0.8622, + "step": 10156 + }, + { + "epoch": 4.604261106074342, + "grad_norm": 0.09418860367853135, + "learning_rate": 1.5169035585681412e-06, + "loss": 0.8684, + "step": 10157 + }, + { + "epoch": 4.604714415231188, + "grad_norm": 0.08395346350774018, + "learning_rate": 1.5134524672603657e-06, + "loss": 0.8502, + "step": 10158 + }, + { + "epoch": 4.605167724388033, + "grad_norm": 0.08034364660655895, + "learning_rate": 1.5100052304934277e-06, + "loss": 0.8634, + "step": 10159 + }, + { + "epoch": 4.605621033544877, + "grad_norm": 0.08830218909001233, + "learning_rate": 1.5065618486125933e-06, + "loss": 0.8446, + "step": 10160 + }, + { + "epoch": 4.606074342701723, + "grad_norm": 0.08581780632367071, + "learning_rate": 1.50312232196272e-06, + "loss": 0.8417, + "step": 10161 + }, + { + "epoch": 4.606527651858568, + "grad_norm": 0.08266443695663443, + "learning_rate": 1.499686650888279e-06, + "loss": 0.8573, + "step": 10162 + }, + { + "epoch": 4.606980961015412, + "grad_norm": 0.09144709464677653, + "learning_rate": 1.4962548357333774e-06, + "loss": 0.8692, + "step": 10163 + }, + { + "epoch": 4.6074342701722575, + "grad_norm": 0.08151706915226163, + "learning_rate": 1.4928268768417087e-06, + "loss": 0.8699, + "step": 10164 + }, + { + "epoch": 4.607887579329103, + "grad_norm": 0.08346174879283105, + "learning_rate": 1.4894027745566031e-06, + "loss": 0.8527, + "step": 10165 + }, + { + "epoch": 4.608340888485947, + "grad_norm": 0.08458220605429377, + "learning_rate": 1.4859825292209861e-06, + "loss": 0.8551, + "step": 10166 + }, + { + "epoch": 4.608794197642792, + "grad_norm": 0.07980752440597422, + "learning_rate": 1.4825661411774105e-06, + "loss": 0.8498, + "step": 10167 + }, + { + "epoch": 4.609247506799638, + "grad_norm": 0.07800363559019871, + "learning_rate": 1.4791536107680383e-06, + "loss": 0.8306, + "step": 10168 + }, + { + "epoch": 4.609700815956482, + "grad_norm": 0.0905519732834813, + "learning_rate": 1.4757449383346401e-06, + "loss": 0.8534, + "step": 10169 + }, + { + "epoch": 4.610154125113327, + "grad_norm": 0.08212595016420685, + "learning_rate": 1.4723401242186143e-06, + "loss": 0.8498, + "step": 10170 + }, + { + "epoch": 4.610607434270173, + "grad_norm": 0.0968354399954973, + "learning_rate": 1.4689391687609456e-06, + "loss": 0.8801, + "step": 10171 + }, + { + "epoch": 4.611060743427017, + "grad_norm": 0.087611438940547, + "learning_rate": 1.4655420723022685e-06, + "loss": 0.8605, + "step": 10172 + }, + { + "epoch": 4.611514052583862, + "grad_norm": 0.08168145242564682, + "learning_rate": 1.462148835182795e-06, + "loss": 0.862, + "step": 10173 + }, + { + "epoch": 4.6119673617407075, + "grad_norm": 0.08386673410581318, + "learning_rate": 1.4587594577423826e-06, + "loss": 0.8588, + "step": 10174 + }, + { + "epoch": 4.612420670897552, + "grad_norm": 0.077573066379191, + "learning_rate": 1.4553739403204791e-06, + "loss": 0.8602, + "step": 10175 + }, + { + "epoch": 4.612873980054397, + "grad_norm": 0.08141711237522148, + "learning_rate": 1.451992283256165e-06, + "loss": 0.8705, + "step": 10176 + }, + { + "epoch": 4.613327289211242, + "grad_norm": 0.07732765351464824, + "learning_rate": 1.4486144868881024e-06, + "loss": 0.8492, + "step": 10177 + }, + { + "epoch": 4.613780598368087, + "grad_norm": 0.08537843127040168, + "learning_rate": 1.445240551554603e-06, + "loss": 0.8482, + "step": 10178 + }, + { + "epoch": 4.614233907524932, + "grad_norm": 0.08392581438968359, + "learning_rate": 1.4418704775935698e-06, + "loss": 0.867, + "step": 10179 + }, + { + "epoch": 4.6146872166817765, + "grad_norm": 0.08241949649494777, + "learning_rate": 1.438504265342533e-06, + "loss": 0.8467, + "step": 10180 + }, + { + "epoch": 4.615140525838622, + "grad_norm": 0.08305159170139247, + "learning_rate": 1.4351419151386181e-06, + "loss": 0.8509, + "step": 10181 + }, + { + "epoch": 4.615593834995467, + "grad_norm": 0.086917350158416, + "learning_rate": 1.431783427318587e-06, + "loss": 0.8554, + "step": 10182 + }, + { + "epoch": 4.616047144152311, + "grad_norm": 0.08142217555927868, + "learning_rate": 1.428428802218793e-06, + "loss": 0.8721, + "step": 10183 + }, + { + "epoch": 4.616500453309157, + "grad_norm": 0.08544635403017638, + "learning_rate": 1.4250780401752163e-06, + "loss": 0.8571, + "step": 10184 + }, + { + "epoch": 4.616953762466002, + "grad_norm": 0.08619153292109362, + "learning_rate": 1.421731141523437e-06, + "loss": 0.8719, + "step": 10185 + }, + { + "epoch": 4.617407071622846, + "grad_norm": 0.0838216050079352, + "learning_rate": 1.4183881065986716e-06, + "loss": 0.8486, + "step": 10186 + }, + { + "epoch": 4.617860380779692, + "grad_norm": 0.0836652145125662, + "learning_rate": 1.4150489357357232e-06, + "loss": 0.8535, + "step": 10187 + }, + { + "epoch": 4.618313689936537, + "grad_norm": 0.0883383338373107, + "learning_rate": 1.4117136292690225e-06, + "loss": 0.8656, + "step": 10188 + }, + { + "epoch": 4.618766999093381, + "grad_norm": 0.08096257986165727, + "learning_rate": 1.408382187532613e-06, + "loss": 0.8424, + "step": 10189 + }, + { + "epoch": 4.619220308250227, + "grad_norm": 0.08738304097868256, + "learning_rate": 1.4050546108601393e-06, + "loss": 0.8605, + "step": 10190 + }, + { + "epoch": 4.619673617407072, + "grad_norm": 0.08106371695053786, + "learning_rate": 1.4017308995848811e-06, + "loss": 0.842, + "step": 10191 + }, + { + "epoch": 4.620126926563916, + "grad_norm": 0.08290911657435057, + "learning_rate": 1.39841105403971e-06, + "loss": 0.8546, + "step": 10192 + }, + { + "epoch": 4.6205802357207615, + "grad_norm": 0.08224514214183769, + "learning_rate": 1.3950950745571202e-06, + "loss": 0.8412, + "step": 10193 + }, + { + "epoch": 4.621033544877607, + "grad_norm": 0.08785460568663624, + "learning_rate": 1.3917829614692102e-06, + "loss": 0.8598, + "step": 10194 + }, + { + "epoch": 4.621486854034451, + "grad_norm": 0.08691394722645641, + "learning_rate": 1.3884747151077193e-06, + "loss": 0.8568, + "step": 10195 + }, + { + "epoch": 4.621940163191296, + "grad_norm": 0.09073682503418257, + "learning_rate": 1.3851703358039515e-06, + "loss": 0.8506, + "step": 10196 + }, + { + "epoch": 4.622393472348142, + "grad_norm": 0.09102296604466163, + "learning_rate": 1.3818698238888638e-06, + "loss": 0.8711, + "step": 10197 + }, + { + "epoch": 4.622846781504986, + "grad_norm": 0.08609006182825715, + "learning_rate": 1.3785731796930058e-06, + "loss": 0.8572, + "step": 10198 + }, + { + "epoch": 4.623300090661831, + "grad_norm": 0.08179966745174759, + "learning_rate": 1.3752804035465573e-06, + "loss": 0.8524, + "step": 10199 + }, + { + "epoch": 4.623753399818677, + "grad_norm": 0.08132688069403068, + "learning_rate": 1.3719914957792857e-06, + "loss": 0.839, + "step": 10200 + }, + { + "epoch": 4.624206708975521, + "grad_norm": 0.08738395991123449, + "learning_rate": 1.3687064567205943e-06, + "loss": 0.8518, + "step": 10201 + }, + { + "epoch": 4.624660018132366, + "grad_norm": 0.07921385066877787, + "learning_rate": 1.365425286699491e-06, + "loss": 0.8497, + "step": 10202 + }, + { + "epoch": 4.6251133272892115, + "grad_norm": 0.07992884446481473, + "learning_rate": 1.3621479860445796e-06, + "loss": 0.8592, + "step": 10203 + }, + { + "epoch": 4.625566636446056, + "grad_norm": 0.08720265290974383, + "learning_rate": 1.358874555084113e-06, + "loss": 0.8425, + "step": 10204 + }, + { + "epoch": 4.626019945602901, + "grad_norm": 0.0840894496459717, + "learning_rate": 1.3556049941459138e-06, + "loss": 0.8478, + "step": 10205 + }, + { + "epoch": 4.6264732547597465, + "grad_norm": 0.08739064694104758, + "learning_rate": 1.3523393035574573e-06, + "loss": 0.8431, + "step": 10206 + }, + { + "epoch": 4.626926563916591, + "grad_norm": 0.08193686696049078, + "learning_rate": 1.3490774836457977e-06, + "loss": 0.8432, + "step": 10207 + }, + { + "epoch": 4.627379873073436, + "grad_norm": 0.43625266955677194, + "learning_rate": 1.3458195347376202e-06, + "loss": 0.8409, + "step": 10208 + }, + { + "epoch": 4.627833182230281, + "grad_norm": 0.08329803294979264, + "learning_rate": 1.3425654571592196e-06, + "loss": 0.853, + "step": 10209 + }, + { + "epoch": 4.628286491387126, + "grad_norm": 0.07921872414221334, + "learning_rate": 1.3393152512365037e-06, + "loss": 0.8377, + "step": 10210 + }, + { + "epoch": 4.628739800543971, + "grad_norm": 0.08375447197563149, + "learning_rate": 1.3360689172949814e-06, + "loss": 0.8543, + "step": 10211 + }, + { + "epoch": 4.629193109700816, + "grad_norm": 0.08902933382291336, + "learning_rate": 1.3328264556597969e-06, + "loss": 0.8356, + "step": 10212 + }, + { + "epoch": 4.629646418857661, + "grad_norm": 0.09855572760926141, + "learning_rate": 1.3295878666556816e-06, + "loss": 0.8807, + "step": 10213 + }, + { + "epoch": 4.630099728014506, + "grad_norm": 0.09311249527101044, + "learning_rate": 1.3263531506069893e-06, + "loss": 0.8404, + "step": 10214 + }, + { + "epoch": 4.630553037171351, + "grad_norm": 0.08928518603513245, + "learning_rate": 1.3231223078376876e-06, + "loss": 0.8867, + "step": 10215 + }, + { + "epoch": 4.631006346328196, + "grad_norm": 0.08330135935910606, + "learning_rate": 1.3198953386713576e-06, + "loss": 0.879, + "step": 10216 + }, + { + "epoch": 4.631459655485041, + "grad_norm": 0.10309709250964556, + "learning_rate": 1.3166722434311896e-06, + "loss": 0.8796, + "step": 10217 + }, + { + "epoch": 4.631912964641886, + "grad_norm": 0.08723878183028032, + "learning_rate": 1.3134530224399788e-06, + "loss": 0.8405, + "step": 10218 + }, + { + "epoch": 4.632366273798731, + "grad_norm": 0.09334395755053922, + "learning_rate": 1.3102376760201518e-06, + "loss": 0.8419, + "step": 10219 + }, + { + "epoch": 4.632819582955576, + "grad_norm": 0.08419453116713094, + "learning_rate": 1.307026204493722e-06, + "loss": 0.8624, + "step": 10220 + }, + { + "epoch": 4.633272892112421, + "grad_norm": 0.084389703523477, + "learning_rate": 1.303818608182339e-06, + "loss": 0.8496, + "step": 10221 + }, + { + "epoch": 4.6337262012692655, + "grad_norm": 0.08045157404617335, + "learning_rate": 1.3006148874072477e-06, + "loss": 0.8431, + "step": 10222 + }, + { + "epoch": 4.634179510426111, + "grad_norm": 0.08492911369993257, + "learning_rate": 1.2974150424893117e-06, + "loss": 0.8623, + "step": 10223 + }, + { + "epoch": 4.634632819582956, + "grad_norm": 0.08072136224078866, + "learning_rate": 1.294219073748999e-06, + "loss": 0.8478, + "step": 10224 + }, + { + "epoch": 4.6350861287398, + "grad_norm": 0.0807305148706993, + "learning_rate": 1.2910269815064047e-06, + "loss": 0.8336, + "step": 10225 + }, + { + "epoch": 4.635539437896646, + "grad_norm": 0.08182469784674984, + "learning_rate": 1.2878387660812197e-06, + "loss": 0.8414, + "step": 10226 + }, + { + "epoch": 4.635992747053491, + "grad_norm": 0.08900804865765982, + "learning_rate": 1.2846544277927575e-06, + "loss": 0.8725, + "step": 10227 + }, + { + "epoch": 4.636446056210335, + "grad_norm": 0.08863144521141249, + "learning_rate": 1.2814739669599407e-06, + "loss": 0.8423, + "step": 10228 + }, + { + "epoch": 4.636899365367181, + "grad_norm": 0.0816133159397302, + "learning_rate": 1.278297383901288e-06, + "loss": 0.8432, + "step": 10229 + }, + { + "epoch": 4.637352674524025, + "grad_norm": 0.08061435891744646, + "learning_rate": 1.2751246789349624e-06, + "loss": 0.8464, + "step": 10230 + }, + { + "epoch": 4.63780598368087, + "grad_norm": 0.09026993285419208, + "learning_rate": 1.2719558523787012e-06, + "loss": 0.8346, + "step": 10231 + }, + { + "epoch": 4.6382592928377155, + "grad_norm": 0.07823426972469903, + "learning_rate": 1.2687909045498903e-06, + "loss": 0.8516, + "step": 10232 + }, + { + "epoch": 4.63871260199456, + "grad_norm": 0.07976643520740555, + "learning_rate": 1.2656298357654939e-06, + "loss": 0.8689, + "step": 10233 + }, + { + "epoch": 4.639165911151405, + "grad_norm": 0.08762724184918971, + "learning_rate": 1.2624726463421122e-06, + "loss": 0.8415, + "step": 10234 + }, + { + "epoch": 4.6396192203082505, + "grad_norm": 0.08601800782931537, + "learning_rate": 1.2593193365959366e-06, + "loss": 0.8708, + "step": 10235 + }, + { + "epoch": 4.640072529465095, + "grad_norm": 0.08774483912042835, + "learning_rate": 1.2561699068427902e-06, + "loss": 0.8711, + "step": 10236 + }, + { + "epoch": 4.64052583862194, + "grad_norm": 0.08255004535358199, + "learning_rate": 1.2530243573980916e-06, + "loss": 0.8521, + "step": 10237 + }, + { + "epoch": 4.640979147778785, + "grad_norm": 0.0811276853474464, + "learning_rate": 1.2498826885768821e-06, + "loss": 0.837, + "step": 10238 + }, + { + "epoch": 4.64143245693563, + "grad_norm": 0.08433216716340668, + "learning_rate": 1.2467449006937993e-06, + "loss": 0.8509, + "step": 10239 + }, + { + "epoch": 4.641885766092475, + "grad_norm": 0.08030387089161442, + "learning_rate": 1.2436109940631157e-06, + "loss": 0.867, + "step": 10240 + }, + { + "epoch": 4.64233907524932, + "grad_norm": 0.08361523104960206, + "learning_rate": 1.2404809689986874e-06, + "loss": 0.8527, + "step": 10241 + }, + { + "epoch": 4.642792384406165, + "grad_norm": 0.08021052375059869, + "learning_rate": 1.2373548258140056e-06, + "loss": 0.8503, + "step": 10242 + }, + { + "epoch": 4.64324569356301, + "grad_norm": 0.07997501898236963, + "learning_rate": 1.234232564822162e-06, + "loss": 0.8552, + "step": 10243 + }, + { + "epoch": 4.643699002719855, + "grad_norm": 0.09441314915190116, + "learning_rate": 1.2311141863358534e-06, + "loss": 0.8637, + "step": 10244 + }, + { + "epoch": 4.6441523118767, + "grad_norm": 0.08438666285250385, + "learning_rate": 1.2279996906673985e-06, + "loss": 0.8472, + "step": 10245 + }, + { + "epoch": 4.644605621033545, + "grad_norm": 0.08161321318792106, + "learning_rate": 1.2248890781287216e-06, + "loss": 0.8644, + "step": 10246 + }, + { + "epoch": 4.64505893019039, + "grad_norm": 0.0804762556344347, + "learning_rate": 1.2217823490313685e-06, + "loss": 0.8648, + "step": 10247 + }, + { + "epoch": 4.645512239347235, + "grad_norm": 0.08705647578662999, + "learning_rate": 1.218679503686473e-06, + "loss": 0.8552, + "step": 10248 + }, + { + "epoch": 4.64596554850408, + "grad_norm": 0.08102731304107434, + "learning_rate": 1.2155805424048085e-06, + "loss": 0.8659, + "step": 10249 + }, + { + "epoch": 4.646418857660925, + "grad_norm": 0.08621473980959787, + "learning_rate": 1.212485465496731e-06, + "loss": 0.8529, + "step": 10250 + }, + { + "epoch": 4.6468721668177695, + "grad_norm": 0.08035771754127566, + "learning_rate": 1.209394273272233e-06, + "loss": 0.8259, + "step": 10251 + }, + { + "epoch": 4.647325475974615, + "grad_norm": 0.08139225420847718, + "learning_rate": 1.2063069660408978e-06, + "loss": 0.8513, + "step": 10252 + }, + { + "epoch": 4.64777878513146, + "grad_norm": 0.09565437538719479, + "learning_rate": 1.2032235441119399e-06, + "loss": 0.8718, + "step": 10253 + }, + { + "epoch": 4.648232094288304, + "grad_norm": 0.0805245505096966, + "learning_rate": 1.2001440077941618e-06, + "loss": 0.8568, + "step": 10254 + }, + { + "epoch": 4.64868540344515, + "grad_norm": 0.08238639346356212, + "learning_rate": 1.1970683573959917e-06, + "loss": 0.85, + "step": 10255 + }, + { + "epoch": 4.649138712601994, + "grad_norm": 0.10497588946310893, + "learning_rate": 1.1939965932254638e-06, + "loss": 0.8782, + "step": 10256 + }, + { + "epoch": 4.649592021758839, + "grad_norm": 0.09353816793736226, + "learning_rate": 1.1909287155902295e-06, + "loss": 0.8565, + "step": 10257 + }, + { + "epoch": 4.650045330915685, + "grad_norm": 0.08895079652944589, + "learning_rate": 1.1878647247975406e-06, + "loss": 0.8548, + "step": 10258 + }, + { + "epoch": 4.650498640072529, + "grad_norm": 0.08379417328029756, + "learning_rate": 1.184804621154263e-06, + "loss": 0.8492, + "step": 10259 + }, + { + "epoch": 4.650951949229374, + "grad_norm": 0.10428936506276955, + "learning_rate": 1.1817484049668804e-06, + "loss": 0.8318, + "step": 10260 + }, + { + "epoch": 4.6514052583862195, + "grad_norm": 0.08466151348197709, + "learning_rate": 1.1786960765414768e-06, + "loss": 0.848, + "step": 10261 + }, + { + "epoch": 4.651858567543064, + "grad_norm": 0.07812052474808723, + "learning_rate": 1.1756476361837632e-06, + "loss": 0.8361, + "step": 10262 + }, + { + "epoch": 4.652311876699909, + "grad_norm": 0.08241980530222418, + "learning_rate": 1.1726030841990332e-06, + "loss": 0.8501, + "step": 10263 + }, + { + "epoch": 4.6527651858567545, + "grad_norm": 0.09527513924536729, + "learning_rate": 1.1695624208922207e-06, + "loss": 0.8423, + "step": 10264 + }, + { + "epoch": 4.653218495013599, + "grad_norm": 0.09324580837494272, + "learning_rate": 1.1665256465678465e-06, + "loss": 0.8616, + "step": 10265 + }, + { + "epoch": 4.653671804170444, + "grad_norm": 0.08515676444305936, + "learning_rate": 1.163492761530063e-06, + "loss": 0.8607, + "step": 10266 + }, + { + "epoch": 4.654125113327289, + "grad_norm": 0.08235281776592451, + "learning_rate": 1.160463766082618e-06, + "loss": 0.863, + "step": 10267 + }, + { + "epoch": 4.654578422484134, + "grad_norm": 0.08597626642659562, + "learning_rate": 1.1574386605288734e-06, + "loss": 0.8534, + "step": 10268 + }, + { + "epoch": 4.655031731640979, + "grad_norm": 0.08768399886473213, + "learning_rate": 1.154417445171805e-06, + "loss": 0.842, + "step": 10269 + }, + { + "epoch": 4.655485040797824, + "grad_norm": 0.08074265977706853, + "learning_rate": 1.151400120313988e-06, + "loss": 0.8522, + "step": 10270 + }, + { + "epoch": 4.655938349954669, + "grad_norm": 0.08491216766583784, + "learning_rate": 1.1483866862576298e-06, + "loss": 0.8618, + "step": 10271 + }, + { + "epoch": 4.656391659111514, + "grad_norm": 0.0749059905723375, + "learning_rate": 1.1453771433045292e-06, + "loss": 0.8441, + "step": 10272 + }, + { + "epoch": 4.656844968268359, + "grad_norm": 0.0763796867693737, + "learning_rate": 1.142371491756098e-06, + "loss": 0.8429, + "step": 10273 + }, + { + "epoch": 4.657298277425204, + "grad_norm": 0.07661993034501992, + "learning_rate": 1.1393697319133578e-06, + "loss": 0.8342, + "step": 10274 + }, + { + "epoch": 4.657751586582049, + "grad_norm": 0.08052394712520733, + "learning_rate": 1.1363718640769483e-06, + "loss": 0.8527, + "step": 10275 + }, + { + "epoch": 4.658204895738894, + "grad_norm": 0.09110354534252649, + "learning_rate": 1.1333778885471181e-06, + "loss": 0.8291, + "step": 10276 + }, + { + "epoch": 4.658658204895739, + "grad_norm": 0.08332523595898515, + "learning_rate": 1.130387805623716e-06, + "loss": 0.8746, + "step": 10277 + }, + { + "epoch": 4.659111514052584, + "grad_norm": 0.08535711805455207, + "learning_rate": 1.1274016156062139e-06, + "loss": 0.8368, + "step": 10278 + }, + { + "epoch": 4.659564823209429, + "grad_norm": 0.07461025715100163, + "learning_rate": 1.1244193187936836e-06, + "loss": 0.8572, + "step": 10279 + }, + { + "epoch": 4.6600181323662735, + "grad_norm": 0.08183931266326715, + "learning_rate": 1.1214409154848106e-06, + "loss": 0.8465, + "step": 10280 + }, + { + "epoch": 4.660471441523119, + "grad_norm": 0.0799183899008404, + "learning_rate": 1.1184664059778938e-06, + "loss": 0.8742, + "step": 10281 + }, + { + "epoch": 4.660924750679964, + "grad_norm": 0.07807190828579443, + "learning_rate": 1.115495790570833e-06, + "loss": 0.8366, + "step": 10282 + }, + { + "epoch": 4.661378059836808, + "grad_norm": 0.08313514939242321, + "learning_rate": 1.1125290695611545e-06, + "loss": 0.8503, + "step": 10283 + }, + { + "epoch": 4.661831368993654, + "grad_norm": 0.0783356615616068, + "learning_rate": 1.1095662432459764e-06, + "loss": 0.8607, + "step": 10284 + }, + { + "epoch": 4.662284678150499, + "grad_norm": 0.08627322183070979, + "learning_rate": 1.10660731192203e-06, + "loss": 0.8588, + "step": 10285 + }, + { + "epoch": 4.662737987307343, + "grad_norm": 0.07670561508243476, + "learning_rate": 1.1036522758856739e-06, + "loss": 0.8759, + "step": 10286 + }, + { + "epoch": 4.663191296464189, + "grad_norm": 0.08399874262006352, + "learning_rate": 1.100701135432849e-06, + "loss": 0.8362, + "step": 10287 + }, + { + "epoch": 4.663644605621034, + "grad_norm": 0.07916803887202191, + "learning_rate": 1.0977538908591368e-06, + "loss": 0.8547, + "step": 10288 + }, + { + "epoch": 4.664097914777878, + "grad_norm": 0.0766271583244182, + "learning_rate": 1.0948105424596966e-06, + "loss": 0.8234, + "step": 10289 + }, + { + "epoch": 4.6645512239347235, + "grad_norm": 0.07783397611699584, + "learning_rate": 1.0918710905293283e-06, + "loss": 0.8555, + "step": 10290 + }, + { + "epoch": 4.665004533091569, + "grad_norm": 0.08880362929959891, + "learning_rate": 1.0889355353624142e-06, + "loss": 0.8579, + "step": 10291 + }, + { + "epoch": 4.665457842248413, + "grad_norm": 0.07769592747550845, + "learning_rate": 1.086003877252968e-06, + "loss": 0.8382, + "step": 10292 + }, + { + "epoch": 4.6659111514052585, + "grad_norm": 0.08807223048671711, + "learning_rate": 1.0830761164945946e-06, + "loss": 0.8695, + "step": 10293 + }, + { + "epoch": 4.666364460562104, + "grad_norm": 0.07796563338483531, + "learning_rate": 1.0801522533805263e-06, + "loss": 0.8571, + "step": 10294 + }, + { + "epoch": 4.666817769718948, + "grad_norm": 0.0811888113701953, + "learning_rate": 1.0772322882035868e-06, + "loss": 0.855, + "step": 10295 + }, + { + "epoch": 4.667271078875793, + "grad_norm": 0.07826003821550036, + "learning_rate": 1.0743162212562352e-06, + "loss": 0.8593, + "step": 10296 + }, + { + "epoch": 4.667724388032639, + "grad_norm": 0.08773480267274417, + "learning_rate": 1.0714040528305047e-06, + "loss": 0.8492, + "step": 10297 + }, + { + "epoch": 4.668177697189483, + "grad_norm": 0.0793023755438468, + "learning_rate": 1.0684957832180732e-06, + "loss": 0.8598, + "step": 10298 + }, + { + "epoch": 4.668631006346328, + "grad_norm": 0.07866815191546359, + "learning_rate": 1.0655914127102096e-06, + "loss": 0.8418, + "step": 10299 + }, + { + "epoch": 4.669084315503174, + "grad_norm": 0.08836238957939886, + "learning_rate": 1.0626909415977838e-06, + "loss": 0.8521, + "step": 10300 + }, + { + "epoch": 4.669537624660018, + "grad_norm": 0.08158824368010331, + "learning_rate": 1.0597943701713009e-06, + "loss": 0.838, + "step": 10301 + }, + { + "epoch": 4.669990933816863, + "grad_norm": 0.08074154456932851, + "learning_rate": 1.0569016987208536e-06, + "loss": 0.8566, + "step": 10302 + }, + { + "epoch": 4.6704442429737085, + "grad_norm": 0.08118375997279777, + "learning_rate": 1.0540129275361521e-06, + "loss": 0.8681, + "step": 10303 + }, + { + "epoch": 4.670897552130553, + "grad_norm": 0.07779431153967861, + "learning_rate": 1.0511280569065164e-06, + "loss": 0.8602, + "step": 10304 + }, + { + "epoch": 4.671350861287398, + "grad_norm": 0.08951279121371812, + "learning_rate": 1.048247087120875e-06, + "loss": 0.8661, + "step": 10305 + }, + { + "epoch": 4.6718041704442435, + "grad_norm": 0.0753297551955938, + "learning_rate": 1.0453700184677617e-06, + "loss": 0.8473, + "step": 10306 + }, + { + "epoch": 4.672257479601088, + "grad_norm": 0.08716117397763107, + "learning_rate": 1.0424968512353327e-06, + "loss": 0.8552, + "step": 10307 + }, + { + "epoch": 4.672710788757933, + "grad_norm": 0.08435222882386134, + "learning_rate": 1.0396275857113315e-06, + "loss": 0.8636, + "step": 10308 + }, + { + "epoch": 4.6731640979147775, + "grad_norm": 0.07944267225946433, + "learning_rate": 1.0367622221831363e-06, + "loss": 0.8533, + "step": 10309 + }, + { + "epoch": 4.673617407071623, + "grad_norm": 0.07529363690405662, + "learning_rate": 1.033900760937714e-06, + "loss": 0.861, + "step": 10310 + }, + { + "epoch": 4.674070716228468, + "grad_norm": 0.08406281017368683, + "learning_rate": 1.0310432022616478e-06, + "loss": 0.8549, + "step": 10311 + }, + { + "epoch": 4.674524025385312, + "grad_norm": 0.0821212252467358, + "learning_rate": 1.0281895464411317e-06, + "loss": 0.8459, + "step": 10312 + }, + { + "epoch": 4.674977334542158, + "grad_norm": 0.08054727091034117, + "learning_rate": 1.025339793761968e-06, + "loss": 0.8561, + "step": 10313 + }, + { + "epoch": 4.675430643699003, + "grad_norm": 0.07775868533122104, + "learning_rate": 1.0224939445095727e-06, + "loss": 0.8501, + "step": 10314 + }, + { + "epoch": 4.675883952855847, + "grad_norm": 0.08314379693498357, + "learning_rate": 1.019651998968958e-06, + "loss": 0.852, + "step": 10315 + }, + { + "epoch": 4.676337262012693, + "grad_norm": 0.08848947967985878, + "learning_rate": 1.0168139574247583e-06, + "loss": 0.8597, + "step": 10316 + }, + { + "epoch": 4.676790571169538, + "grad_norm": 0.08431696060701657, + "learning_rate": 1.0139798201612084e-06, + "loss": 0.8592, + "step": 10317 + }, + { + "epoch": 4.677243880326382, + "grad_norm": 0.08457523650437959, + "learning_rate": 1.0111495874621568e-06, + "loss": 0.8616, + "step": 10318 + }, + { + "epoch": 4.6776971894832275, + "grad_norm": 0.07728027188520409, + "learning_rate": 1.0083232596110616e-06, + "loss": 0.8666, + "step": 10319 + }, + { + "epoch": 4.678150498640073, + "grad_norm": 0.08374508376246877, + "learning_rate": 1.005500836890989e-06, + "loss": 0.8579, + "step": 10320 + }, + { + "epoch": 4.678603807796917, + "grad_norm": 0.08462321338740461, + "learning_rate": 1.0026823195846025e-06, + "loss": 0.8506, + "step": 10321 + }, + { + "epoch": 4.6790571169537625, + "grad_norm": 0.07984100999770798, + "learning_rate": 9.998677079742003e-07, + "loss": 0.8539, + "step": 10322 + }, + { + "epoch": 4.679510426110608, + "grad_norm": 0.07942336234790515, + "learning_rate": 9.970570023416637e-07, + "loss": 0.8472, + "step": 10323 + }, + { + "epoch": 4.679963735267452, + "grad_norm": 0.08853021385847323, + "learning_rate": 9.942502029684964e-07, + "loss": 0.8444, + "step": 10324 + }, + { + "epoch": 4.680417044424297, + "grad_norm": 0.07886197705796609, + "learning_rate": 9.914473101358114e-07, + "loss": 0.8322, + "step": 10325 + }, + { + "epoch": 4.680870353581143, + "grad_norm": 0.08430922202367204, + "learning_rate": 9.886483241243128e-07, + "loss": 0.8616, + "step": 10326 + }, + { + "epoch": 4.681323662737987, + "grad_norm": 0.07646282333527174, + "learning_rate": 9.858532452143454e-07, + "loss": 0.8468, + "step": 10327 + }, + { + "epoch": 4.681776971894832, + "grad_norm": 0.0770964666470925, + "learning_rate": 9.830620736858277e-07, + "loss": 0.8665, + "step": 10328 + }, + { + "epoch": 4.682230281051678, + "grad_norm": 0.07289597439631229, + "learning_rate": 9.802748098183178e-07, + "loss": 0.8567, + "step": 10329 + }, + { + "epoch": 4.682683590208522, + "grad_norm": 0.08787613515572393, + "learning_rate": 9.774914538909664e-07, + "loss": 0.8529, + "step": 10330 + }, + { + "epoch": 4.683136899365367, + "grad_norm": 0.08025542842879523, + "learning_rate": 9.747120061825277e-07, + "loss": 0.8597, + "step": 10331 + }, + { + "epoch": 4.6835902085222125, + "grad_norm": 0.08496671277187973, + "learning_rate": 9.719364669713705e-07, + "loss": 0.8565, + "step": 10332 + }, + { + "epoch": 4.684043517679057, + "grad_norm": 0.07210923644843098, + "learning_rate": 9.691648365354812e-07, + "loss": 0.8453, + "step": 10333 + }, + { + "epoch": 4.684496826835902, + "grad_norm": 0.07677899547319109, + "learning_rate": 9.663971151524375e-07, + "loss": 0.8422, + "step": 10334 + }, + { + "epoch": 4.684950135992747, + "grad_norm": 0.07531899041184287, + "learning_rate": 9.636333030994493e-07, + "loss": 0.8597, + "step": 10335 + }, + { + "epoch": 4.685403445149592, + "grad_norm": 0.07816918756879536, + "learning_rate": 9.608734006533039e-07, + "loss": 0.8467, + "step": 10336 + }, + { + "epoch": 4.685856754306437, + "grad_norm": 0.0817867802081224, + "learning_rate": 9.581174080904244e-07, + "loss": 0.846, + "step": 10337 + }, + { + "epoch": 4.6863100634632815, + "grad_norm": 0.07507899236612056, + "learning_rate": 9.553653256868257e-07, + "loss": 0.8443, + "step": 10338 + }, + { + "epoch": 4.686763372620127, + "grad_norm": 0.07284349194058688, + "learning_rate": 9.52617153718145e-07, + "loss": 0.8421, + "step": 10339 + }, + { + "epoch": 4.687216681776972, + "grad_norm": 0.07753840701367398, + "learning_rate": 9.498728924596112e-07, + "loss": 0.8393, + "step": 10340 + }, + { + "epoch": 4.687669990933816, + "grad_norm": 0.07936036936169073, + "learning_rate": 9.471325421860756e-07, + "loss": 0.8527, + "step": 10341 + }, + { + "epoch": 4.688123300090662, + "grad_norm": 0.08084397013896552, + "learning_rate": 9.443961031719895e-07, + "loss": 0.8348, + "step": 10342 + }, + { + "epoch": 4.688576609247507, + "grad_norm": 0.09211124730011015, + "learning_rate": 9.416635756914094e-07, + "loss": 0.8885, + "step": 10343 + }, + { + "epoch": 4.689029918404351, + "grad_norm": 0.07787928289199046, + "learning_rate": 9.389349600180231e-07, + "loss": 0.8426, + "step": 10344 + }, + { + "epoch": 4.689483227561197, + "grad_norm": 0.08185835824844158, + "learning_rate": 9.362102564250919e-07, + "loss": 0.8396, + "step": 10345 + }, + { + "epoch": 4.689936536718042, + "grad_norm": 0.07871298176268914, + "learning_rate": 9.334894651855131e-07, + "loss": 0.8339, + "step": 10346 + }, + { + "epoch": 4.690389845874886, + "grad_norm": 0.0772912937546811, + "learning_rate": 9.307725865717754e-07, + "loss": 0.8637, + "step": 10347 + }, + { + "epoch": 4.6908431550317315, + "grad_norm": 0.08654827277046648, + "learning_rate": 9.2805962085599e-07, + "loss": 0.8606, + "step": 10348 + }, + { + "epoch": 4.691296464188577, + "grad_norm": 0.07265748410102534, + "learning_rate": 9.253505683098596e-07, + "loss": 0.8494, + "step": 10349 + }, + { + "epoch": 4.691749773345421, + "grad_norm": 0.07779190504053135, + "learning_rate": 9.226454292047138e-07, + "loss": 0.8427, + "step": 10350 + }, + { + "epoch": 4.6922030825022665, + "grad_norm": 0.07576050226007174, + "learning_rate": 9.199442038114692e-07, + "loss": 0.8643, + "step": 10351 + }, + { + "epoch": 4.692656391659112, + "grad_norm": 0.08459599029463494, + "learning_rate": 9.172468924006695e-07, + "loss": 0.855, + "step": 10352 + }, + { + "epoch": 4.693109700815956, + "grad_norm": 0.07749264897049125, + "learning_rate": 9.145534952424495e-07, + "loss": 0.8489, + "step": 10353 + }, + { + "epoch": 4.693563009972801, + "grad_norm": 0.07476215471437203, + "learning_rate": 9.118640126065759e-07, + "loss": 0.8465, + "step": 10354 + }, + { + "epoch": 4.694016319129647, + "grad_norm": 0.08021360855037181, + "learning_rate": 9.09178444762393e-07, + "loss": 0.8568, + "step": 10355 + }, + { + "epoch": 4.694469628286491, + "grad_norm": 0.08842023167565115, + "learning_rate": 9.064967919788769e-07, + "loss": 0.8708, + "step": 10356 + }, + { + "epoch": 4.694922937443336, + "grad_norm": 0.07238185276840521, + "learning_rate": 9.038190545246039e-07, + "loss": 0.8418, + "step": 10357 + }, + { + "epoch": 4.695376246600182, + "grad_norm": 0.08080358175318819, + "learning_rate": 9.011452326677506e-07, + "loss": 0.864, + "step": 10358 + }, + { + "epoch": 4.695829555757026, + "grad_norm": 0.0861168965746262, + "learning_rate": 8.984753266761115e-07, + "loss": 0.873, + "step": 10359 + }, + { + "epoch": 4.696282864913871, + "grad_norm": 0.0776767138691433, + "learning_rate": 8.958093368170861e-07, + "loss": 0.8762, + "step": 10360 + }, + { + "epoch": 4.6967361740707165, + "grad_norm": 0.08432072132897643, + "learning_rate": 8.931472633576787e-07, + "loss": 0.8755, + "step": 10361 + }, + { + "epoch": 4.697189483227561, + "grad_norm": 0.07812483656609057, + "learning_rate": 8.904891065645072e-07, + "loss": 0.838, + "step": 10362 + }, + { + "epoch": 4.697642792384406, + "grad_norm": 0.07701477876608301, + "learning_rate": 8.878348667037939e-07, + "loss": 0.8533, + "step": 10363 + }, + { + "epoch": 4.6980961015412515, + "grad_norm": 0.08038449448098034, + "learning_rate": 8.851845440413664e-07, + "loss": 0.8772, + "step": 10364 + }, + { + "epoch": 4.698549410698096, + "grad_norm": 0.07625133474569426, + "learning_rate": 8.825381388426657e-07, + "loss": 0.8731, + "step": 10365 + }, + { + "epoch": 4.699002719854941, + "grad_norm": 0.07947829984948473, + "learning_rate": 8.798956513727331e-07, + "loss": 0.8612, + "step": 10366 + }, + { + "epoch": 4.699456029011786, + "grad_norm": 0.08000081493408694, + "learning_rate": 8.77257081896219e-07, + "loss": 0.856, + "step": 10367 + }, + { + "epoch": 4.699909338168631, + "grad_norm": 0.07166944055760277, + "learning_rate": 8.746224306773921e-07, + "loss": 0.86, + "step": 10368 + }, + { + "epoch": 4.700362647325476, + "grad_norm": 0.07647669007584529, + "learning_rate": 8.719916979801169e-07, + "loss": 0.8598, + "step": 10369 + }, + { + "epoch": 4.700815956482321, + "grad_norm": 0.08169377260351328, + "learning_rate": 8.693648840678626e-07, + "loss": 0.8461, + "step": 10370 + }, + { + "epoch": 4.701269265639166, + "grad_norm": 0.0773019697430656, + "learning_rate": 8.66741989203721e-07, + "loss": 0.8643, + "step": 10371 + }, + { + "epoch": 4.701722574796011, + "grad_norm": 0.07780502086328502, + "learning_rate": 8.641230136503843e-07, + "loss": 0.8391, + "step": 10372 + }, + { + "epoch": 4.702175883952856, + "grad_norm": 0.07764785921934488, + "learning_rate": 8.615079576701402e-07, + "loss": 0.8509, + "step": 10373 + }, + { + "epoch": 4.702629193109701, + "grad_norm": 0.07714723027642348, + "learning_rate": 8.588968215249038e-07, + "loss": 0.8506, + "step": 10374 + }, + { + "epoch": 4.703082502266546, + "grad_norm": 0.07701686407952643, + "learning_rate": 8.562896054761771e-07, + "loss": 0.8414, + "step": 10375 + }, + { + "epoch": 4.703535811423391, + "grad_norm": 0.08360077395732916, + "learning_rate": 8.536863097850934e-07, + "loss": 0.8658, + "step": 10376 + }, + { + "epoch": 4.7039891205802356, + "grad_norm": 0.07784350357781399, + "learning_rate": 8.51086934712373e-07, + "loss": 0.8344, + "step": 10377 + }, + { + "epoch": 4.704442429737081, + "grad_norm": 0.07478338996218156, + "learning_rate": 8.484914805183542e-07, + "loss": 0.8552, + "step": 10378 + }, + { + "epoch": 4.704895738893926, + "grad_norm": 0.08261557029496815, + "learning_rate": 8.458999474629759e-07, + "loss": 0.8596, + "step": 10379 + }, + { + "epoch": 4.7053490480507705, + "grad_norm": 0.07612804172833412, + "learning_rate": 8.433123358057904e-07, + "loss": 0.8519, + "step": 10380 + }, + { + "epoch": 4.705802357207616, + "grad_norm": 0.0839137857785578, + "learning_rate": 8.407286458059549e-07, + "loss": 0.8502, + "step": 10381 + }, + { + "epoch": 4.706255666364461, + "grad_norm": 0.0743750892974289, + "learning_rate": 8.381488777222314e-07, + "loss": 0.8498, + "step": 10382 + }, + { + "epoch": 4.706708975521305, + "grad_norm": 0.08169721860473868, + "learning_rate": 8.355730318129951e-07, + "loss": 0.8437, + "step": 10383 + }, + { + "epoch": 4.707162284678151, + "grad_norm": 0.07861479815690094, + "learning_rate": 8.330011083362178e-07, + "loss": 0.8554, + "step": 10384 + }, + { + "epoch": 4.707615593834996, + "grad_norm": 0.08021063888425484, + "learning_rate": 8.304331075494931e-07, + "loss": 0.8412, + "step": 10385 + }, + { + "epoch": 4.70806890299184, + "grad_norm": 0.07755659888558893, + "learning_rate": 8.278690297100067e-07, + "loss": 0.8741, + "step": 10386 + }, + { + "epoch": 4.708522212148686, + "grad_norm": 0.07831612751803646, + "learning_rate": 8.253088750745664e-07, + "loss": 0.8505, + "step": 10387 + }, + { + "epoch": 4.70897552130553, + "grad_norm": 0.0815953303757934, + "learning_rate": 8.22752643899567e-07, + "loss": 0.851, + "step": 10388 + }, + { + "epoch": 4.709428830462375, + "grad_norm": 0.0745850088373066, + "learning_rate": 8.20200336441035e-07, + "loss": 0.8498, + "step": 10389 + }, + { + "epoch": 4.7098821396192205, + "grad_norm": 0.08471206359832521, + "learning_rate": 8.176519529545835e-07, + "loss": 0.8582, + "step": 10390 + }, + { + "epoch": 4.710335448776065, + "grad_norm": 0.08079468867560835, + "learning_rate": 8.151074936954484e-07, + "loss": 0.861, + "step": 10391 + }, + { + "epoch": 4.71078875793291, + "grad_norm": 0.07779165314259051, + "learning_rate": 8.12566958918457e-07, + "loss": 0.859, + "step": 10392 + }, + { + "epoch": 4.7112420670897555, + "grad_norm": 0.08837494571731042, + "learning_rate": 8.100303488780547e-07, + "loss": 0.8547, + "step": 10393 + }, + { + "epoch": 4.7116953762466, + "grad_norm": 0.07655619859536501, + "learning_rate": 8.074976638282873e-07, + "loss": 0.8464, + "step": 10394 + }, + { + "epoch": 4.712148685403445, + "grad_norm": 0.07715297339560906, + "learning_rate": 8.049689040228181e-07, + "loss": 0.8459, + "step": 10395 + }, + { + "epoch": 4.71260199456029, + "grad_norm": 0.07691209868145878, + "learning_rate": 8.024440697149027e-07, + "loss": 0.8496, + "step": 10396 + }, + { + "epoch": 4.713055303717135, + "grad_norm": 0.08216816556259131, + "learning_rate": 7.999231611574143e-07, + "loss": 0.8675, + "step": 10397 + }, + { + "epoch": 4.71350861287398, + "grad_norm": 0.07553661033843397, + "learning_rate": 7.974061786028309e-07, + "loss": 0.8455, + "step": 10398 + }, + { + "epoch": 4.713961922030825, + "grad_norm": 0.07264205923340905, + "learning_rate": 7.948931223032264e-07, + "loss": 0.853, + "step": 10399 + }, + { + "epoch": 4.71441523118767, + "grad_norm": 0.07856685808520712, + "learning_rate": 7.923839925103061e-07, + "loss": 0.8683, + "step": 10400 + }, + { + "epoch": 4.714868540344515, + "grad_norm": 0.07584155136926353, + "learning_rate": 7.89878789475349e-07, + "loss": 0.8322, + "step": 10401 + }, + { + "epoch": 4.71532184950136, + "grad_norm": 0.07625016069232818, + "learning_rate": 7.873775134492745e-07, + "loss": 0.8482, + "step": 10402 + }, + { + "epoch": 4.715775158658205, + "grad_norm": 0.07980612754747769, + "learning_rate": 7.848801646825798e-07, + "loss": 0.8497, + "step": 10403 + }, + { + "epoch": 4.71622846781505, + "grad_norm": 0.07805585002192222, + "learning_rate": 7.82386743425394e-07, + "loss": 0.8669, + "step": 10404 + }, + { + "epoch": 4.716681776971895, + "grad_norm": 0.07923332854826072, + "learning_rate": 7.798972499274282e-07, + "loss": 0.8783, + "step": 10405 + }, + { + "epoch": 4.7171350861287396, + "grad_norm": 0.07879236523001906, + "learning_rate": 7.774116844380253e-07, + "loss": 0.8495, + "step": 10406 + }, + { + "epoch": 4.717588395285585, + "grad_norm": 0.07894063173017604, + "learning_rate": 7.749300472061106e-07, + "loss": 0.8286, + "step": 10407 + }, + { + "epoch": 4.71804170444243, + "grad_norm": 0.0779215122352862, + "learning_rate": 7.724523384802318e-07, + "loss": 0.838, + "step": 10408 + }, + { + "epoch": 4.7184950135992745, + "grad_norm": 0.08197970814409854, + "learning_rate": 7.699785585085373e-07, + "loss": 0.8524, + "step": 10409 + }, + { + "epoch": 4.71894832275612, + "grad_norm": 0.07928573653492316, + "learning_rate": 7.675087075387932e-07, + "loss": 0.8635, + "step": 10410 + }, + { + "epoch": 4.719401631912965, + "grad_norm": 0.07960223080518192, + "learning_rate": 7.65042785818344e-07, + "loss": 0.8348, + "step": 10411 + }, + { + "epoch": 4.719854941069809, + "grad_norm": 0.07526280709445834, + "learning_rate": 7.625807935941742e-07, + "loss": 0.8677, + "step": 10412 + }, + { + "epoch": 4.720308250226655, + "grad_norm": 0.07678997329675195, + "learning_rate": 7.601227311128556e-07, + "loss": 0.8441, + "step": 10413 + }, + { + "epoch": 4.720761559383499, + "grad_norm": 0.07869386205056188, + "learning_rate": 7.576685986205689e-07, + "loss": 0.8561, + "step": 10414 + }, + { + "epoch": 4.721214868540344, + "grad_norm": 0.07517599012094191, + "learning_rate": 7.552183963631043e-07, + "loss": 0.8482, + "step": 10415 + }, + { + "epoch": 4.72166817769719, + "grad_norm": 0.08125465374642349, + "learning_rate": 7.527721245858522e-07, + "loss": 0.8481, + "step": 10416 + }, + { + "epoch": 4.722121486854034, + "grad_norm": 0.07951432617465318, + "learning_rate": 7.503297835338163e-07, + "loss": 0.857, + "step": 10417 + }, + { + "epoch": 4.722574796010879, + "grad_norm": 0.07397957579981453, + "learning_rate": 7.478913734516058e-07, + "loss": 0.8638, + "step": 10418 + }, + { + "epoch": 4.7230281051677245, + "grad_norm": 0.07554964705891283, + "learning_rate": 7.454568945834384e-07, + "loss": 0.8467, + "step": 10419 + }, + { + "epoch": 4.723481414324569, + "grad_norm": 0.08475357142684534, + "learning_rate": 7.430263471731236e-07, + "loss": 0.8268, + "step": 10420 + }, + { + "epoch": 4.723934723481414, + "grad_norm": 0.13189940041426523, + "learning_rate": 7.405997314641022e-07, + "loss": 0.8618, + "step": 10421 + }, + { + "epoch": 4.7243880326382595, + "grad_norm": 0.07900584048521118, + "learning_rate": 7.381770476993932e-07, + "loss": 0.8615, + "step": 10422 + }, + { + "epoch": 4.724841341795104, + "grad_norm": 0.08085104603336489, + "learning_rate": 7.357582961216425e-07, + "loss": 0.8543, + "step": 10423 + }, + { + "epoch": 4.725294650951949, + "grad_norm": 0.0845319849008638, + "learning_rate": 7.333434769730963e-07, + "loss": 0.855, + "step": 10424 + }, + { + "epoch": 4.725747960108794, + "grad_norm": 0.07652646418073943, + "learning_rate": 7.30932590495601e-07, + "loss": 0.8474, + "step": 10425 + }, + { + "epoch": 4.726201269265639, + "grad_norm": 0.07704463705651397, + "learning_rate": 7.28525636930617e-07, + "loss": 0.8436, + "step": 10426 + }, + { + "epoch": 4.726654578422484, + "grad_norm": 0.07412313006772382, + "learning_rate": 7.261226165192093e-07, + "loss": 0.8739, + "step": 10427 + }, + { + "epoch": 4.727107887579329, + "grad_norm": 0.08171732351136199, + "learning_rate": 7.237235295020428e-07, + "loss": 0.8218, + "step": 10428 + }, + { + "epoch": 4.727561196736174, + "grad_norm": 0.0767930271535892, + "learning_rate": 7.213283761193968e-07, + "loss": 0.8616, + "step": 10429 + }, + { + "epoch": 4.728014505893019, + "grad_norm": 0.10101333846777838, + "learning_rate": 7.189371566111503e-07, + "loss": 0.8479, + "step": 10430 + }, + { + "epoch": 4.728467815049864, + "grad_norm": 0.07300941713915256, + "learning_rate": 7.165498712167917e-07, + "loss": 0.856, + "step": 10431 + }, + { + "epoch": 4.728921124206709, + "grad_norm": 0.07451946196881841, + "learning_rate": 7.141665201754189e-07, + "loss": 0.8617, + "step": 10432 + }, + { + "epoch": 4.729374433363554, + "grad_norm": 0.07712922651897647, + "learning_rate": 7.117871037257207e-07, + "loss": 0.8492, + "step": 10433 + }, + { + "epoch": 4.729827742520399, + "grad_norm": 0.0826278666270488, + "learning_rate": 7.094116221060177e-07, + "loss": 0.8495, + "step": 10434 + }, + { + "epoch": 4.7302810516772436, + "grad_norm": 0.08368881456479722, + "learning_rate": 7.070400755542084e-07, + "loss": 0.8341, + "step": 10435 + }, + { + "epoch": 4.730734360834089, + "grad_norm": 0.07739848499238282, + "learning_rate": 7.046724643078229e-07, + "loss": 0.8565, + "step": 10436 + }, + { + "epoch": 4.731187669990934, + "grad_norm": 0.07499072803629842, + "learning_rate": 7.023087886039693e-07, + "loss": 0.8453, + "step": 10437 + }, + { + "epoch": 4.7316409791477785, + "grad_norm": 0.08909062801557299, + "learning_rate": 6.999490486793869e-07, + "loss": 0.8538, + "step": 10438 + }, + { + "epoch": 4.732094288304624, + "grad_norm": 0.09046542127478084, + "learning_rate": 6.975932447704115e-07, + "loss": 0.8334, + "step": 10439 + }, + { + "epoch": 4.732547597461469, + "grad_norm": 0.07925448923085325, + "learning_rate": 6.95241377112974e-07, + "loss": 0.8486, + "step": 10440 + }, + { + "epoch": 4.733000906618313, + "grad_norm": 0.08618513240496478, + "learning_rate": 6.928934459426329e-07, + "loss": 0.8739, + "step": 10441 + }, + { + "epoch": 4.733454215775159, + "grad_norm": 0.07354443500359552, + "learning_rate": 6.905494514945288e-07, + "loss": 0.867, + "step": 10442 + }, + { + "epoch": 4.733907524932004, + "grad_norm": 0.07898059927503022, + "learning_rate": 6.882093940034295e-07, + "loss": 0.8809, + "step": 10443 + }, + { + "epoch": 4.734360834088848, + "grad_norm": 0.0756769855620378, + "learning_rate": 6.858732737036944e-07, + "loss": 0.8678, + "step": 10444 + }, + { + "epoch": 4.734814143245694, + "grad_norm": 0.07934903587965995, + "learning_rate": 6.83541090829296e-07, + "loss": 0.8458, + "step": 10445 + }, + { + "epoch": 4.735267452402539, + "grad_norm": 0.08223824701039745, + "learning_rate": 6.81212845613799e-07, + "loss": 0.8387, + "step": 10446 + }, + { + "epoch": 4.735720761559383, + "grad_norm": 0.07535503541508864, + "learning_rate": 6.788885382903987e-07, + "loss": 0.8374, + "step": 10447 + }, + { + "epoch": 4.7361740707162285, + "grad_norm": 0.08326748315579839, + "learning_rate": 6.76568169091869e-07, + "loss": 0.8421, + "step": 10448 + }, + { + "epoch": 4.736627379873074, + "grad_norm": 0.0848115702761386, + "learning_rate": 6.742517382506064e-07, + "loss": 0.8532, + "step": 10449 + }, + { + "epoch": 4.737080689029918, + "grad_norm": 0.07649352786383894, + "learning_rate": 6.719392459986074e-07, + "loss": 0.8472, + "step": 10450 + }, + { + "epoch": 4.7375339981867635, + "grad_norm": 0.07534884283139387, + "learning_rate": 6.696306925674823e-07, + "loss": 0.8327, + "step": 10451 + }, + { + "epoch": 4.737987307343609, + "grad_norm": 0.07485747987005162, + "learning_rate": 6.673260781884239e-07, + "loss": 0.8338, + "step": 10452 + }, + { + "epoch": 4.738440616500453, + "grad_norm": 0.08120245483293825, + "learning_rate": 6.650254030922654e-07, + "loss": 0.8505, + "step": 10453 + }, + { + "epoch": 4.738893925657298, + "grad_norm": 0.07234179197060245, + "learning_rate": 6.627286675094136e-07, + "loss": 0.8573, + "step": 10454 + }, + { + "epoch": 4.739347234814144, + "grad_norm": 0.08036760819389613, + "learning_rate": 6.604358716698889e-07, + "loss": 0.8638, + "step": 10455 + }, + { + "epoch": 4.739800543970988, + "grad_norm": 0.08301478338703364, + "learning_rate": 6.581470158033343e-07, + "loss": 0.8615, + "step": 10456 + }, + { + "epoch": 4.740253853127833, + "grad_norm": 0.0893914419713817, + "learning_rate": 6.558621001389798e-07, + "loss": 0.8832, + "step": 10457 + }, + { + "epoch": 4.740707162284679, + "grad_norm": 0.07861888514698764, + "learning_rate": 6.535811249056645e-07, + "loss": 0.8405, + "step": 10458 + }, + { + "epoch": 4.741160471441523, + "grad_norm": 0.06966403511768293, + "learning_rate": 6.513040903318368e-07, + "loss": 0.8319, + "step": 10459 + }, + { + "epoch": 4.741613780598368, + "grad_norm": 0.07500415936716984, + "learning_rate": 6.490309966455499e-07, + "loss": 0.8592, + "step": 10460 + }, + { + "epoch": 4.7420670897552135, + "grad_norm": 0.07132183186226346, + "learning_rate": 6.467618440744528e-07, + "loss": 0.8691, + "step": 10461 + }, + { + "epoch": 4.742520398912058, + "grad_norm": 0.07555084784091289, + "learning_rate": 6.444966328458214e-07, + "loss": 0.8527, + "step": 10462 + }, + { + "epoch": 4.742973708068903, + "grad_norm": 0.07569301285141769, + "learning_rate": 6.422353631865097e-07, + "loss": 0.8422, + "step": 10463 + }, + { + "epoch": 4.743427017225748, + "grad_norm": 0.08026104512926306, + "learning_rate": 6.399780353230078e-07, + "loss": 0.8548, + "step": 10464 + }, + { + "epoch": 4.743880326382593, + "grad_norm": 0.07883793746013695, + "learning_rate": 6.377246494813705e-07, + "loss": 0.8519, + "step": 10465 + }, + { + "epoch": 4.744333635539438, + "grad_norm": 0.07601300065054585, + "learning_rate": 6.354752058873015e-07, + "loss": 0.8524, + "step": 10466 + }, + { + "epoch": 4.7447869446962825, + "grad_norm": 0.07799654782702917, + "learning_rate": 6.332297047660741e-07, + "loss": 0.8393, + "step": 10467 + }, + { + "epoch": 4.745240253853128, + "grad_norm": 0.07641461477756234, + "learning_rate": 6.309881463425926e-07, + "loss": 0.8644, + "step": 10468 + }, + { + "epoch": 4.745693563009973, + "grad_norm": 0.07904918141246249, + "learning_rate": 6.287505308413533e-07, + "loss": 0.8295, + "step": 10469 + }, + { + "epoch": 4.746146872166817, + "grad_norm": 0.07315304142918497, + "learning_rate": 6.265168584864523e-07, + "loss": 0.8516, + "step": 10470 + }, + { + "epoch": 4.746600181323663, + "grad_norm": 0.07344575482879821, + "learning_rate": 6.242871295016084e-07, + "loss": 0.843, + "step": 10471 + }, + { + "epoch": 4.747053490480508, + "grad_norm": 0.07752381832299547, + "learning_rate": 6.22061344110132e-07, + "loss": 0.8731, + "step": 10472 + }, + { + "epoch": 4.747506799637352, + "grad_norm": 0.08038145951249226, + "learning_rate": 6.19839502534938e-07, + "loss": 0.8671, + "step": 10473 + }, + { + "epoch": 4.747960108794198, + "grad_norm": 0.07974451577006385, + "learning_rate": 6.17621604998555e-07, + "loss": 0.8687, + "step": 10474 + }, + { + "epoch": 4.748413417951043, + "grad_norm": 0.07680883762713935, + "learning_rate": 6.154076517231167e-07, + "loss": 0.8575, + "step": 10475 + }, + { + "epoch": 4.748866727107887, + "grad_norm": 0.07331722706189299, + "learning_rate": 6.131976429303432e-07, + "loss": 0.8392, + "step": 10476 + }, + { + "epoch": 4.7493200362647325, + "grad_norm": 0.07199634192291988, + "learning_rate": 6.109915788415865e-07, + "loss": 0.866, + "step": 10477 + }, + { + "epoch": 4.749773345421578, + "grad_norm": 0.07401310126589669, + "learning_rate": 6.08789459677781e-07, + "loss": 0.8514, + "step": 10478 + }, + { + "epoch": 4.750226654578422, + "grad_norm": 0.07897650425170653, + "learning_rate": 6.065912856594835e-07, + "loss": 0.8635, + "step": 10479 + }, + { + "epoch": 4.7506799637352675, + "grad_norm": 0.07590127064876985, + "learning_rate": 6.043970570068469e-07, + "loss": 0.8415, + "step": 10480 + }, + { + "epoch": 4.751133272892113, + "grad_norm": 0.0736002523997185, + "learning_rate": 6.022067739396198e-07, + "loss": 0.8464, + "step": 10481 + }, + { + "epoch": 4.751586582048957, + "grad_norm": 0.08007762324982046, + "learning_rate": 6.000204366771778e-07, + "loss": 0.8589, + "step": 10482 + }, + { + "epoch": 4.752039891205802, + "grad_norm": 0.07421216031284801, + "learning_rate": 5.978380454384836e-07, + "loss": 0.8647, + "step": 10483 + }, + { + "epoch": 4.752493200362648, + "grad_norm": 0.07809310401445349, + "learning_rate": 5.956596004421045e-07, + "loss": 0.84, + "step": 10484 + }, + { + "epoch": 4.752946509519492, + "grad_norm": 0.07745303019247549, + "learning_rate": 5.93485101906226e-07, + "loss": 0.8559, + "step": 10485 + }, + { + "epoch": 4.753399818676337, + "grad_norm": 0.07544454681719287, + "learning_rate": 5.913145500486294e-07, + "loss": 0.8757, + "step": 10486 + }, + { + "epoch": 4.753853127833183, + "grad_norm": 0.07732400930510222, + "learning_rate": 5.89147945086701e-07, + "loss": 0.851, + "step": 10487 + }, + { + "epoch": 4.754306436990027, + "grad_norm": 0.08110728389671787, + "learning_rate": 5.869852872374315e-07, + "loss": 0.8427, + "step": 10488 + }, + { + "epoch": 4.754759746146872, + "grad_norm": 0.07186318353694464, + "learning_rate": 5.848265767174166e-07, + "loss": 0.8488, + "step": 10489 + }, + { + "epoch": 4.7552130553037175, + "grad_norm": 0.07711232757881586, + "learning_rate": 5.826718137428611e-07, + "loss": 0.858, + "step": 10490 + }, + { + "epoch": 4.755666364460562, + "grad_norm": 0.07741993373649399, + "learning_rate": 5.805209985295657e-07, + "loss": 0.8784, + "step": 10491 + }, + { + "epoch": 4.756119673617407, + "grad_norm": 0.0810916789955847, + "learning_rate": 5.783741312929448e-07, + "loss": 0.8625, + "step": 10492 + }, + { + "epoch": 4.756572982774252, + "grad_norm": 0.07667043494787465, + "learning_rate": 5.76231212248013e-07, + "loss": 0.8558, + "step": 10493 + }, + { + "epoch": 4.757026291931097, + "grad_norm": 0.07457640886694726, + "learning_rate": 5.740922416093941e-07, + "loss": 0.8652, + "step": 10494 + }, + { + "epoch": 4.757479601087942, + "grad_norm": 0.07435531532746703, + "learning_rate": 5.719572195913037e-07, + "loss": 0.8533, + "step": 10495 + }, + { + "epoch": 4.7579329102447865, + "grad_norm": 0.08153963138170127, + "learning_rate": 5.698261464075749e-07, + "loss": 0.8639, + "step": 10496 + }, + { + "epoch": 4.758386219401632, + "grad_norm": 0.07528055371913497, + "learning_rate": 5.676990222716416e-07, + "loss": 0.8408, + "step": 10497 + }, + { + "epoch": 4.758839528558477, + "grad_norm": 0.0755676004819849, + "learning_rate": 5.655758473965378e-07, + "loss": 0.8625, + "step": 10498 + }, + { + "epoch": 4.759292837715321, + "grad_norm": 0.07825481668258227, + "learning_rate": 5.634566219949111e-07, + "loss": 0.8442, + "step": 10499 + }, + { + "epoch": 4.759746146872167, + "grad_norm": 0.08271937025463008, + "learning_rate": 5.613413462790051e-07, + "loss": 0.846, + "step": 10500 + }, + { + "epoch": 4.760199456029012, + "grad_norm": 0.0739857765127675, + "learning_rate": 5.592300204606727e-07, + "loss": 0.8439, + "step": 10501 + }, + { + "epoch": 4.760652765185856, + "grad_norm": 0.07554211977250898, + "learning_rate": 5.571226447513667e-07, + "loss": 0.8546, + "step": 10502 + }, + { + "epoch": 4.761106074342702, + "grad_norm": 0.07337667804344974, + "learning_rate": 5.550192193621495e-07, + "loss": 0.8576, + "step": 10503 + }, + { + "epoch": 4.761559383499547, + "grad_norm": 0.07257076633808926, + "learning_rate": 5.529197445036838e-07, + "loss": 0.8308, + "step": 10504 + }, + { + "epoch": 4.762012692656391, + "grad_norm": 0.07786856115842997, + "learning_rate": 5.508242203862368e-07, + "loss": 0.8517, + "step": 10505 + }, + { + "epoch": 4.7624660018132365, + "grad_norm": 0.0760108962624538, + "learning_rate": 5.487326472196852e-07, + "loss": 0.8336, + "step": 10506 + }, + { + "epoch": 4.762919310970082, + "grad_norm": 0.07492875293579361, + "learning_rate": 5.466450252135014e-07, + "loss": 0.8453, + "step": 10507 + }, + { + "epoch": 4.763372620126926, + "grad_norm": 0.07851574937866686, + "learning_rate": 5.445613545767714e-07, + "loss": 0.8466, + "step": 10508 + }, + { + "epoch": 4.7638259292837715, + "grad_norm": 0.07084715336444697, + "learning_rate": 5.424816355181817e-07, + "loss": 0.8528, + "step": 10509 + }, + { + "epoch": 4.764279238440617, + "grad_norm": 0.07660642286730424, + "learning_rate": 5.40405868246019e-07, + "loss": 0.8598, + "step": 10510 + }, + { + "epoch": 4.764732547597461, + "grad_norm": 0.1511640215787473, + "learning_rate": 5.383340529681702e-07, + "loss": 0.8933, + "step": 10511 + }, + { + "epoch": 4.765185856754306, + "grad_norm": 0.07670433569521544, + "learning_rate": 5.362661898921495e-07, + "loss": 0.8621, + "step": 10512 + }, + { + "epoch": 4.765639165911152, + "grad_norm": 0.07267787512757802, + "learning_rate": 5.342022792250489e-07, + "loss": 0.8424, + "step": 10513 + }, + { + "epoch": 4.766092475067996, + "grad_norm": 0.0795237721359174, + "learning_rate": 5.321423211735788e-07, + "loss": 0.8562, + "step": 10514 + }, + { + "epoch": 4.766545784224841, + "grad_norm": 0.07434860526629525, + "learning_rate": 5.300863159440495e-07, + "loss": 0.8513, + "step": 10515 + }, + { + "epoch": 4.766999093381687, + "grad_norm": 0.07697098915974397, + "learning_rate": 5.280342637423764e-07, + "loss": 0.8525, + "step": 10516 + }, + { + "epoch": 4.767452402538531, + "grad_norm": 0.07383080471304142, + "learning_rate": 5.25986164774075e-07, + "loss": 0.8514, + "step": 10517 + }, + { + "epoch": 4.767905711695376, + "grad_norm": 0.0769177382082394, + "learning_rate": 5.239420192442746e-07, + "loss": 0.8467, + "step": 10518 + }, + { + "epoch": 4.7683590208522215, + "grad_norm": 0.08504503982818824, + "learning_rate": 5.219018273576959e-07, + "loss": 0.8723, + "step": 10519 + }, + { + "epoch": 4.768812330009066, + "grad_norm": 0.08137292319230614, + "learning_rate": 5.198655893186733e-07, + "loss": 0.8651, + "step": 10520 + }, + { + "epoch": 4.769265639165911, + "grad_norm": 0.07571850737013659, + "learning_rate": 5.178333053311458e-07, + "loss": 0.8578, + "step": 10521 + }, + { + "epoch": 4.7697189483227564, + "grad_norm": 0.07894891692802779, + "learning_rate": 5.158049755986483e-07, + "loss": 0.8558, + "step": 10522 + }, + { + "epoch": 4.770172257479601, + "grad_norm": 0.07514841957140501, + "learning_rate": 5.137806003243162e-07, + "loss": 0.8728, + "step": 10523 + }, + { + "epoch": 4.770625566636446, + "grad_norm": 0.07894543285598445, + "learning_rate": 5.117601797109162e-07, + "loss": 0.8493, + "step": 10524 + }, + { + "epoch": 4.771078875793291, + "grad_norm": 0.0832231692542738, + "learning_rate": 5.097437139607797e-07, + "loss": 0.8653, + "step": 10525 + }, + { + "epoch": 4.771532184950136, + "grad_norm": 0.08027250242223505, + "learning_rate": 5.077312032758741e-07, + "loss": 0.8493, + "step": 10526 + }, + { + "epoch": 4.771985494106981, + "grad_norm": 0.07630609514761255, + "learning_rate": 5.057226478577537e-07, + "loss": 0.8566, + "step": 10527 + }, + { + "epoch": 4.772438803263826, + "grad_norm": 0.08757662788640748, + "learning_rate": 5.037180479075776e-07, + "loss": 0.8583, + "step": 10528 + }, + { + "epoch": 4.772892112420671, + "grad_norm": 0.0781156069664937, + "learning_rate": 5.01717403626123e-07, + "loss": 0.8536, + "step": 10529 + }, + { + "epoch": 4.773345421577516, + "grad_norm": 0.08022692579934573, + "learning_rate": 4.997207152137451e-07, + "loss": 0.8579, + "step": 10530 + }, + { + "epoch": 4.773798730734361, + "grad_norm": 0.07762131949467059, + "learning_rate": 4.977279828704351e-07, + "loss": 0.8568, + "step": 10531 + }, + { + "epoch": 4.774252039891206, + "grad_norm": 0.0794565002500699, + "learning_rate": 4.957392067957578e-07, + "loss": 0.8673, + "step": 10532 + }, + { + "epoch": 4.774705349048051, + "grad_norm": 0.08137413392051468, + "learning_rate": 4.937543871889006e-07, + "loss": 0.8648, + "step": 10533 + }, + { + "epoch": 4.775158658204896, + "grad_norm": 0.0790527056174317, + "learning_rate": 4.917735242486465e-07, + "loss": 0.8443, + "step": 10534 + }, + { + "epoch": 4.7756119673617405, + "grad_norm": 0.0741559974550898, + "learning_rate": 4.897966181733926e-07, + "loss": 0.8579, + "step": 10535 + }, + { + "epoch": 4.776065276518586, + "grad_norm": 0.07152527341430225, + "learning_rate": 4.878236691611227e-07, + "loss": 0.8518, + "step": 10536 + }, + { + "epoch": 4.776518585675431, + "grad_norm": 0.07334600395009123, + "learning_rate": 4.858546774094341e-07, + "loss": 0.8457, + "step": 10537 + }, + { + "epoch": 4.7769718948322755, + "grad_norm": 0.07746425760699038, + "learning_rate": 4.838896431155338e-07, + "loss": 0.8864, + "step": 10538 + }, + { + "epoch": 4.777425203989121, + "grad_norm": 0.07343907609666422, + "learning_rate": 4.819285664762152e-07, + "loss": 0.8704, + "step": 10539 + }, + { + "epoch": 4.777878513145966, + "grad_norm": 0.07615175601094629, + "learning_rate": 4.799714476878992e-07, + "loss": 0.8495, + "step": 10540 + }, + { + "epoch": 4.77833182230281, + "grad_norm": 0.07255633398499954, + "learning_rate": 4.780182869465888e-07, + "loss": 0.8693, + "step": 10541 + }, + { + "epoch": 4.778785131459656, + "grad_norm": 0.07489185899722266, + "learning_rate": 4.760690844478966e-07, + "loss": 0.8575, + "step": 10542 + }, + { + "epoch": 4.779238440616501, + "grad_norm": 0.07290444811438955, + "learning_rate": 4.7412384038704405e-07, + "loss": 0.8524, + "step": 10543 + }, + { + "epoch": 4.779691749773345, + "grad_norm": 0.07666524288325594, + "learning_rate": 4.721825549588577e-07, + "loss": 0.8428, + "step": 10544 + }, + { + "epoch": 4.780145058930191, + "grad_norm": 0.07598220299015357, + "learning_rate": 4.702452283577508e-07, + "loss": 0.8613, + "step": 10545 + }, + { + "epoch": 4.780598368087035, + "grad_norm": 0.07398317084756213, + "learning_rate": 4.683118607777681e-07, + "loss": 0.8545, + "step": 10546 + }, + { + "epoch": 4.78105167724388, + "grad_norm": 0.08128229590766368, + "learning_rate": 4.663824524125282e-07, + "loss": 0.8568, + "step": 10547 + }, + { + "epoch": 4.7815049864007255, + "grad_norm": 0.07795099808103735, + "learning_rate": 4.6445700345527645e-07, + "loss": 0.8606, + "step": 10548 + }, + { + "epoch": 4.78195829555757, + "grad_norm": 0.0771730531069197, + "learning_rate": 4.6253551409884524e-07, + "loss": 0.8663, + "step": 10549 + }, + { + "epoch": 4.782411604714415, + "grad_norm": 0.07118765762938079, + "learning_rate": 4.606179845356851e-07, + "loss": 0.8349, + "step": 10550 + }, + { + "epoch": 4.7828649138712604, + "grad_norm": 0.07082770634996625, + "learning_rate": 4.587044149578379e-07, + "loss": 0.8599, + "step": 10551 + }, + { + "epoch": 4.783318223028105, + "grad_norm": 0.0835200730001397, + "learning_rate": 4.5679480555695044e-07, + "loss": 0.886, + "step": 10552 + }, + { + "epoch": 4.78377153218495, + "grad_norm": 0.07188384237363893, + "learning_rate": 4.548891565242786e-07, + "loss": 0.8488, + "step": 10553 + }, + { + "epoch": 4.784224841341795, + "grad_norm": 0.07650680548208624, + "learning_rate": 4.529874680506785e-07, + "loss": 0.8567, + "step": 10554 + }, + { + "epoch": 4.78467815049864, + "grad_norm": 0.07520304567961059, + "learning_rate": 4.5108974032661125e-07, + "loss": 0.8294, + "step": 10555 + }, + { + "epoch": 4.785131459655485, + "grad_norm": 0.07873729385159414, + "learning_rate": 4.4919597354213807e-07, + "loss": 0.8595, + "step": 10556 + }, + { + "epoch": 4.78558476881233, + "grad_norm": 0.07162361489410261, + "learning_rate": 4.4730616788692504e-07, + "loss": 0.8686, + "step": 10557 + }, + { + "epoch": 4.786038077969175, + "grad_norm": 0.07989593712074143, + "learning_rate": 4.45420323550243e-07, + "loss": 0.846, + "step": 10558 + }, + { + "epoch": 4.78649138712602, + "grad_norm": 0.07772931159681588, + "learning_rate": 4.435384407209675e-07, + "loss": 0.8489, + "step": 10559 + }, + { + "epoch": 4.786944696282865, + "grad_norm": 0.07940202666470482, + "learning_rate": 4.4166051958757006e-07, + "loss": 0.8603, + "step": 10560 + }, + { + "epoch": 4.78739800543971, + "grad_norm": 0.07977336696451057, + "learning_rate": 4.3978656033812683e-07, + "loss": 0.8646, + "step": 10561 + }, + { + "epoch": 4.787851314596555, + "grad_norm": 0.07474356539026387, + "learning_rate": 4.379165631603277e-07, + "loss": 0.8465, + "step": 10562 + }, + { + "epoch": 4.7883046237534, + "grad_norm": 0.07103931783238158, + "learning_rate": 4.3605052824145844e-07, + "loss": 0.8417, + "step": 10563 + }, + { + "epoch": 4.7887579329102445, + "grad_norm": 0.07819683456662062, + "learning_rate": 4.341884557684006e-07, + "loss": 0.8547, + "step": 10564 + }, + { + "epoch": 4.78921124206709, + "grad_norm": 0.07590302494534702, + "learning_rate": 4.32330345927654e-07, + "loss": 0.861, + "step": 10565 + }, + { + "epoch": 4.789664551223935, + "grad_norm": 0.08067512837932632, + "learning_rate": 4.3047619890530966e-07, + "loss": 0.8566, + "step": 10566 + }, + { + "epoch": 4.7901178603807795, + "grad_norm": 0.07323186030394178, + "learning_rate": 4.2862601488706355e-07, + "loss": 0.8584, + "step": 10567 + }, + { + "epoch": 4.790571169537625, + "grad_norm": 0.07168888057160597, + "learning_rate": 4.267797940582252e-07, + "loss": 0.8519, + "step": 10568 + }, + { + "epoch": 4.79102447869447, + "grad_norm": 0.07466380881614089, + "learning_rate": 4.249375366036912e-07, + "loss": 0.8671, + "step": 10569 + }, + { + "epoch": 4.791477787851314, + "grad_norm": 0.07425397913844196, + "learning_rate": 4.2309924270797166e-07, + "loss": 0.8687, + "step": 10570 + }, + { + "epoch": 4.79193109700816, + "grad_norm": 0.07508024237260279, + "learning_rate": 4.212649125551771e-07, + "loss": 0.8537, + "step": 10571 + }, + { + "epoch": 4.792384406165004, + "grad_norm": 0.0760413654069483, + "learning_rate": 4.194345463290228e-07, + "loss": 0.8483, + "step": 10572 + }, + { + "epoch": 4.792837715321849, + "grad_norm": 0.0805172968939755, + "learning_rate": 4.1760814421282437e-07, + "loss": 0.8486, + "step": 10573 + }, + { + "epoch": 4.793291024478695, + "grad_norm": 0.07876850346932358, + "learning_rate": 4.1578570638949765e-07, + "loss": 0.8694, + "step": 10574 + }, + { + "epoch": 4.793744333635539, + "grad_norm": 0.07144081061099838, + "learning_rate": 4.139672330415723e-07, + "loss": 0.827, + "step": 10575 + }, + { + "epoch": 4.794197642792384, + "grad_norm": 0.08303288043840508, + "learning_rate": 4.121527243511647e-07, + "loss": 0.8534, + "step": 10576 + }, + { + "epoch": 4.7946509519492295, + "grad_norm": 0.07746520522336597, + "learning_rate": 4.1034218050001406e-07, + "loss": 0.8537, + "step": 10577 + }, + { + "epoch": 4.795104261106074, + "grad_norm": 0.08035024202485781, + "learning_rate": 4.0853560166944196e-07, + "loss": 0.8498, + "step": 10578 + }, + { + "epoch": 4.795557570262919, + "grad_norm": 0.07912796108578765, + "learning_rate": 4.067329880403881e-07, + "loss": 0.8598, + "step": 10579 + }, + { + "epoch": 4.7960108794197644, + "grad_norm": 0.07773976075116856, + "learning_rate": 4.0493433979338805e-07, + "loss": 0.8613, + "step": 10580 + }, + { + "epoch": 4.796464188576609, + "grad_norm": 0.07825401756661789, + "learning_rate": 4.031396571085822e-07, + "loss": 0.869, + "step": 10581 + }, + { + "epoch": 4.796917497733454, + "grad_norm": 0.0773965919536164, + "learning_rate": 4.0134894016571114e-07, + "loss": 0.845, + "step": 10582 + }, + { + "epoch": 4.797370806890299, + "grad_norm": 0.07465401746772568, + "learning_rate": 3.9956218914412484e-07, + "loss": 0.8574, + "step": 10583 + }, + { + "epoch": 4.797824116047144, + "grad_norm": 0.07970021908177319, + "learning_rate": 3.977794042227645e-07, + "loss": 0.8437, + "step": 10584 + }, + { + "epoch": 4.798277425203989, + "grad_norm": 0.08634194852114035, + "learning_rate": 3.960005855801896e-07, + "loss": 0.8538, + "step": 10585 + }, + { + "epoch": 4.798730734360834, + "grad_norm": 0.07350104294157818, + "learning_rate": 3.942257333945465e-07, + "loss": 0.8707, + "step": 10586 + }, + { + "epoch": 4.799184043517679, + "grad_norm": 0.08220137971627617, + "learning_rate": 3.924548478435952e-07, + "loss": 0.8617, + "step": 10587 + }, + { + "epoch": 4.799637352674524, + "grad_norm": 0.07572609455653412, + "learning_rate": 3.906879291046961e-07, + "loss": 0.8376, + "step": 10588 + }, + { + "epoch": 4.800090661831369, + "grad_norm": 0.07807282391825306, + "learning_rate": 3.889249773548143e-07, + "loss": 0.8617, + "step": 10589 + }, + { + "epoch": 4.800543970988214, + "grad_norm": 0.07850968569978664, + "learning_rate": 3.871659927705063e-07, + "loss": 0.8519, + "step": 10590 + }, + { + "epoch": 4.800997280145059, + "grad_norm": 0.07347468170866071, + "learning_rate": 3.854109755279467e-07, + "loss": 0.8554, + "step": 10591 + }, + { + "epoch": 4.801450589301904, + "grad_norm": 0.07050128711719728, + "learning_rate": 3.8365992580290166e-07, + "loss": 0.8535, + "step": 10592 + }, + { + "epoch": 4.8019038984587485, + "grad_norm": 0.07136514806256974, + "learning_rate": 3.8191284377074646e-07, + "loss": 0.8498, + "step": 10593 + }, + { + "epoch": 4.802357207615594, + "grad_norm": 0.075558338851183, + "learning_rate": 3.801697296064566e-07, + "loss": 0.8712, + "step": 10594 + }, + { + "epoch": 4.802810516772439, + "grad_norm": 0.06804881187732856, + "learning_rate": 3.784305834846036e-07, + "loss": 0.8447, + "step": 10595 + }, + { + "epoch": 4.8032638259292835, + "grad_norm": 0.07205776610498552, + "learning_rate": 3.76695405579377e-07, + "loss": 0.8462, + "step": 10596 + }, + { + "epoch": 4.803717135086129, + "grad_norm": 0.07671585628715291, + "learning_rate": 3.749641960645534e-07, + "loss": 0.8611, + "step": 10597 + }, + { + "epoch": 4.804170444242974, + "grad_norm": 0.0762441015634757, + "learning_rate": 3.732369551135273e-07, + "loss": 0.8694, + "step": 10598 + }, + { + "epoch": 4.804623753399818, + "grad_norm": 0.07406659519511337, + "learning_rate": 3.71513682899276e-07, + "loss": 0.8517, + "step": 10599 + }, + { + "epoch": 4.805077062556664, + "grad_norm": 0.07744322307540814, + "learning_rate": 3.697943795943992e-07, + "loss": 0.8516, + "step": 10600 + }, + { + "epoch": 4.805530371713509, + "grad_norm": 0.08077076030860791, + "learning_rate": 3.6807904537108363e-07, + "loss": 0.8415, + "step": 10601 + }, + { + "epoch": 4.805983680870353, + "grad_norm": 0.073349962749365, + "learning_rate": 3.6636768040112957e-07, + "loss": 0.8655, + "step": 10602 + }, + { + "epoch": 4.806436990027199, + "grad_norm": 0.07776605266207415, + "learning_rate": 3.6466028485592887e-07, + "loss": 0.8399, + "step": 10603 + }, + { + "epoch": 4.806890299184044, + "grad_norm": 0.08157033467350866, + "learning_rate": 3.629568589064913e-07, + "loss": 0.8553, + "step": 10604 + }, + { + "epoch": 4.807343608340888, + "grad_norm": 0.07695257776204369, + "learning_rate": 3.6125740272341393e-07, + "loss": 0.8421, + "step": 10605 + }, + { + "epoch": 4.8077969174977335, + "grad_norm": 0.07785761415156066, + "learning_rate": 3.595619164769026e-07, + "loss": 0.8361, + "step": 10606 + }, + { + "epoch": 4.808250226654579, + "grad_norm": 0.0756377392213603, + "learning_rate": 3.578704003367683e-07, + "loss": 0.8607, + "step": 10607 + }, + { + "epoch": 4.808703535811423, + "grad_norm": 0.07249537657988868, + "learning_rate": 3.5618285447241773e-07, + "loss": 0.8525, + "step": 10608 + }, + { + "epoch": 4.8091568449682685, + "grad_norm": 0.07111079283916046, + "learning_rate": 3.544992790528712e-07, + "loss": 0.8586, + "step": 10609 + }, + { + "epoch": 4.809610154125114, + "grad_norm": 0.07071904986922418, + "learning_rate": 3.528196742467316e-07, + "loss": 0.848, + "step": 10610 + }, + { + "epoch": 4.810063463281958, + "grad_norm": 0.07378072269014667, + "learning_rate": 3.511440402222244e-07, + "loss": 0.8365, + "step": 10611 + }, + { + "epoch": 4.810516772438803, + "grad_norm": 0.0729439734318188, + "learning_rate": 3.4947237714716643e-07, + "loss": 0.8428, + "step": 10612 + }, + { + "epoch": 4.810970081595649, + "grad_norm": 0.1009101703150573, + "learning_rate": 3.478046851889794e-07, + "loss": 0.838, + "step": 10613 + }, + { + "epoch": 4.811423390752493, + "grad_norm": 0.07726375621212661, + "learning_rate": 3.4614096451468957e-07, + "loss": 0.8565, + "step": 10614 + }, + { + "epoch": 4.811876699909338, + "grad_norm": 0.06913378955560652, + "learning_rate": 3.4448121529092383e-07, + "loss": 0.8514, + "step": 10615 + }, + { + "epoch": 4.812330009066184, + "grad_norm": 0.08076261677125816, + "learning_rate": 3.428254376839091e-07, + "loss": 0.8647, + "step": 10616 + }, + { + "epoch": 4.812783318223028, + "grad_norm": 0.07585617110104936, + "learning_rate": 3.411736318594816e-07, + "loss": 0.8622, + "step": 10617 + }, + { + "epoch": 4.813236627379873, + "grad_norm": 0.07112830338869926, + "learning_rate": 3.395257979830646e-07, + "loss": 0.823, + "step": 10618 + }, + { + "epoch": 4.8136899365367185, + "grad_norm": 0.07023564688005764, + "learning_rate": 3.3788193621970387e-07, + "loss": 0.8398, + "step": 10619 + }, + { + "epoch": 4.814143245693563, + "grad_norm": 0.07321940298621722, + "learning_rate": 3.3624204673402325e-07, + "loss": 0.8707, + "step": 10620 + }, + { + "epoch": 4.814596554850408, + "grad_norm": 0.0791274006014912, + "learning_rate": 3.3460612969027806e-07, + "loss": 0.8592, + "step": 10621 + }, + { + "epoch": 4.815049864007253, + "grad_norm": 0.08176791907383746, + "learning_rate": 3.329741852523016e-07, + "loss": 0.8695, + "step": 10622 + }, + { + "epoch": 4.815503173164098, + "grad_norm": 0.07283313477737398, + "learning_rate": 3.31346213583541e-07, + "loss": 0.8557, + "step": 10623 + }, + { + "epoch": 4.815956482320943, + "grad_norm": 0.0757155323168614, + "learning_rate": 3.297222148470436e-07, + "loss": 0.8436, + "step": 10624 + }, + { + "epoch": 4.8164097914777875, + "grad_norm": 0.10626098747611726, + "learning_rate": 3.2810218920544814e-07, + "loss": 0.8507, + "step": 10625 + }, + { + "epoch": 4.816863100634633, + "grad_norm": 0.0804927543551598, + "learning_rate": 3.264861368210159e-07, + "loss": 0.8501, + "step": 10626 + }, + { + "epoch": 4.817316409791478, + "grad_norm": 0.07709050699196426, + "learning_rate": 3.2487405785559536e-07, + "loss": 0.8419, + "step": 10627 + }, + { + "epoch": 4.817769718948322, + "grad_norm": 0.07305036106450842, + "learning_rate": 3.232659524706394e-07, + "loss": 0.8536, + "step": 10628 + }, + { + "epoch": 4.818223028105168, + "grad_norm": 0.06865484307403136, + "learning_rate": 3.21661820827206e-07, + "loss": 0.8551, + "step": 10629 + }, + { + "epoch": 4.818676337262013, + "grad_norm": 0.07399136717368213, + "learning_rate": 3.2006166308595766e-07, + "loss": 0.842, + "step": 10630 + }, + { + "epoch": 4.819129646418857, + "grad_norm": 0.0768567177197009, + "learning_rate": 3.1846547940714844e-07, + "loss": 0.8456, + "step": 10631 + }, + { + "epoch": 4.819582955575703, + "grad_norm": 0.07304538025416549, + "learning_rate": 3.168732699506416e-07, + "loss": 0.8481, + "step": 10632 + }, + { + "epoch": 4.820036264732548, + "grad_norm": 0.07968180501872246, + "learning_rate": 3.152850348759051e-07, + "loss": 0.8566, + "step": 10633 + }, + { + "epoch": 4.820489573889392, + "grad_norm": 0.07125220781966457, + "learning_rate": 3.1370077434200285e-07, + "loss": 0.8536, + "step": 10634 + }, + { + "epoch": 4.8209428830462375, + "grad_norm": 0.0730712364534785, + "learning_rate": 3.121204885076079e-07, + "loss": 0.8453, + "step": 10635 + }, + { + "epoch": 4.821396192203083, + "grad_norm": 0.07211460708568221, + "learning_rate": 3.105441775309803e-07, + "loss": 0.8553, + "step": 10636 + }, + { + "epoch": 4.821849501359927, + "grad_norm": 0.07183730005728334, + "learning_rate": 3.089718415700027e-07, + "loss": 0.8803, + "step": 10637 + }, + { + "epoch": 4.8223028105167725, + "grad_norm": 0.0695076983676156, + "learning_rate": 3.0740348078214465e-07, + "loss": 0.8588, + "step": 10638 + }, + { + "epoch": 4.822756119673618, + "grad_norm": 0.07128581199071626, + "learning_rate": 3.058390953244805e-07, + "loss": 0.8488, + "step": 10639 + }, + { + "epoch": 4.823209428830462, + "grad_norm": 0.08217905512709742, + "learning_rate": 3.042786853536894e-07, + "loss": 0.8604, + "step": 10640 + }, + { + "epoch": 4.823662737987307, + "grad_norm": 0.07392238877954502, + "learning_rate": 3.027222510260552e-07, + "loss": 0.8509, + "step": 10641 + }, + { + "epoch": 4.824116047144153, + "grad_norm": 0.07452317186071598, + "learning_rate": 3.0116979249745327e-07, + "loss": 0.8549, + "step": 10642 + }, + { + "epoch": 4.824569356300997, + "grad_norm": 0.07269909478080938, + "learning_rate": 2.99621309923368e-07, + "loss": 0.8557, + "step": 10643 + }, + { + "epoch": 4.825022665457842, + "grad_norm": 0.07305827368741998, + "learning_rate": 2.980768034588888e-07, + "loss": 0.8702, + "step": 10644 + }, + { + "epoch": 4.825475974614688, + "grad_norm": 0.07630754711921747, + "learning_rate": 2.9653627325869626e-07, + "loss": 0.8589, + "step": 10645 + }, + { + "epoch": 4.825929283771532, + "grad_norm": 0.0708990351630464, + "learning_rate": 2.949997194770848e-07, + "loss": 0.8513, + "step": 10646 + }, + { + "epoch": 4.826382592928377, + "grad_norm": 0.07240778450517246, + "learning_rate": 2.9346714226794024e-07, + "loss": 0.8576, + "step": 10647 + }, + { + "epoch": 4.8268359020852225, + "grad_norm": 0.06946274272726563, + "learning_rate": 2.9193854178475756e-07, + "loss": 0.8417, + "step": 10648 + }, + { + "epoch": 4.827289211242067, + "grad_norm": 0.083641469641881, + "learning_rate": 2.9041391818063204e-07, + "loss": 0.8508, + "step": 10649 + }, + { + "epoch": 4.827742520398912, + "grad_norm": 0.07726465132188642, + "learning_rate": 2.888932716082549e-07, + "loss": 0.8342, + "step": 10650 + }, + { + "epoch": 4.8281958295557565, + "grad_norm": 0.07320169240931247, + "learning_rate": 2.8737660221992647e-07, + "loss": 0.847, + "step": 10651 + }, + { + "epoch": 4.828649138712602, + "grad_norm": 0.0673666162077393, + "learning_rate": 2.858639101675431e-07, + "loss": 0.8413, + "step": 10652 + }, + { + "epoch": 4.829102447869447, + "grad_norm": 0.07084912198162854, + "learning_rate": 2.8435519560260584e-07, + "loss": 0.8714, + "step": 10653 + }, + { + "epoch": 4.8295557570262915, + "grad_norm": 0.07858547271437243, + "learning_rate": 2.8285045867622043e-07, + "loss": 0.851, + "step": 10654 + }, + { + "epoch": 4.830009066183137, + "grad_norm": 0.079466737667044, + "learning_rate": 2.813496995390841e-07, + "loss": 0.8558, + "step": 10655 + }, + { + "epoch": 4.830462375339982, + "grad_norm": 0.07529649332141913, + "learning_rate": 2.798529183415122e-07, + "loss": 0.856, + "step": 10656 + }, + { + "epoch": 4.830915684496826, + "grad_norm": 0.07400382069625212, + "learning_rate": 2.783601152334026e-07, + "loss": 0.8545, + "step": 10657 + }, + { + "epoch": 4.831368993653672, + "grad_norm": 0.06924018259479138, + "learning_rate": 2.7687129036427116e-07, + "loss": 0.8572, + "step": 10658 + }, + { + "epoch": 4.831822302810517, + "grad_norm": 0.0762140605537139, + "learning_rate": 2.7538644388321655e-07, + "loss": 0.8397, + "step": 10659 + }, + { + "epoch": 4.832275611967361, + "grad_norm": 0.08266526690279613, + "learning_rate": 2.7390557593896417e-07, + "loss": 0.8545, + "step": 10660 + }, + { + "epoch": 4.832728921124207, + "grad_norm": 0.07059973303728835, + "learning_rate": 2.724286866798176e-07, + "loss": 0.8546, + "step": 10661 + }, + { + "epoch": 4.833182230281052, + "grad_norm": 0.07247434651489526, + "learning_rate": 2.709557762536985e-07, + "loss": 0.862, + "step": 10662 + }, + { + "epoch": 4.833635539437896, + "grad_norm": 0.07590481229895543, + "learning_rate": 2.694868448081156e-07, + "loss": 0.8534, + "step": 10663 + }, + { + "epoch": 4.8340888485947415, + "grad_norm": 0.07158351262276888, + "learning_rate": 2.680218924901956e-07, + "loss": 0.861, + "step": 10664 + }, + { + "epoch": 4.834542157751587, + "grad_norm": 0.07349852651626096, + "learning_rate": 2.665609194466523e-07, + "loss": 0.871, + "step": 10665 + }, + { + "epoch": 4.834995466908431, + "grad_norm": 0.08023785676990007, + "learning_rate": 2.651039258238042e-07, + "loss": 0.8482, + "step": 10666 + }, + { + "epoch": 4.8354487760652765, + "grad_norm": 0.07113523782983747, + "learning_rate": 2.6365091176757896e-07, + "loss": 0.8465, + "step": 10667 + }, + { + "epoch": 4.835902085222122, + "grad_norm": 0.07660718745448673, + "learning_rate": 2.622018774234958e-07, + "loss": 0.8645, + "step": 10668 + }, + { + "epoch": 4.836355394378966, + "grad_norm": 0.0778871041358519, + "learning_rate": 2.6075682293668305e-07, + "loss": 0.8615, + "step": 10669 + }, + { + "epoch": 4.836808703535811, + "grad_norm": 0.07164928095019073, + "learning_rate": 2.593157484518649e-07, + "loss": 0.8412, + "step": 10670 + }, + { + "epoch": 4.837262012692657, + "grad_norm": 0.07217912096789375, + "learning_rate": 2.5787865411337485e-07, + "loss": 0.8538, + "step": 10671 + }, + { + "epoch": 4.837715321849501, + "grad_norm": 0.07993374806933226, + "learning_rate": 2.5644554006513336e-07, + "loss": 0.8562, + "step": 10672 + }, + { + "epoch": 4.838168631006346, + "grad_norm": 0.07661848110338534, + "learning_rate": 2.5501640645067436e-07, + "loss": 0.8559, + "step": 10673 + }, + { + "epoch": 4.838621940163192, + "grad_norm": 0.07083216365925703, + "learning_rate": 2.535912534131324e-07, + "loss": 0.8458, + "step": 10674 + }, + { + "epoch": 4.839075249320036, + "grad_norm": 0.06996104339695314, + "learning_rate": 2.521700810952421e-07, + "loss": 0.858, + "step": 10675 + }, + { + "epoch": 4.839528558476881, + "grad_norm": 0.0706021759976011, + "learning_rate": 2.5075288963932966e-07, + "loss": 0.8567, + "step": 10676 + }, + { + "epoch": 4.8399818676337265, + "grad_norm": 0.0737314903076706, + "learning_rate": 2.4933967918733926e-07, + "loss": 0.8559, + "step": 10677 + }, + { + "epoch": 4.840435176790571, + "grad_norm": 0.07627172502809222, + "learning_rate": 2.479304498808066e-07, + "loss": 0.8552, + "step": 10678 + }, + { + "epoch": 4.840888485947416, + "grad_norm": 0.07753788600652443, + "learning_rate": 2.465252018608633e-07, + "loss": 0.8331, + "step": 10679 + }, + { + "epoch": 4.841341795104261, + "grad_norm": 0.07201655863189586, + "learning_rate": 2.451239352682588e-07, + "loss": 0.8591, + "step": 10680 + }, + { + "epoch": 4.841795104261106, + "grad_norm": 0.07235717034737604, + "learning_rate": 2.4372665024332996e-07, + "loss": 0.8454, + "step": 10681 + }, + { + "epoch": 4.842248413417951, + "grad_norm": 0.07216359039211634, + "learning_rate": 2.423333469260136e-07, + "loss": 0.8465, + "step": 10682 + }, + { + "epoch": 4.842701722574796, + "grad_norm": 0.07514034228594421, + "learning_rate": 2.409440254558604e-07, + "loss": 0.8445, + "step": 10683 + }, + { + "epoch": 4.843155031731641, + "grad_norm": 0.07133814850068525, + "learning_rate": 2.395586859720167e-07, + "loss": 0.8477, + "step": 10684 + }, + { + "epoch": 4.843608340888486, + "grad_norm": 0.07390365796908496, + "learning_rate": 2.3817732861322051e-07, + "loss": 0.8627, + "step": 10685 + }, + { + "epoch": 4.844061650045331, + "grad_norm": 0.07175729429883618, + "learning_rate": 2.3679995351782337e-07, + "loss": 0.8511, + "step": 10686 + }, + { + "epoch": 4.844514959202176, + "grad_norm": 0.07023921985412834, + "learning_rate": 2.3542656082377268e-07, + "loss": 0.848, + "step": 10687 + }, + { + "epoch": 4.844968268359021, + "grad_norm": 0.07157324801912039, + "learning_rate": 2.3405715066861623e-07, + "loss": 0.8415, + "step": 10688 + }, + { + "epoch": 4.845421577515866, + "grad_norm": 0.07506583809344339, + "learning_rate": 2.326917231895065e-07, + "loss": 0.8669, + "step": 10689 + }, + { + "epoch": 4.845874886672711, + "grad_norm": 0.07366659749313301, + "learning_rate": 2.3133027852319634e-07, + "loss": 0.8749, + "step": 10690 + }, + { + "epoch": 4.846328195829556, + "grad_norm": 0.07085322624549961, + "learning_rate": 2.299728168060389e-07, + "loss": 0.8544, + "step": 10691 + }, + { + "epoch": 4.846781504986401, + "grad_norm": 0.07163161911448132, + "learning_rate": 2.286193381739832e-07, + "loss": 0.8401, + "step": 10692 + }, + { + "epoch": 4.8472348141432455, + "grad_norm": 0.0707708762797177, + "learning_rate": 2.2726984276258745e-07, + "loss": 0.851, + "step": 10693 + }, + { + "epoch": 4.847688123300091, + "grad_norm": 0.07434283358870114, + "learning_rate": 2.259243307070058e-07, + "loss": 0.8539, + "step": 10694 + }, + { + "epoch": 4.848141432456936, + "grad_norm": 0.07076093972822571, + "learning_rate": 2.2458280214199712e-07, + "loss": 0.8507, + "step": 10695 + }, + { + "epoch": 4.8485947416137805, + "grad_norm": 0.07336958451814526, + "learning_rate": 2.2324525720191615e-07, + "loss": 0.8552, + "step": 10696 + }, + { + "epoch": 4.849048050770626, + "grad_norm": 0.07410594612203467, + "learning_rate": 2.219116960207268e-07, + "loss": 0.8618, + "step": 10697 + }, + { + "epoch": 4.849501359927471, + "grad_norm": 0.06890300461308638, + "learning_rate": 2.2058211873198897e-07, + "loss": 0.8513, + "step": 10698 + }, + { + "epoch": 4.849954669084315, + "grad_norm": 0.07915171053486733, + "learning_rate": 2.1925652546885835e-07, + "loss": 0.856, + "step": 10699 + }, + { + "epoch": 4.850407978241161, + "grad_norm": 0.08244794191182789, + "learning_rate": 2.179349163640998e-07, + "loss": 0.8551, + "step": 10700 + }, + { + "epoch": 4.850861287398006, + "grad_norm": 0.07262602560953242, + "learning_rate": 2.166172915500786e-07, + "loss": 0.8351, + "step": 10701 + }, + { + "epoch": 4.85131459655485, + "grad_norm": 0.07375762711241912, + "learning_rate": 2.153036511587514e-07, + "loss": 0.8672, + "step": 10702 + }, + { + "epoch": 4.851767905711696, + "grad_norm": 0.072299238150799, + "learning_rate": 2.139939953216974e-07, + "loss": 0.8554, + "step": 10703 + }, + { + "epoch": 4.85222121486854, + "grad_norm": 0.07485344088517866, + "learning_rate": 2.1268832417006501e-07, + "loss": 0.8433, + "step": 10704 + }, + { + "epoch": 4.852674524025385, + "grad_norm": 0.07692378318815267, + "learning_rate": 2.113866378346341e-07, + "loss": 0.8536, + "step": 10705 + }, + { + "epoch": 4.8531278331822305, + "grad_norm": 0.07297818510502732, + "learning_rate": 2.1008893644577145e-07, + "loss": 0.8629, + "step": 10706 + }, + { + "epoch": 4.853581142339075, + "grad_norm": 0.08302379024089969, + "learning_rate": 2.0879522013343534e-07, + "loss": 0.8541, + "step": 10707 + }, + { + "epoch": 4.85403445149592, + "grad_norm": 0.07803191467137612, + "learning_rate": 2.0750548902720657e-07, + "loss": 0.8608, + "step": 10708 + }, + { + "epoch": 4.854487760652765, + "grad_norm": 0.07303432155364714, + "learning_rate": 2.062197432562485e-07, + "loss": 0.8507, + "step": 10709 + }, + { + "epoch": 4.85494106980961, + "grad_norm": 0.0720657172391939, + "learning_rate": 2.0493798294933808e-07, + "loss": 0.8503, + "step": 10710 + }, + { + "epoch": 4.855394378966455, + "grad_norm": 0.07437182711838772, + "learning_rate": 2.0366020823484377e-07, + "loss": 0.8603, + "step": 10711 + }, + { + "epoch": 4.8558476881233, + "grad_norm": 0.07279360293771354, + "learning_rate": 2.0238641924073432e-07, + "loss": 0.8692, + "step": 10712 + }, + { + "epoch": 4.856300997280145, + "grad_norm": 0.07106302301534272, + "learning_rate": 2.011166160945921e-07, + "loss": 0.8518, + "step": 10713 + }, + { + "epoch": 4.85675430643699, + "grad_norm": 0.0808991473703073, + "learning_rate": 1.9985079892359095e-07, + "loss": 0.8482, + "step": 10714 + }, + { + "epoch": 4.857207615593835, + "grad_norm": 0.06933994864260502, + "learning_rate": 1.985889678544961e-07, + "loss": 0.847, + "step": 10715 + }, + { + "epoch": 4.85766092475068, + "grad_norm": 0.07412468859651401, + "learning_rate": 1.9733112301369094e-07, + "loss": 0.8534, + "step": 10716 + }, + { + "epoch": 4.858114233907525, + "grad_norm": 0.07632311332178879, + "learning_rate": 1.960772645271547e-07, + "loss": 0.8517, + "step": 10717 + }, + { + "epoch": 4.85856754306437, + "grad_norm": 0.07024256796405541, + "learning_rate": 1.9482739252046246e-07, + "loss": 0.8501, + "step": 10718 + }, + { + "epoch": 4.859020852221215, + "grad_norm": 0.07315079399653511, + "learning_rate": 1.9358150711878965e-07, + "loss": 0.8553, + "step": 10719 + }, + { + "epoch": 4.85947416137806, + "grad_norm": 0.075475693189387, + "learning_rate": 1.9233960844691647e-07, + "loss": 0.8465, + "step": 10720 + }, + { + "epoch": 4.859927470534905, + "grad_norm": 0.06929270134030718, + "learning_rate": 1.9110169662922784e-07, + "loss": 0.8471, + "step": 10721 + }, + { + "epoch": 4.8603807796917495, + "grad_norm": 0.07196233164573536, + "learning_rate": 1.8986777178969573e-07, + "loss": 0.87, + "step": 10722 + }, + { + "epoch": 4.860834088848595, + "grad_norm": 0.0761974107053024, + "learning_rate": 1.8863783405191015e-07, + "loss": 0.8514, + "step": 10723 + }, + { + "epoch": 4.86128739800544, + "grad_norm": 0.07715236852190413, + "learning_rate": 1.8741188353904815e-07, + "loss": 0.88, + "step": 10724 + }, + { + "epoch": 4.8617407071622845, + "grad_norm": 0.07215071630267395, + "learning_rate": 1.861899203738915e-07, + "loss": 0.8453, + "step": 10725 + }, + { + "epoch": 4.86219401631913, + "grad_norm": 0.07350771422066332, + "learning_rate": 1.8497194467882674e-07, + "loss": 0.8551, + "step": 10726 + }, + { + "epoch": 4.862647325475975, + "grad_norm": 0.07176095350706178, + "learning_rate": 1.837579565758363e-07, + "loss": 0.844, + "step": 10727 + }, + { + "epoch": 4.863100634632819, + "grad_norm": 0.08198506168102558, + "learning_rate": 1.825479561865029e-07, + "loss": 0.8648, + "step": 10728 + }, + { + "epoch": 4.863553943789665, + "grad_norm": 0.07428456173927125, + "learning_rate": 1.8134194363201407e-07, + "loss": 0.8656, + "step": 10729 + }, + { + "epoch": 4.864007252946509, + "grad_norm": 0.08163228547138762, + "learning_rate": 1.801399190331532e-07, + "loss": 0.8552, + "step": 10730 + }, + { + "epoch": 4.864460562103354, + "grad_norm": 0.07202102924447643, + "learning_rate": 1.7894188251030843e-07, + "loss": 0.861, + "step": 10731 + }, + { + "epoch": 4.8649138712602, + "grad_norm": 0.0733916190853858, + "learning_rate": 1.777478341834682e-07, + "loss": 0.8679, + "step": 10732 + }, + { + "epoch": 4.865367180417044, + "grad_norm": 0.07489245077291605, + "learning_rate": 1.7655777417221243e-07, + "loss": 0.8648, + "step": 10733 + }, + { + "epoch": 4.865820489573889, + "grad_norm": 0.0717453619722988, + "learning_rate": 1.753717025957391e-07, + "loss": 0.8597, + "step": 10734 + }, + { + "epoch": 4.8662737987307345, + "grad_norm": 0.06870521111242898, + "learning_rate": 1.741896195728332e-07, + "loss": 0.8626, + "step": 10735 + }, + { + "epoch": 4.866727107887579, + "grad_norm": 0.07787869504243577, + "learning_rate": 1.7301152522187558e-07, + "loss": 0.8601, + "step": 10736 + }, + { + "epoch": 4.867180417044424, + "grad_norm": 0.0727364990884962, + "learning_rate": 1.7183741966086965e-07, + "loss": 0.87, + "step": 10737 + }, + { + "epoch": 4.867633726201269, + "grad_norm": 0.0742387389334784, + "learning_rate": 1.7066730300739686e-07, + "loss": 0.8417, + "step": 10738 + }, + { + "epoch": 4.868087035358114, + "grad_norm": 0.06803881897548318, + "learning_rate": 1.695011753786524e-07, + "loss": 0.875, + "step": 10739 + }, + { + "epoch": 4.868540344514959, + "grad_norm": 0.07605041604558359, + "learning_rate": 1.6833903689142283e-07, + "loss": 0.8624, + "step": 10740 + }, + { + "epoch": 4.868993653671804, + "grad_norm": 0.07132342302872954, + "learning_rate": 1.671808876620995e-07, + "loss": 0.8545, + "step": 10741 + }, + { + "epoch": 4.869446962828649, + "grad_norm": 0.07176085954483293, + "learning_rate": 1.660267278066785e-07, + "loss": 0.8495, + "step": 10742 + }, + { + "epoch": 4.869900271985494, + "grad_norm": 0.07128929299253885, + "learning_rate": 1.6487655744075183e-07, + "loss": 0.8617, + "step": 10743 + }, + { + "epoch": 4.870353581142339, + "grad_norm": 0.07490505558390963, + "learning_rate": 1.6373037667951176e-07, + "loss": 0.86, + "step": 10744 + }, + { + "epoch": 4.870806890299184, + "grad_norm": 0.07822107359495459, + "learning_rate": 1.6258818563774647e-07, + "loss": 0.855, + "step": 10745 + }, + { + "epoch": 4.871260199456029, + "grad_norm": 0.07767774921353122, + "learning_rate": 1.6144998442986226e-07, + "loss": 0.8494, + "step": 10746 + }, + { + "epoch": 4.871713508612874, + "grad_norm": 0.07216211055719658, + "learning_rate": 1.6031577316983904e-07, + "loss": 0.8445, + "step": 10747 + }, + { + "epoch": 4.872166817769719, + "grad_norm": 0.07633521591662966, + "learning_rate": 1.5918555197127928e-07, + "loss": 0.8395, + "step": 10748 + }, + { + "epoch": 4.872620126926564, + "grad_norm": 0.07622888103312149, + "learning_rate": 1.5805932094737686e-07, + "loss": 0.8566, + "step": 10749 + }, + { + "epoch": 4.873073436083409, + "grad_norm": 0.07352089070407476, + "learning_rate": 1.5693708021092602e-07, + "loss": 0.862, + "step": 10750 + }, + { + "epoch": 4.8735267452402535, + "grad_norm": 0.06905761966654005, + "learning_rate": 1.5581882987432574e-07, + "loss": 0.8541, + "step": 10751 + }, + { + "epoch": 4.873980054397099, + "grad_norm": 0.07734810408521509, + "learning_rate": 1.547045700495664e-07, + "loss": 0.8477, + "step": 10752 + }, + { + "epoch": 4.874433363553944, + "grad_norm": 0.06942633572042792, + "learning_rate": 1.5359430084825211e-07, + "loss": 0.8669, + "step": 10753 + }, + { + "epoch": 4.8748866727107885, + "grad_norm": 0.07054863529725326, + "learning_rate": 1.5248802238156945e-07, + "loss": 0.8563, + "step": 10754 + }, + { + "epoch": 4.875339981867634, + "grad_norm": 0.07115085150740884, + "learning_rate": 1.5138573476032315e-07, + "loss": 0.8462, + "step": 10755 + }, + { + "epoch": 4.875793291024479, + "grad_norm": 0.07107688359643907, + "learning_rate": 1.502874380949093e-07, + "loss": 0.8491, + "step": 10756 + }, + { + "epoch": 4.876246600181323, + "grad_norm": 0.06653094239132214, + "learning_rate": 1.4919313249532441e-07, + "loss": 0.8452, + "step": 10757 + }, + { + "epoch": 4.876699909338169, + "grad_norm": 0.072300299547095, + "learning_rate": 1.4810281807116523e-07, + "loss": 0.8403, + "step": 10758 + }, + { + "epoch": 4.877153218495014, + "grad_norm": 0.0747090630444509, + "learning_rate": 1.470164949316333e-07, + "loss": 0.8645, + "step": 10759 + }, + { + "epoch": 4.877606527651858, + "grad_norm": 0.07245168357398835, + "learning_rate": 1.4593416318552155e-07, + "loss": 0.8437, + "step": 10760 + }, + { + "epoch": 4.878059836808704, + "grad_norm": 0.10715225626533686, + "learning_rate": 1.4485582294123667e-07, + "loss": 0.8589, + "step": 10761 + }, + { + "epoch": 4.878513145965549, + "grad_norm": 0.06973129274357574, + "learning_rate": 1.4378147430677226e-07, + "loss": 0.8567, + "step": 10762 + }, + { + "epoch": 4.878966455122393, + "grad_norm": 0.07190850211479011, + "learning_rate": 1.4271111738972665e-07, + "loss": 0.8656, + "step": 10763 + }, + { + "epoch": 4.8794197642792385, + "grad_norm": 0.07010577181656762, + "learning_rate": 1.41644752297303e-07, + "loss": 0.8507, + "step": 10764 + }, + { + "epoch": 4.879873073436084, + "grad_norm": 0.07421681782693203, + "learning_rate": 1.4058237913629592e-07, + "loss": 0.8392, + "step": 10765 + }, + { + "epoch": 4.880326382592928, + "grad_norm": 0.06786534354850841, + "learning_rate": 1.3952399801311357e-07, + "loss": 0.8579, + "step": 10766 + }, + { + "epoch": 4.880779691749773, + "grad_norm": 0.07278937221747114, + "learning_rate": 1.3846960903374673e-07, + "loss": 0.8421, + "step": 10767 + }, + { + "epoch": 4.881233000906619, + "grad_norm": 0.07446683234077713, + "learning_rate": 1.3741921230379985e-07, + "loss": 0.8741, + "step": 10768 + }, + { + "epoch": 4.881686310063463, + "grad_norm": 0.07258863961547853, + "learning_rate": 1.363728079284732e-07, + "loss": 0.8766, + "step": 10769 + }, + { + "epoch": 4.882139619220308, + "grad_norm": 0.07314324701401362, + "learning_rate": 1.3533039601256736e-07, + "loss": 0.8544, + "step": 10770 + }, + { + "epoch": 4.882592928377154, + "grad_norm": 0.07598097664296614, + "learning_rate": 1.3429197666047888e-07, + "loss": 0.87, + "step": 10771 + }, + { + "epoch": 4.883046237533998, + "grad_norm": 0.06940645875609563, + "learning_rate": 1.3325754997621788e-07, + "loss": 0.8628, + "step": 10772 + }, + { + "epoch": 4.883499546690843, + "grad_norm": 0.07463652942037988, + "learning_rate": 1.3222711606337258e-07, + "loss": 0.8415, + "step": 10773 + }, + { + "epoch": 4.883952855847689, + "grad_norm": 0.06904922201712146, + "learning_rate": 1.312006750251582e-07, + "loss": 0.8559, + "step": 10774 + }, + { + "epoch": 4.884406165004533, + "grad_norm": 0.07045310627735407, + "learning_rate": 1.3017822696436368e-07, + "loss": 0.8359, + "step": 10775 + }, + { + "epoch": 4.884859474161378, + "grad_norm": 0.06916058277536816, + "learning_rate": 1.291597719833959e-07, + "loss": 0.8578, + "step": 10776 + }, + { + "epoch": 4.8853127833182235, + "grad_norm": 0.07619750101344733, + "learning_rate": 1.281453101842578e-07, + "loss": 0.8613, + "step": 10777 + }, + { + "epoch": 4.885766092475068, + "grad_norm": 0.07842434085057706, + "learning_rate": 1.271348416685436e-07, + "loss": 0.8336, + "step": 10778 + }, + { + "epoch": 4.886219401631913, + "grad_norm": 0.07137842791426044, + "learning_rate": 1.261283665374613e-07, + "loss": 0.8448, + "step": 10779 + }, + { + "epoch": 4.886672710788758, + "grad_norm": 0.08205872239657327, + "learning_rate": 1.2512588489181021e-07, + "loss": 0.8564, + "step": 10780 + }, + { + "epoch": 4.887126019945603, + "grad_norm": 0.07281835039164415, + "learning_rate": 1.2412739683199448e-07, + "loss": 0.856, + "step": 10781 + }, + { + "epoch": 4.887579329102448, + "grad_norm": 0.07146858162216431, + "learning_rate": 1.2313290245800968e-07, + "loss": 0.8442, + "step": 10782 + }, + { + "epoch": 4.8880326382592925, + "grad_norm": 0.06744461454940753, + "learning_rate": 1.221424018694606e-07, + "loss": 0.8489, + "step": 10783 + }, + { + "epoch": 4.888485947416138, + "grad_norm": 0.07375554471411389, + "learning_rate": 1.2115589516554782e-07, + "loss": 0.8448, + "step": 10784 + }, + { + "epoch": 4.888939256572983, + "grad_norm": 0.0717683796219891, + "learning_rate": 1.201733824450768e-07, + "loss": 0.8642, + "step": 10785 + }, + { + "epoch": 4.889392565729827, + "grad_norm": 0.07434705372229741, + "learning_rate": 1.1919486380644441e-07, + "loss": 0.847, + "step": 10786 + }, + { + "epoch": 4.889845874886673, + "grad_norm": 0.07248890347765288, + "learning_rate": 1.1822033934765665e-07, + "loss": 0.8635, + "step": 10787 + }, + { + "epoch": 4.890299184043518, + "grad_norm": 0.07105306932936893, + "learning_rate": 1.1724980916631102e-07, + "loss": 0.8546, + "step": 10788 + }, + { + "epoch": 4.890752493200362, + "grad_norm": 0.07234318293532246, + "learning_rate": 1.1628327335960976e-07, + "loss": 0.8378, + "step": 10789 + }, + { + "epoch": 4.891205802357208, + "grad_norm": 0.07390802290436875, + "learning_rate": 1.1532073202435545e-07, + "loss": 0.8547, + "step": 10790 + }, + { + "epoch": 4.891659111514053, + "grad_norm": 0.07297352554981362, + "learning_rate": 1.1436218525694653e-07, + "loss": 0.8563, + "step": 10791 + }, + { + "epoch": 4.892112420670897, + "grad_norm": 0.07459613365070893, + "learning_rate": 1.1340763315338622e-07, + "loss": 0.8868, + "step": 10792 + }, + { + "epoch": 4.8925657298277425, + "grad_norm": 0.06908258425321039, + "learning_rate": 1.1245707580927801e-07, + "loss": 0.86, + "step": 10793 + }, + { + "epoch": 4.893019038984588, + "grad_norm": 0.07345747687536357, + "learning_rate": 1.1151051331982133e-07, + "loss": 0.846, + "step": 10794 + }, + { + "epoch": 4.893472348141432, + "grad_norm": 0.0737724804412717, + "learning_rate": 1.105679457798159e-07, + "loss": 0.8519, + "step": 10795 + }, + { + "epoch": 4.893925657298277, + "grad_norm": 0.07004231212954568, + "learning_rate": 1.0962937328366618e-07, + "loss": 0.8546, + "step": 10796 + }, + { + "epoch": 4.894378966455123, + "grad_norm": 0.07898874238530214, + "learning_rate": 1.0869479592536813e-07, + "loss": 0.8385, + "step": 10797 + }, + { + "epoch": 4.894832275611967, + "grad_norm": 0.0719455306084566, + "learning_rate": 1.0776421379852242e-07, + "loss": 0.8528, + "step": 10798 + }, + { + "epoch": 4.895285584768812, + "grad_norm": 0.06977613267398343, + "learning_rate": 1.0683762699633448e-07, + "loss": 0.8476, + "step": 10799 + }, + { + "epoch": 4.895738893925658, + "grad_norm": 0.07489617841395209, + "learning_rate": 1.0591503561160121e-07, + "loss": 0.8721, + "step": 10800 + }, + { + "epoch": 4.896192203082502, + "grad_norm": 0.06887357878827279, + "learning_rate": 1.0499643973672424e-07, + "loss": 0.8433, + "step": 10801 + }, + { + "epoch": 4.896645512239347, + "grad_norm": 0.07401814588849298, + "learning_rate": 1.040818394637011e-07, + "loss": 0.8418, + "step": 10802 + }, + { + "epoch": 4.897098821396193, + "grad_norm": 0.07277283167882766, + "learning_rate": 1.0317123488413406e-07, + "loss": 0.8447, + "step": 10803 + }, + { + "epoch": 4.897552130553037, + "grad_norm": 0.07327557804979581, + "learning_rate": 1.0226462608922571e-07, + "loss": 0.858, + "step": 10804 + }, + { + "epoch": 4.898005439709882, + "grad_norm": 0.07242870192026421, + "learning_rate": 1.0136201316977012e-07, + "loss": 0.8268, + "step": 10805 + }, + { + "epoch": 4.898458748866727, + "grad_norm": 0.0784881962964586, + "learning_rate": 1.0046339621617052e-07, + "loss": 0.8613, + "step": 10806 + }, + { + "epoch": 4.898912058023572, + "grad_norm": 0.07705861982587607, + "learning_rate": 9.956877531842158e-08, + "loss": 0.8498, + "step": 10807 + }, + { + "epoch": 4.899365367180417, + "grad_norm": 0.07384306251632183, + "learning_rate": 9.867815056612717e-08, + "loss": 0.8398, + "step": 10808 + }, + { + "epoch": 4.8998186763372615, + "grad_norm": 0.07046839918475759, + "learning_rate": 9.779152204848264e-08, + "loss": 0.8458, + "step": 10809 + }, + { + "epoch": 4.900271985494107, + "grad_norm": 0.07804020987396698, + "learning_rate": 9.690888985428804e-08, + "loss": 0.8574, + "step": 10810 + }, + { + "epoch": 4.900725294650952, + "grad_norm": 0.07365520682994448, + "learning_rate": 9.603025407193933e-08, + "loss": 0.8635, + "step": 10811 + }, + { + "epoch": 4.9011786038077965, + "grad_norm": 0.07173201530823223, + "learning_rate": 9.515561478943725e-08, + "loss": 0.8354, + "step": 10812 + }, + { + "epoch": 4.901631912964642, + "grad_norm": 0.07037525154721833, + "learning_rate": 9.428497209438281e-08, + "loss": 0.8586, + "step": 10813 + }, + { + "epoch": 4.902085222121487, + "grad_norm": 0.07144999016652055, + "learning_rate": 9.341832607396406e-08, + "loss": 0.8573, + "step": 10814 + }, + { + "epoch": 4.902538531278331, + "grad_norm": 0.07210140936399108, + "learning_rate": 9.255567681498268e-08, + "loss": 0.8747, + "step": 10815 + }, + { + "epoch": 4.902991840435177, + "grad_norm": 0.08906644127521304, + "learning_rate": 9.16970244038362e-08, + "loss": 0.8466, + "step": 10816 + }, + { + "epoch": 4.903445149592022, + "grad_norm": 0.07313125833432993, + "learning_rate": 9.084236892652697e-08, + "loss": 0.8472, + "step": 10817 + }, + { + "epoch": 4.903898458748866, + "grad_norm": 0.0766043924200107, + "learning_rate": 8.999171046863986e-08, + "loss": 0.8686, + "step": 10818 + }, + { + "epoch": 4.904351767905712, + "grad_norm": 0.07017618147199202, + "learning_rate": 8.91450491153778e-08, + "loss": 0.8419, + "step": 10819 + }, + { + "epoch": 4.904805077062557, + "grad_norm": 0.07070123938127158, + "learning_rate": 8.830238495153521e-08, + "loss": 0.844, + "step": 10820 + }, + { + "epoch": 4.905258386219401, + "grad_norm": 0.07627637633554185, + "learning_rate": 8.746371806150677e-08, + "loss": 0.8448, + "step": 10821 + }, + { + "epoch": 4.9057116953762465, + "grad_norm": 0.07488811152597905, + "learning_rate": 8.662904852928755e-08, + "loss": 0.8701, + "step": 10822 + }, + { + "epoch": 4.906165004533092, + "grad_norm": 0.07998272970800134, + "learning_rate": 8.579837643847289e-08, + "loss": 0.8693, + "step": 10823 + }, + { + "epoch": 4.906618313689936, + "grad_norm": 0.07744334892342782, + "learning_rate": 8.4971701872254e-08, + "loss": 0.8542, + "step": 10824 + }, + { + "epoch": 4.9070716228467814, + "grad_norm": 0.0758454182209573, + "learning_rate": 8.414902491343136e-08, + "loss": 0.8601, + "step": 10825 + }, + { + "epoch": 4.907524932003627, + "grad_norm": 0.07838678787294966, + "learning_rate": 8.333034564439236e-08, + "loss": 0.8658, + "step": 10826 + }, + { + "epoch": 4.907978241160471, + "grad_norm": 0.10820303237801611, + "learning_rate": 8.25156641471292e-08, + "loss": 0.8618, + "step": 10827 + }, + { + "epoch": 4.908431550317316, + "grad_norm": 0.07204971391322237, + "learning_rate": 8.170498050324327e-08, + "loss": 0.8552, + "step": 10828 + }, + { + "epoch": 4.908884859474162, + "grad_norm": 0.06845406577688544, + "learning_rate": 8.089829479391852e-08, + "loss": 0.8432, + "step": 10829 + }, + { + "epoch": 4.909338168631006, + "grad_norm": 0.07258328587785126, + "learning_rate": 8.009560709994812e-08, + "loss": 0.8511, + "step": 10830 + }, + { + "epoch": 4.909791477787851, + "grad_norm": 0.07111702746532299, + "learning_rate": 7.929691750172553e-08, + "loss": 0.8506, + "step": 10831 + }, + { + "epoch": 4.910244786944697, + "grad_norm": 0.07834831839978451, + "learning_rate": 7.850222607924452e-08, + "loss": 0.8576, + "step": 10832 + }, + { + "epoch": 4.910698096101541, + "grad_norm": 0.07095864957168942, + "learning_rate": 7.771153291209033e-08, + "loss": 0.8607, + "step": 10833 + }, + { + "epoch": 4.911151405258386, + "grad_norm": 0.07348475439115128, + "learning_rate": 7.692483807945739e-08, + "loss": 0.8425, + "step": 10834 + }, + { + "epoch": 4.9116047144152315, + "grad_norm": 0.07103893574851348, + "learning_rate": 7.614214166013156e-08, + "loss": 0.8378, + "step": 10835 + }, + { + "epoch": 4.912058023572076, + "grad_norm": 0.070858337746872, + "learning_rate": 7.536344373250348e-08, + "loss": 0.8583, + "step": 10836 + }, + { + "epoch": 4.912511332728921, + "grad_norm": 0.0754664923150713, + "learning_rate": 7.458874437456409e-08, + "loss": 0.8729, + "step": 10837 + }, + { + "epoch": 4.912964641885766, + "grad_norm": 0.07241461988415891, + "learning_rate": 7.381804366390022e-08, + "loss": 0.8636, + "step": 10838 + }, + { + "epoch": 4.913417951042611, + "grad_norm": 0.07030853136247872, + "learning_rate": 7.305134167770344e-08, + "loss": 0.8582, + "step": 10839 + }, + { + "epoch": 4.913871260199456, + "grad_norm": 0.07212520710036109, + "learning_rate": 7.228863849275235e-08, + "loss": 0.8383, + "step": 10840 + }, + { + "epoch": 4.914324569356301, + "grad_norm": 0.0739839591924685, + "learning_rate": 7.152993418544363e-08, + "loss": 0.8513, + "step": 10841 + }, + { + "epoch": 4.914777878513146, + "grad_norm": 0.07897066354767326, + "learning_rate": 7.077522883175647e-08, + "loss": 0.8604, + "step": 10842 + }, + { + "epoch": 4.915231187669991, + "grad_norm": 0.07086820123032714, + "learning_rate": 7.002452250728375e-08, + "loss": 0.8576, + "step": 10843 + }, + { + "epoch": 4.915684496826836, + "grad_norm": 0.06975599212651659, + "learning_rate": 6.927781528720534e-08, + "loss": 0.8406, + "step": 10844 + }, + { + "epoch": 4.916137805983681, + "grad_norm": 0.07191673270486008, + "learning_rate": 6.853510724630585e-08, + "loss": 0.8486, + "step": 10845 + }, + { + "epoch": 4.916591115140526, + "grad_norm": 0.07287307781157328, + "learning_rate": 6.779639845897467e-08, + "loss": 0.8527, + "step": 10846 + }, + { + "epoch": 4.917044424297371, + "grad_norm": 0.07200035406526217, + "learning_rate": 6.706168899919264e-08, + "loss": 0.8529, + "step": 10847 + }, + { + "epoch": 4.917497733454216, + "grad_norm": 0.07542311936253747, + "learning_rate": 6.633097894054086e-08, + "loss": 0.8646, + "step": 10848 + }, + { + "epoch": 4.917951042611061, + "grad_norm": 0.06694144309288673, + "learning_rate": 6.560426835620526e-08, + "loss": 0.8479, + "step": 10849 + }, + { + "epoch": 4.918404351767906, + "grad_norm": 0.07831617526823789, + "learning_rate": 6.488155731896761e-08, + "loss": 0.8626, + "step": 10850 + }, + { + "epoch": 4.9188576609247505, + "grad_norm": 0.0691587715013022, + "learning_rate": 6.416284590121002e-08, + "loss": 0.876, + "step": 10851 + }, + { + "epoch": 4.919310970081596, + "grad_norm": 0.0711611846418456, + "learning_rate": 6.344813417491491e-08, + "loss": 0.8477, + "step": 10852 + }, + { + "epoch": 4.919764279238441, + "grad_norm": 0.06986667055899655, + "learning_rate": 6.273742221165613e-08, + "loss": 0.8536, + "step": 10853 + }, + { + "epoch": 4.9202175883952854, + "grad_norm": 0.06929084131851548, + "learning_rate": 6.203071008261674e-08, + "loss": 0.842, + "step": 10854 + }, + { + "epoch": 4.920670897552131, + "grad_norm": 0.07148714362746861, + "learning_rate": 6.132799785858012e-08, + "loss": 0.8674, + "step": 10855 + }, + { + "epoch": 4.921124206708976, + "grad_norm": 0.08399171410669189, + "learning_rate": 6.062928560992554e-08, + "loss": 0.8638, + "step": 10856 + }, + { + "epoch": 4.92157751586582, + "grad_norm": 0.075344659074703, + "learning_rate": 5.993457340662367e-08, + "loss": 0.8549, + "step": 10857 + }, + { + "epoch": 4.922030825022666, + "grad_norm": 0.07443982956436951, + "learning_rate": 5.9243861318254435e-08, + "loss": 0.859, + "step": 10858 + }, + { + "epoch": 4.922484134179511, + "grad_norm": 0.07199097046057348, + "learning_rate": 5.855714941399804e-08, + "loss": 0.8586, + "step": 10859 + }, + { + "epoch": 4.922937443336355, + "grad_norm": 0.06900854237462915, + "learning_rate": 5.787443776263058e-08, + "loss": 0.8549, + "step": 10860 + }, + { + "epoch": 4.923390752493201, + "grad_norm": 0.06808001265965444, + "learning_rate": 5.7195726432524024e-08, + "loss": 0.845, + "step": 10861 + }, + { + "epoch": 4.923844061650045, + "grad_norm": 0.0701287537381092, + "learning_rate": 5.652101549165512e-08, + "loss": 0.8618, + "step": 10862 + }, + { + "epoch": 4.92429737080689, + "grad_norm": 0.07092596361305217, + "learning_rate": 5.585030500759647e-08, + "loss": 0.843, + "step": 10863 + }, + { + "epoch": 4.9247506799637355, + "grad_norm": 0.072320935714157, + "learning_rate": 5.518359504752546e-08, + "loss": 0.8475, + "step": 10864 + }, + { + "epoch": 4.92520398912058, + "grad_norm": 0.08003384409891365, + "learning_rate": 5.452088567821534e-08, + "loss": 0.8568, + "step": 10865 + }, + { + "epoch": 4.925657298277425, + "grad_norm": 0.07939673617603105, + "learning_rate": 5.38621769660308e-08, + "loss": 0.8651, + "step": 10866 + }, + { + "epoch": 4.92611060743427, + "grad_norm": 0.06940372659913556, + "learning_rate": 5.320746897695461e-08, + "loss": 0.8525, + "step": 10867 + }, + { + "epoch": 4.926563916591115, + "grad_norm": 0.06984473302130734, + "learning_rate": 5.255676177654767e-08, + "loss": 0.831, + "step": 10868 + }, + { + "epoch": 4.92701722574796, + "grad_norm": 0.07317351624538221, + "learning_rate": 5.191005542998895e-08, + "loss": 0.84, + "step": 10869 + }, + { + "epoch": 4.927470534904805, + "grad_norm": 0.07008862901470622, + "learning_rate": 5.126735000203553e-08, + "loss": 0.8471, + "step": 10870 + }, + { + "epoch": 4.92792384406165, + "grad_norm": 0.07591477116568622, + "learning_rate": 5.062864555707148e-08, + "loss": 0.8523, + "step": 10871 + }, + { + "epoch": 4.928377153218495, + "grad_norm": 0.07275357302762979, + "learning_rate": 4.999394215905451e-08, + "loss": 0.8376, + "step": 10872 + }, + { + "epoch": 4.92883046237534, + "grad_norm": 0.0736676238331784, + "learning_rate": 4.9363239871556004e-08, + "loss": 0.8606, + "step": 10873 + }, + { + "epoch": 4.929283771532185, + "grad_norm": 0.073526509806377, + "learning_rate": 4.873653875773876e-08, + "loss": 0.8418, + "step": 10874 + }, + { + "epoch": 4.92973708068903, + "grad_norm": 0.07272413327009557, + "learning_rate": 4.8113838880374796e-08, + "loss": 0.8514, + "step": 10875 + }, + { + "epoch": 4.930190389845875, + "grad_norm": 0.06914376963492898, + "learning_rate": 4.749514030182756e-08, + "loss": 0.8393, + "step": 10876 + }, + { + "epoch": 4.93064369900272, + "grad_norm": 0.06818478268072875, + "learning_rate": 4.688044308406081e-08, + "loss": 0.8495, + "step": 10877 + }, + { + "epoch": 4.931097008159565, + "grad_norm": 0.07466355211550574, + "learning_rate": 4.6269747288634205e-08, + "loss": 0.8426, + "step": 10878 + }, + { + "epoch": 4.93155031731641, + "grad_norm": 0.07704846456672652, + "learning_rate": 4.5663052976716583e-08, + "loss": 0.8515, + "step": 10879 + }, + { + "epoch": 4.9320036264732545, + "grad_norm": 0.06918626936930124, + "learning_rate": 4.506036020906379e-08, + "loss": 0.8681, + "step": 10880 + }, + { + "epoch": 4.9324569356301, + "grad_norm": 0.07971039200814443, + "learning_rate": 4.446166904604532e-08, + "loss": 0.8567, + "step": 10881 + }, + { + "epoch": 4.932910244786945, + "grad_norm": 0.06890065204788386, + "learning_rate": 4.386697954761765e-08, + "loss": 0.8561, + "step": 10882 + }, + { + "epoch": 4.9333635539437894, + "grad_norm": 0.07081104404828469, + "learning_rate": 4.327629177333759e-08, + "loss": 0.8666, + "step": 10883 + }, + { + "epoch": 4.933816863100635, + "grad_norm": 0.0706927532457192, + "learning_rate": 4.268960578237114e-08, + "loss": 0.8545, + "step": 10884 + }, + { + "epoch": 4.934270172257479, + "grad_norm": 0.07983683435035693, + "learning_rate": 4.21069216334713e-08, + "loss": 0.8567, + "step": 10885 + }, + { + "epoch": 4.934723481414324, + "grad_norm": 0.0700104068286981, + "learning_rate": 4.152823938500028e-08, + "loss": 0.8574, + "step": 10886 + }, + { + "epoch": 4.93517679057117, + "grad_norm": 0.07045846830926496, + "learning_rate": 4.095355909491172e-08, + "loss": 0.8496, + "step": 10887 + }, + { + "epoch": 4.935630099728014, + "grad_norm": 0.07041497703979514, + "learning_rate": 4.0382880820759586e-08, + "loss": 0.8509, + "step": 10888 + }, + { + "epoch": 4.936083408884859, + "grad_norm": 0.07179957747327485, + "learning_rate": 3.981620461969815e-08, + "loss": 0.858, + "step": 10889 + }, + { + "epoch": 4.936536718041705, + "grad_norm": 0.0681967141162593, + "learning_rate": 3.925353054848646e-08, + "loss": 0.8514, + "step": 10890 + }, + { + "epoch": 4.936990027198549, + "grad_norm": 0.07094936838651296, + "learning_rate": 3.869485866347944e-08, + "loss": 0.8557, + "step": 10891 + }, + { + "epoch": 4.937443336355394, + "grad_norm": 0.07295521184686665, + "learning_rate": 3.814018902062344e-08, + "loss": 0.852, + "step": 10892 + }, + { + "epoch": 4.9378966455122395, + "grad_norm": 0.0670310043136111, + "learning_rate": 3.7589521675474025e-08, + "loss": 0.8441, + "step": 10893 + }, + { + "epoch": 4.938349954669084, + "grad_norm": 0.0665121188771586, + "learning_rate": 3.704285668318264e-08, + "loss": 0.8313, + "step": 10894 + }, + { + "epoch": 4.938803263825929, + "grad_norm": 0.06865024858938337, + "learning_rate": 3.650019409849659e-08, + "loss": 0.8521, + "step": 10895 + }, + { + "epoch": 4.939256572982774, + "grad_norm": 0.07375538425349819, + "learning_rate": 3.596153397576796e-08, + "loss": 0.8425, + "step": 10896 + }, + { + "epoch": 4.939709882139619, + "grad_norm": 0.07197844176995212, + "learning_rate": 3.5426876368940265e-08, + "loss": 0.864, + "step": 10897 + }, + { + "epoch": 4.940163191296464, + "grad_norm": 0.07236009985357937, + "learning_rate": 3.489622133156623e-08, + "loss": 0.8401, + "step": 10898 + }, + { + "epoch": 4.940616500453309, + "grad_norm": 0.07162462238172404, + "learning_rate": 3.436956891679444e-08, + "loss": 0.8841, + "step": 10899 + }, + { + "epoch": 4.941069809610154, + "grad_norm": 0.07378940352550453, + "learning_rate": 3.3846919177364936e-08, + "loss": 0.8398, + "step": 10900 + }, + { + "epoch": 4.941523118766999, + "grad_norm": 0.07333982672461968, + "learning_rate": 3.332827216562251e-08, + "loss": 0.8788, + "step": 10901 + }, + { + "epoch": 4.941976427923844, + "grad_norm": 0.06797946747112323, + "learning_rate": 3.281362793351672e-08, + "loss": 0.8386, + "step": 10902 + }, + { + "epoch": 4.942429737080689, + "grad_norm": 0.07036765357216754, + "learning_rate": 3.2302986532584125e-08, + "loss": 0.8549, + "step": 10903 + }, + { + "epoch": 4.942883046237534, + "grad_norm": 0.07211749992116465, + "learning_rate": 3.1796348013974906e-08, + "loss": 0.8607, + "step": 10904 + }, + { + "epoch": 4.943336355394379, + "grad_norm": 0.0717130568504481, + "learning_rate": 3.129371242842183e-08, + "loss": 0.8612, + "step": 10905 + }, + { + "epoch": 4.943789664551224, + "grad_norm": 0.06582284992730286, + "learning_rate": 3.079507982627128e-08, + "loss": 0.8505, + "step": 10906 + }, + { + "epoch": 4.944242973708069, + "grad_norm": 0.07075824988871308, + "learning_rate": 3.030045025746109e-08, + "loss": 0.8507, + "step": 10907 + }, + { + "epoch": 4.944696282864914, + "grad_norm": 0.0722252625284295, + "learning_rate": 2.980982377152941e-08, + "loss": 0.8643, + "step": 10908 + }, + { + "epoch": 4.9451495920217585, + "grad_norm": 0.07366007736370704, + "learning_rate": 2.9323200417610276e-08, + "loss": 0.8633, + "step": 10909 + }, + { + "epoch": 4.945602901178604, + "grad_norm": 0.06943183314422842, + "learning_rate": 2.8840580244451355e-08, + "loss": 0.8441, + "step": 10910 + }, + { + "epoch": 4.946056210335449, + "grad_norm": 0.07100499589990819, + "learning_rate": 2.8361963300373994e-08, + "loss": 0.842, + "step": 10911 + }, + { + "epoch": 4.9465095194922934, + "grad_norm": 0.06842459578266459, + "learning_rate": 2.788734963332651e-08, + "loss": 0.8316, + "step": 10912 + }, + { + "epoch": 4.946962828649139, + "grad_norm": 0.07117544776874243, + "learning_rate": 2.7416739290830886e-08, + "loss": 0.8501, + "step": 10913 + }, + { + "epoch": 4.947416137805984, + "grad_norm": 0.07107969385623414, + "learning_rate": 2.6950132320031632e-08, + "loss": 0.8609, + "step": 10914 + }, + { + "epoch": 4.947869446962828, + "grad_norm": 0.0718865489541239, + "learning_rate": 2.6487528767651372e-08, + "loss": 0.8566, + "step": 10915 + }, + { + "epoch": 4.948322756119674, + "grad_norm": 0.07072636502864371, + "learning_rate": 2.6028928680026377e-08, + "loss": 0.8431, + "step": 10916 + }, + { + "epoch": 4.948776065276519, + "grad_norm": 0.06800997273592006, + "learning_rate": 2.5574332103088795e-08, + "loss": 0.8629, + "step": 10917 + }, + { + "epoch": 4.949229374433363, + "grad_norm": 0.07331771282624823, + "learning_rate": 2.5123739082362208e-08, + "loss": 0.851, + "step": 10918 + }, + { + "epoch": 4.949682683590209, + "grad_norm": 0.06763363770242352, + "learning_rate": 2.4677149662974965e-08, + "loss": 0.8403, + "step": 10919 + }, + { + "epoch": 4.950135992747054, + "grad_norm": 0.07756585958177022, + "learning_rate": 2.4234563889660166e-08, + "loss": 0.8456, + "step": 10920 + }, + { + "epoch": 4.950589301903898, + "grad_norm": 0.06847554091979027, + "learning_rate": 2.3795981806737923e-08, + "loss": 0.8501, + "step": 10921 + }, + { + "epoch": 4.9510426110607435, + "grad_norm": 0.06837190642142807, + "learning_rate": 2.336140345813309e-08, + "loss": 0.8555, + "step": 10922 + }, + { + "epoch": 4.951495920217589, + "grad_norm": 0.07195215206288685, + "learning_rate": 2.293082888737974e-08, + "loss": 0.8383, + "step": 10923 + }, + { + "epoch": 4.951949229374433, + "grad_norm": 0.06859014555321469, + "learning_rate": 2.250425813759005e-08, + "loss": 0.8557, + "step": 10924 + }, + { + "epoch": 4.952402538531278, + "grad_norm": 0.07011853890939518, + "learning_rate": 2.2081691251489845e-08, + "loss": 0.8666, + "step": 10925 + }, + { + "epoch": 4.952855847688124, + "grad_norm": 0.07138397597076719, + "learning_rate": 2.1663128271405266e-08, + "loss": 0.85, + "step": 10926 + }, + { + "epoch": 4.953309156844968, + "grad_norm": 0.07472103648273275, + "learning_rate": 2.124856923924945e-08, + "loss": 0.8778, + "step": 10927 + }, + { + "epoch": 4.953762466001813, + "grad_norm": 0.06907065592535915, + "learning_rate": 2.0838014196544743e-08, + "loss": 0.8543, + "step": 10928 + }, + { + "epoch": 4.954215775158659, + "grad_norm": 0.07327934623537491, + "learning_rate": 2.043146318440936e-08, + "loss": 0.8462, + "step": 10929 + }, + { + "epoch": 4.954669084315503, + "grad_norm": 0.07765686186871218, + "learning_rate": 2.0028916243561846e-08, + "loss": 0.8523, + "step": 10930 + }, + { + "epoch": 4.955122393472348, + "grad_norm": 0.07203794858977176, + "learning_rate": 1.9630373414316618e-08, + "loss": 0.8326, + "step": 10931 + }, + { + "epoch": 4.955575702629194, + "grad_norm": 0.07295911793550625, + "learning_rate": 1.923583473658841e-08, + "loss": 0.8566, + "step": 10932 + }, + { + "epoch": 4.956029011786038, + "grad_norm": 0.07594818434069098, + "learning_rate": 1.8845300249896726e-08, + "loss": 0.8637, + "step": 10933 + }, + { + "epoch": 4.956482320942883, + "grad_norm": 0.0692817521828546, + "learning_rate": 1.8458769993348058e-08, + "loss": 0.8728, + "step": 10934 + }, + { + "epoch": 4.9569356300997285, + "grad_norm": 0.07027617881138326, + "learning_rate": 1.8076244005653665e-08, + "loss": 0.8453, + "step": 10935 + }, + { + "epoch": 4.957388939256573, + "grad_norm": 0.06725870818846198, + "learning_rate": 1.7697722325134005e-08, + "loss": 0.8617, + "step": 10936 + }, + { + "epoch": 4.957842248413418, + "grad_norm": 0.07308098529831712, + "learning_rate": 1.7323204989692087e-08, + "loss": 0.8519, + "step": 10937 + }, + { + "epoch": 4.958295557570263, + "grad_norm": 0.07170143626176319, + "learning_rate": 1.6952692036835693e-08, + "loss": 0.8438, + "step": 10938 + }, + { + "epoch": 4.958748866727108, + "grad_norm": 0.17071528537600203, + "learning_rate": 1.6586183503677355e-08, + "loss": 0.8664, + "step": 10939 + }, + { + "epoch": 4.959202175883953, + "grad_norm": 0.07315612368093548, + "learning_rate": 1.6223679426921047e-08, + "loss": 0.8521, + "step": 10940 + }, + { + "epoch": 4.9596554850407975, + "grad_norm": 0.0745230507624495, + "learning_rate": 1.586517984287106e-08, + "loss": 0.8435, + "step": 10941 + }, + { + "epoch": 4.960108794197643, + "grad_norm": 0.06821473240109936, + "learning_rate": 1.5510684787436448e-08, + "loss": 0.8694, + "step": 10942 + }, + { + "epoch": 4.960562103354488, + "grad_norm": 0.07303275620453534, + "learning_rate": 1.5160194296117703e-08, + "loss": 0.8536, + "step": 10943 + }, + { + "epoch": 4.961015412511332, + "grad_norm": 0.06823166050286568, + "learning_rate": 1.4813708404024519e-08, + "loss": 0.8431, + "step": 10944 + }, + { + "epoch": 4.961468721668178, + "grad_norm": 0.07044496689429201, + "learning_rate": 1.4471227145844701e-08, + "loss": 0.8561, + "step": 10945 + }, + { + "epoch": 4.961922030825023, + "grad_norm": 0.0733905097223399, + "learning_rate": 1.4132750555893027e-08, + "loss": 0.8572, + "step": 10946 + }, + { + "epoch": 4.962375339981867, + "grad_norm": 0.07516949279206996, + "learning_rate": 1.3798278668057941e-08, + "loss": 0.875, + "step": 10947 + }, + { + "epoch": 4.962828649138713, + "grad_norm": 0.07472774415340931, + "learning_rate": 1.3467811515845974e-08, + "loss": 0.8427, + "step": 10948 + }, + { + "epoch": 4.963281958295558, + "grad_norm": 0.073696942089791, + "learning_rate": 1.3141349132350655e-08, + "loss": 0.8493, + "step": 10949 + }, + { + "epoch": 4.963735267452402, + "grad_norm": 0.07752289909920977, + "learning_rate": 1.2818891550265833e-08, + "loss": 0.8777, + "step": 10950 + }, + { + "epoch": 4.9641885766092475, + "grad_norm": 0.0697600272617098, + "learning_rate": 1.2500438801890113e-08, + "loss": 0.8477, + "step": 10951 + }, + { + "epoch": 4.964641885766093, + "grad_norm": 0.06884816045468711, + "learning_rate": 1.2185990919117985e-08, + "loss": 0.8512, + "step": 10952 + }, + { + "epoch": 4.965095194922937, + "grad_norm": 0.07364041066962744, + "learning_rate": 1.1875547933439813e-08, + "loss": 0.8775, + "step": 10953 + }, + { + "epoch": 4.965548504079782, + "grad_norm": 0.07283935367029004, + "learning_rate": 1.1569109875950724e-08, + "loss": 0.8497, + "step": 10954 + }, + { + "epoch": 4.966001813236628, + "grad_norm": 0.07089980181035943, + "learning_rate": 1.1266676777337282e-08, + "loss": 0.8426, + "step": 10955 + }, + { + "epoch": 4.966455122393472, + "grad_norm": 0.06720426714028835, + "learning_rate": 1.0968248667890813e-08, + "loss": 0.8491, + "step": 10956 + }, + { + "epoch": 4.966908431550317, + "grad_norm": 0.07032717443374295, + "learning_rate": 1.067382557749852e-08, + "loss": 0.8602, + "step": 10957 + }, + { + "epoch": 4.967361740707163, + "grad_norm": 0.07093863768127648, + "learning_rate": 1.0383407535652368e-08, + "loss": 0.8434, + "step": 10958 + }, + { + "epoch": 4.967815049864007, + "grad_norm": 0.07185390669365396, + "learning_rate": 1.0096994571431318e-08, + "loss": 0.8767, + "step": 10959 + }, + { + "epoch": 4.968268359020852, + "grad_norm": 0.06824684950363652, + "learning_rate": 9.814586713527974e-09, + "loss": 0.8545, + "step": 10960 + }, + { + "epoch": 4.968721668177698, + "grad_norm": 0.07306374426788341, + "learning_rate": 9.536183990217495e-09, + "loss": 0.8556, + "step": 10961 + }, + { + "epoch": 4.969174977334542, + "grad_norm": 0.06767379309027823, + "learning_rate": 9.261786429393127e-09, + "loss": 0.8371, + "step": 10962 + }, + { + "epoch": 4.969628286491387, + "grad_norm": 0.07024495702540379, + "learning_rate": 8.991394058530667e-09, + "loss": 0.8655, + "step": 10963 + }, + { + "epoch": 4.970081595648232, + "grad_norm": 0.07080115717606093, + "learning_rate": 8.725006904710675e-09, + "loss": 0.8669, + "step": 10964 + }, + { + "epoch": 4.970534904805077, + "grad_norm": 0.06987029294926282, + "learning_rate": 8.462624994614032e-09, + "loss": 0.8661, + "step": 10965 + }, + { + "epoch": 4.970988213961922, + "grad_norm": 0.07154209857697337, + "learning_rate": 8.204248354517497e-09, + "loss": 0.8345, + "step": 10966 + }, + { + "epoch": 4.9714415231187665, + "grad_norm": 0.06749566971971352, + "learning_rate": 7.949877010298146e-09, + "loss": 0.8478, + "step": 10967 + }, + { + "epoch": 4.971894832275612, + "grad_norm": 0.07833982170790801, + "learning_rate": 7.699510987437819e-09, + "loss": 0.8646, + "step": 10968 + }, + { + "epoch": 4.972348141432457, + "grad_norm": 0.06907471954970962, + "learning_rate": 7.453150311000912e-09, + "loss": 0.8364, + "step": 10969 + }, + { + "epoch": 4.9728014505893015, + "grad_norm": 0.06815883979922442, + "learning_rate": 7.210795005669901e-09, + "loss": 0.8468, + "step": 10970 + }, + { + "epoch": 4.973254759746147, + "grad_norm": 0.06946795100494477, + "learning_rate": 6.972445095714264e-09, + "loss": 0.8352, + "step": 10971 + }, + { + "epoch": 4.973708068902992, + "grad_norm": 0.07312348230538714, + "learning_rate": 6.7381006050037945e-09, + "loss": 0.8452, + "step": 10972 + }, + { + "epoch": 4.974161378059836, + "grad_norm": 0.06935052081417592, + "learning_rate": 6.5077615570130485e-09, + "loss": 0.8602, + "step": 10973 + }, + { + "epoch": 4.974614687216682, + "grad_norm": 0.07415610472792299, + "learning_rate": 6.2814279748080185e-09, + "loss": 0.8412, + "step": 10974 + }, + { + "epoch": 4.975067996373527, + "grad_norm": 0.07141189585797755, + "learning_rate": 6.05909988105502e-09, + "loss": 0.8693, + "step": 10975 + }, + { + "epoch": 4.975521305530371, + "grad_norm": 0.07438683450144334, + "learning_rate": 5.840777298025125e-09, + "loss": 0.8505, + "step": 10976 + }, + { + "epoch": 4.975974614687217, + "grad_norm": 0.06875372457519163, + "learning_rate": 5.626460247585286e-09, + "loss": 0.8602, + "step": 10977 + }, + { + "epoch": 4.976427923844062, + "grad_norm": 0.0705144078254744, + "learning_rate": 5.416148751193895e-09, + "loss": 0.8396, + "step": 10978 + }, + { + "epoch": 4.976881233000906, + "grad_norm": 0.07090595807844496, + "learning_rate": 5.209842829914102e-09, + "loss": 0.8572, + "step": 10979 + }, + { + "epoch": 4.9773345421577515, + "grad_norm": 0.06924647407922745, + "learning_rate": 5.007542504413821e-09, + "loss": 0.8193, + "step": 10980 + }, + { + "epoch": 4.977787851314597, + "grad_norm": 0.06866013116550396, + "learning_rate": 4.809247794952399e-09, + "loss": 0.8641, + "step": 10981 + }, + { + "epoch": 4.978241160471441, + "grad_norm": 0.07376646206100122, + "learning_rate": 4.614958721385065e-09, + "loss": 0.8394, + "step": 10982 + }, + { + "epoch": 4.978694469628286, + "grad_norm": 0.07077341360153261, + "learning_rate": 4.424675303176251e-09, + "loss": 0.8434, + "step": 10983 + }, + { + "epoch": 4.979147778785132, + "grad_norm": 0.06844292264727711, + "learning_rate": 4.2383975593818236e-09, + "loss": 0.8437, + "step": 10984 + }, + { + "epoch": 4.979601087941976, + "grad_norm": 0.06840698928578334, + "learning_rate": 4.056125508657971e-09, + "loss": 0.8363, + "step": 10985 + }, + { + "epoch": 4.980054397098821, + "grad_norm": 0.07465238335063269, + "learning_rate": 3.877859169256759e-09, + "loss": 0.8665, + "step": 10986 + }, + { + "epoch": 4.980507706255667, + "grad_norm": 0.06846263025309236, + "learning_rate": 3.7035985590350153e-09, + "loss": 0.847, + "step": 10987 + }, + { + "epoch": 4.980961015412511, + "grad_norm": 0.07079274498097517, + "learning_rate": 3.533343695445446e-09, + "loss": 0.8719, + "step": 10988 + }, + { + "epoch": 4.981414324569356, + "grad_norm": 0.071621426260187, + "learning_rate": 3.367094595536635e-09, + "loss": 0.8647, + "step": 10989 + }, + { + "epoch": 4.981867633726202, + "grad_norm": 0.06854323455846983, + "learning_rate": 3.204851275961929e-09, + "loss": 0.8532, + "step": 10990 + }, + { + "epoch": 4.982320942883046, + "grad_norm": 0.0682164133095486, + "learning_rate": 3.046613752970551e-09, + "loss": 0.8625, + "step": 10991 + }, + { + "epoch": 4.982774252039891, + "grad_norm": 0.06743744821191022, + "learning_rate": 2.8923820424120453e-09, + "loss": 0.8548, + "step": 10992 + }, + { + "epoch": 4.9832275611967365, + "grad_norm": 0.07142289102109214, + "learning_rate": 2.7421561597273938e-09, + "loss": 0.8641, + "step": 10993 + }, + { + "epoch": 4.983680870353581, + "grad_norm": 0.06626468793761803, + "learning_rate": 2.5959361199667797e-09, + "loss": 0.8551, + "step": 10994 + }, + { + "epoch": 4.984134179510426, + "grad_norm": 0.07394875704569635, + "learning_rate": 2.4537219377718245e-09, + "loss": 0.858, + "step": 10995 + }, + { + "epoch": 4.984587488667271, + "grad_norm": 0.07018766103106938, + "learning_rate": 2.31551362738891e-09, + "loss": 0.8672, + "step": 10996 + }, + { + "epoch": 4.985040797824116, + "grad_norm": 0.07388543527871509, + "learning_rate": 2.1813112026602966e-09, + "loss": 0.8606, + "step": 10997 + }, + { + "epoch": 4.985494106980961, + "grad_norm": 0.07071371721099061, + "learning_rate": 2.051114677024124e-09, + "loss": 0.866, + "step": 10998 + }, + { + "epoch": 4.985947416137806, + "grad_norm": 0.06817219011943554, + "learning_rate": 1.9249240635188515e-09, + "loss": 0.8579, + "step": 10999 + }, + { + "epoch": 4.986400725294651, + "grad_norm": 0.06689515259173237, + "learning_rate": 1.8027393747832577e-09, + "loss": 0.8541, + "step": 11000 + }, + { + "epoch": 4.986854034451496, + "grad_norm": 0.06772621166173957, + "learning_rate": 1.6845606230564416e-09, + "loss": 0.8495, + "step": 11001 + }, + { + "epoch": 4.987307343608341, + "grad_norm": 0.07363016046676793, + "learning_rate": 1.570387820177821e-09, + "loss": 0.8628, + "step": 11002 + }, + { + "epoch": 4.987760652765186, + "grad_norm": 0.07420404298964406, + "learning_rate": 1.4602209775738118e-09, + "loss": 0.8604, + "step": 11003 + }, + { + "epoch": 4.988213961922031, + "grad_norm": 0.06855230755391765, + "learning_rate": 1.3540601062844716e-09, + "loss": 0.8578, + "step": 11004 + }, + { + "epoch": 4.988667271078876, + "grad_norm": 0.06594972831314524, + "learning_rate": 1.2519052169368551e-09, + "loss": 0.8464, + "step": 11005 + }, + { + "epoch": 4.989120580235721, + "grad_norm": 0.07016782703040234, + "learning_rate": 1.1537563197672187e-09, + "loss": 0.8585, + "step": 11006 + }, + { + "epoch": 4.989573889392566, + "grad_norm": 0.06870143757708885, + "learning_rate": 1.0596134246032563e-09, + "loss": 0.8694, + "step": 11007 + }, + { + "epoch": 4.990027198549411, + "grad_norm": 0.06945659551505583, + "learning_rate": 9.694765408729822e-10, + "loss": 0.8476, + "step": 11008 + }, + { + "epoch": 4.9904805077062555, + "grad_norm": 0.07085237075514067, + "learning_rate": 8.8334567760473e-10, + "loss": 0.8513, + "step": 11009 + }, + { + "epoch": 4.990933816863101, + "grad_norm": 0.06947831498854283, + "learning_rate": 8.012208434271529e-10, + "loss": 0.8513, + "step": 11010 + }, + { + "epoch": 4.991387126019946, + "grad_norm": 0.07448731948227764, + "learning_rate": 7.231020465603422e-10, + "loss": 0.8416, + "step": 11011 + }, + { + "epoch": 4.99184043517679, + "grad_norm": 0.07653630064208565, + "learning_rate": 6.489892948291499e-10, + "loss": 0.8645, + "step": 11012 + }, + { + "epoch": 4.992293744333636, + "grad_norm": 0.07147019257161563, + "learning_rate": 5.788825956587474e-10, + "loss": 0.8493, + "step": 11013 + }, + { + "epoch": 4.992747053490481, + "grad_norm": 0.07342770399716086, + "learning_rate": 5.127819560701852e-10, + "loss": 0.857, + "step": 11014 + }, + { + "epoch": 4.993200362647325, + "grad_norm": 0.06857083419157299, + "learning_rate": 4.5068738268483345e-10, + "loss": 0.8531, + "step": 11015 + }, + { + "epoch": 4.993653671804171, + "grad_norm": 0.06764336104981886, + "learning_rate": 3.925988817155002e-10, + "loss": 0.867, + "step": 11016 + }, + { + "epoch": 4.994106980961016, + "grad_norm": 0.07232417431685507, + "learning_rate": 3.385164589886358e-10, + "loss": 0.8549, + "step": 11017 + }, + { + "epoch": 4.99456029011786, + "grad_norm": 0.07451797142429244, + "learning_rate": 2.8844011991324696e-10, + "loss": 0.8657, + "step": 11018 + }, + { + "epoch": 4.995013599274706, + "grad_norm": 0.06923607058604082, + "learning_rate": 2.423698695075416e-10, + "loss": 0.839, + "step": 11019 + }, + { + "epoch": 4.99546690843155, + "grad_norm": 0.06838955725282134, + "learning_rate": 2.003057123856067e-10, + "loss": 0.8797, + "step": 11020 + }, + { + "epoch": 4.995920217588395, + "grad_norm": 0.07507104471899696, + "learning_rate": 1.6224765276184885e-10, + "loss": 0.864, + "step": 11021 + }, + { + "epoch": 4.9963735267452405, + "grad_norm": 0.07052376584620196, + "learning_rate": 1.2819569444655344e-10, + "loss": 0.8659, + "step": 11022 + }, + { + "epoch": 4.996826835902085, + "grad_norm": 0.07306984353468565, + "learning_rate": 9.814984084588475e-11, + "loss": 0.8671, + "step": 11023 + }, + { + "epoch": 4.99728014505893, + "grad_norm": 0.06788362638713523, + "learning_rate": 7.211009497520849e-11, + "loss": 0.8671, + "step": 11024 + }, + { + "epoch": 4.997733454215775, + "grad_norm": 0.06853298362723625, + "learning_rate": 5.007645944132833e-11, + "loss": 0.8466, + "step": 11025 + }, + { + "epoch": 4.99818676337262, + "grad_norm": 0.07474201003510449, + "learning_rate": 3.204893645136764e-11, + "loss": 0.8577, + "step": 11026 + }, + { + "epoch": 4.998640072529465, + "grad_norm": 0.06812490779696442, + "learning_rate": 1.8027527808328616e-11, + "loss": 0.866, + "step": 11027 + }, + { + "epoch": 4.99909338168631, + "grad_norm": 0.06921788922338136, + "learning_rate": 8.012234915533157e-12, + "loss": 0.8519, + "step": 11028 + }, + { + "epoch": 4.999546690843155, + "grad_norm": 0.07168051716379939, + "learning_rate": 2.0030587810637713e-12, + "loss": 0.8543, + "step": 11029 + }, + { + "epoch": 5.0, + "grad_norm": 0.07249081317989815, + "learning_rate": 0.0, + "loss": 0.8464, + "step": 11030 + }, + { + "epoch": 5.0, + "step": 11030, + "total_flos": 1.8504084910768128e+17, + "train_loss": 0.0797391855489743, + "train_runtime": 13114.3231, + "train_samples_per_second": 430.543, + "train_steps_per_second": 0.841 + } + ], + "logging_steps": 1, + "max_steps": 11030, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8504084910768128e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}