diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5047 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.991735537190083, + "eval_steps": 500, + "global_step": 715, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.006959547629404089, + "grad_norm": 6.299945556626471, + "learning_rate": 1.111111111111111e-06, + "loss": 0.8825, + "step": 1 + }, + { + "epoch": 0.013919095258808177, + "grad_norm": 6.257876699073014, + "learning_rate": 2.222222222222222e-06, + "loss": 0.8704, + "step": 2 + }, + { + "epoch": 0.020878642888212267, + "grad_norm": 6.160373581422307, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.8713, + "step": 3 + }, + { + "epoch": 0.027838190517616355, + "grad_norm": 5.78775826291232, + "learning_rate": 4.444444444444444e-06, + "loss": 0.8598, + "step": 4 + }, + { + "epoch": 0.034797738147020446, + "grad_norm": 4.434647119841161, + "learning_rate": 5.555555555555557e-06, + "loss": 0.8172, + "step": 5 + }, + { + "epoch": 0.041757285776424534, + "grad_norm": 2.3263779022698095, + "learning_rate": 6.666666666666667e-06, + "loss": 0.7532, + "step": 6 + }, + { + "epoch": 0.04871683340582862, + "grad_norm": 4.051177439739189, + "learning_rate": 7.77777777777778e-06, + "loss": 0.7557, + "step": 7 + }, + { + "epoch": 0.05567638103523271, + "grad_norm": 4.175202353129295, + "learning_rate": 8.888888888888888e-06, + "loss": 0.7655, + "step": 8 + }, + { + "epoch": 0.0626359286646368, + "grad_norm": 3.8871866859374617, + "learning_rate": 1e-05, + "loss": 0.7274, + "step": 9 + }, + { + "epoch": 0.06959547629404089, + "grad_norm": 4.009248099328964, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.6947, + "step": 10 + }, + { + "epoch": 0.07655502392344497, + "grad_norm": 3.2381642347145796, + "learning_rate": 1.2222222222222224e-05, + "loss": 0.6795, + "step": 11 + }, + { + "epoch": 0.08351457155284907, + "grad_norm": 2.0504476986085827, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.6602, + "step": 12 + }, + { + "epoch": 0.09047411918225315, + "grad_norm": 2.384280645275452, + "learning_rate": 1.4444444444444446e-05, + "loss": 0.6394, + "step": 13 + }, + { + "epoch": 0.09743366681165724, + "grad_norm": 2.419324966834746, + "learning_rate": 1.555555555555556e-05, + "loss": 0.6282, + "step": 14 + }, + { + "epoch": 0.10439321444106132, + "grad_norm": 1.4468314839239673, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.6092, + "step": 15 + }, + { + "epoch": 0.11135276207046542, + "grad_norm": 1.1678600409520985, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.5894, + "step": 16 + }, + { + "epoch": 0.11831230969986951, + "grad_norm": 1.264682985827519, + "learning_rate": 1.888888888888889e-05, + "loss": 0.5968, + "step": 17 + }, + { + "epoch": 0.1252718573292736, + "grad_norm": 0.6754725868857698, + "learning_rate": 2e-05, + "loss": 0.576, + "step": 18 + }, + { + "epoch": 0.1322314049586777, + "grad_norm": 0.9074270406226251, + "learning_rate": 2.1111111111111114e-05, + "loss": 0.5698, + "step": 19 + }, + { + "epoch": 0.13919095258808178, + "grad_norm": 0.7979533790293932, + "learning_rate": 2.2222222222222227e-05, + "loss": 0.5588, + "step": 20 + }, + { + "epoch": 0.14615050021748585, + "grad_norm": 0.6581363594356142, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.5606, + "step": 21 + }, + { + "epoch": 0.15311004784688995, + "grad_norm": 0.6691967855765695, + 
"learning_rate": 2.444444444444445e-05, + "loss": 0.5481, + "step": 22 + }, + { + "epoch": 0.16006959547629404, + "grad_norm": 0.542444666355929, + "learning_rate": 2.5555555555555554e-05, + "loss": 0.5539, + "step": 23 + }, + { + "epoch": 0.16702914310569814, + "grad_norm": 0.6416430096126567, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.5501, + "step": 24 + }, + { + "epoch": 0.17398869073510223, + "grad_norm": 0.5329567057613817, + "learning_rate": 2.777777777777778e-05, + "loss": 0.5342, + "step": 25 + }, + { + "epoch": 0.1809482383645063, + "grad_norm": 0.6011450434974139, + "learning_rate": 2.888888888888889e-05, + "loss": 0.5348, + "step": 26 + }, + { + "epoch": 0.1879077859939104, + "grad_norm": 0.4976703306853586, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.5322, + "step": 27 + }, + { + "epoch": 0.1948673336233145, + "grad_norm": 0.5730627506660203, + "learning_rate": 3.111111111111112e-05, + "loss": 0.5213, + "step": 28 + }, + { + "epoch": 0.20182688125271858, + "grad_norm": 0.7301409032698557, + "learning_rate": 3.222222222222223e-05, + "loss": 0.5206, + "step": 29 + }, + { + "epoch": 0.20878642888212265, + "grad_norm": 1.4025503659857947, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5322, + "step": 30 + }, + { + "epoch": 0.21574597651152674, + "grad_norm": 0.8305463760946818, + "learning_rate": 3.444444444444445e-05, + "loss": 0.5176, + "step": 31 + }, + { + "epoch": 0.22270552414093084, + "grad_norm": 0.8468215550610021, + "learning_rate": 3.555555555555555e-05, + "loss": 0.5187, + "step": 32 + }, + { + "epoch": 0.22966507177033493, + "grad_norm": 0.8897899781711042, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.5206, + "step": 33 + }, + { + "epoch": 0.23662461939973903, + "grad_norm": 1.1213311013190945, + "learning_rate": 3.777777777777778e-05, + "loss": 0.5035, + "step": 34 + }, + { + "epoch": 0.2435841670291431, + "grad_norm": 1.1010420447489897, + "learning_rate": 3.888888888888889e-05, + "loss": 0.5072, + "step": 35 + }, + { + "epoch": 0.2505437146585472, + "grad_norm": 0.773476718518657, + "learning_rate": 4e-05, + "loss": 0.5077, + "step": 36 + }, + { + "epoch": 0.2575032622879513, + "grad_norm": 1.2400452716206256, + "learning_rate": 4.111111111111111e-05, + "loss": 0.5033, + "step": 37 + }, + { + "epoch": 0.2644628099173554, + "grad_norm": 0.9153450607625541, + "learning_rate": 4.222222222222223e-05, + "loss": 0.5, + "step": 38 + }, + { + "epoch": 0.2714223575467595, + "grad_norm": 0.6514251195810624, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.4947, + "step": 39 + }, + { + "epoch": 0.27838190517616357, + "grad_norm": 1.0042919223967974, + "learning_rate": 4.444444444444445e-05, + "loss": 0.5042, + "step": 40 + }, + { + "epoch": 0.28534145280556766, + "grad_norm": 1.0797096325295303, + "learning_rate": 4.555555555555556e-05, + "loss": 0.4916, + "step": 41 + }, + { + "epoch": 0.2923010004349717, + "grad_norm": 0.9905878602993525, + "learning_rate": 4.666666666666667e-05, + "loss": 0.496, + "step": 42 + }, + { + "epoch": 0.2992605480643758, + "grad_norm": 0.8676969083941743, + "learning_rate": 4.777777777777778e-05, + "loss": 0.4885, + "step": 43 + }, + { + "epoch": 0.3062200956937799, + "grad_norm": 0.7260235825278305, + "learning_rate": 4.88888888888889e-05, + "loss": 0.5005, + "step": 44 + }, + { + "epoch": 0.313179643323184, + "grad_norm": 0.9223906328687149, + "learning_rate": 5e-05, + "loss": 0.4892, + "step": 45 + }, + { + "epoch": 0.3201391909525881, + "grad_norm": 1.3832086370219072, + "learning_rate": 
5.111111111111111e-05, + "loss": 0.4997, + "step": 46 + }, + { + "epoch": 0.3270987385819922, + "grad_norm": 0.8419161235221501, + "learning_rate": 5.222222222222223e-05, + "loss": 0.4938, + "step": 47 + }, + { + "epoch": 0.33405828621139627, + "grad_norm": 1.6195340381200483, + "learning_rate": 5.333333333333333e-05, + "loss": 0.4968, + "step": 48 + }, + { + "epoch": 0.34101783384080037, + "grad_norm": 0.9275367275665349, + "learning_rate": 5.444444444444445e-05, + "loss": 0.5006, + "step": 49 + }, + { + "epoch": 0.34797738147020446, + "grad_norm": 1.7543919560909402, + "learning_rate": 5.555555555555556e-05, + "loss": 0.5007, + "step": 50 + }, + { + "epoch": 0.3549369290996085, + "grad_norm": 1.26313567421099, + "learning_rate": 5.666666666666668e-05, + "loss": 0.499, + "step": 51 + }, + { + "epoch": 0.3618964767290126, + "grad_norm": 1.8899821239654098, + "learning_rate": 5.777777777777778e-05, + "loss": 0.4868, + "step": 52 + }, + { + "epoch": 0.3688560243584167, + "grad_norm": 1.4836204192299145, + "learning_rate": 5.8888888888888896e-05, + "loss": 0.4924, + "step": 53 + }, + { + "epoch": 0.3758155719878208, + "grad_norm": 1.5371932375940351, + "learning_rate": 6.000000000000001e-05, + "loss": 0.4853, + "step": 54 + }, + { + "epoch": 0.3827751196172249, + "grad_norm": 1.066878304885815, + "learning_rate": 6.111111111111111e-05, + "loss": 0.4823, + "step": 55 + }, + { + "epoch": 0.389734667246629, + "grad_norm": 1.234430905173555, + "learning_rate": 6.222222222222223e-05, + "loss": 0.4848, + "step": 56 + }, + { + "epoch": 0.39669421487603307, + "grad_norm": 1.0923409666404706, + "learning_rate": 6.333333333333333e-05, + "loss": 0.494, + "step": 57 + }, + { + "epoch": 0.40365376250543716, + "grad_norm": 0.9800617899091041, + "learning_rate": 6.444444444444446e-05, + "loss": 0.4825, + "step": 58 + }, + { + "epoch": 0.41061331013484126, + "grad_norm": 0.9212766482645198, + "learning_rate": 6.555555555555556e-05, + "loss": 0.4691, + "step": 59 + }, + { + "epoch": 0.4175728577642453, + "grad_norm": 1.167155227826628, + "learning_rate": 6.666666666666667e-05, + "loss": 0.4815, + "step": 60 + }, + { + "epoch": 0.4245324053936494, + "grad_norm": 1.5791917226157102, + "learning_rate": 6.777777777777778e-05, + "loss": 0.4943, + "step": 61 + }, + { + "epoch": 0.4314919530230535, + "grad_norm": 1.0997180103791135, + "learning_rate": 6.88888888888889e-05, + "loss": 0.4871, + "step": 62 + }, + { + "epoch": 0.4384515006524576, + "grad_norm": 1.2130023175807059, + "learning_rate": 7.000000000000001e-05, + "loss": 0.4855, + "step": 63 + }, + { + "epoch": 0.4454110482818617, + "grad_norm": 1.6270954136094906, + "learning_rate": 7.11111111111111e-05, + "loss": 0.4877, + "step": 64 + }, + { + "epoch": 0.45237059591126577, + "grad_norm": 1.1304632174516827, + "learning_rate": 7.222222222222223e-05, + "loss": 0.4795, + "step": 65 + }, + { + "epoch": 0.45933014354066987, + "grad_norm": 1.32786525260077, + "learning_rate": 7.333333333333333e-05, + "loss": 0.4815, + "step": 66 + }, + { + "epoch": 0.46628969117007396, + "grad_norm": 0.6938586247846547, + "learning_rate": 7.444444444444446e-05, + "loss": 0.4711, + "step": 67 + }, + { + "epoch": 0.47324923879947806, + "grad_norm": 1.3238232457797845, + "learning_rate": 7.555555555555556e-05, + "loss": 0.4823, + "step": 68 + }, + { + "epoch": 0.4802087864288821, + "grad_norm": 0.7731792383221893, + "learning_rate": 7.666666666666668e-05, + "loss": 0.4769, + "step": 69 + }, + { + "epoch": 0.4871683340582862, + "grad_norm": 0.714435674612326, + "learning_rate": 
7.777777777777778e-05, + "loss": 0.47, + "step": 70 + }, + { + "epoch": 0.4941278816876903, + "grad_norm": 0.727161798048739, + "learning_rate": 7.88888888888889e-05, + "loss": 0.4748, + "step": 71 + }, + { + "epoch": 0.5010874293170944, + "grad_norm": 0.7822239107856425, + "learning_rate": 8e-05, + "loss": 0.4735, + "step": 72 + }, + { + "epoch": 0.5080469769464985, + "grad_norm": 0.9159063781364695, + "learning_rate": 7.999952257304926e-05, + "loss": 0.4585, + "step": 73 + }, + { + "epoch": 0.5150065245759026, + "grad_norm": 1.4014617788300159, + "learning_rate": 7.99980903035939e-05, + "loss": 0.4817, + "step": 74 + }, + { + "epoch": 0.5219660722053067, + "grad_norm": 0.9697910698942601, + "learning_rate": 7.999570322582408e-05, + "loss": 0.4719, + "step": 75 + }, + { + "epoch": 0.5289256198347108, + "grad_norm": 1.2780959714818068, + "learning_rate": 7.99923613967226e-05, + "loss": 0.4744, + "step": 76 + }, + { + "epoch": 0.5358851674641149, + "grad_norm": 0.9675381526583897, + "learning_rate": 7.99880648960634e-05, + "loss": 0.4704, + "step": 77 + }, + { + "epoch": 0.542844715093519, + "grad_norm": 1.047833737067459, + "learning_rate": 7.998281382640975e-05, + "loss": 0.4654, + "step": 78 + }, + { + "epoch": 0.549804262722923, + "grad_norm": 1.2845937442452149, + "learning_rate": 7.997660831311176e-05, + "loss": 0.475, + "step": 79 + }, + { + "epoch": 0.5567638103523271, + "grad_norm": 0.8772171829670746, + "learning_rate": 7.996944850430339e-05, + "loss": 0.4656, + "step": 80 + }, + { + "epoch": 0.5637233579817312, + "grad_norm": 0.741967780268622, + "learning_rate": 7.996133457089894e-05, + "loss": 0.4575, + "step": 81 + }, + { + "epoch": 0.5706829056111353, + "grad_norm": 0.8708734610216243, + "learning_rate": 7.99522667065889e-05, + "loss": 0.4673, + "step": 82 + }, + { + "epoch": 0.5776424532405393, + "grad_norm": 0.9611160209126256, + "learning_rate": 7.994224512783544e-05, + "loss": 0.4644, + "step": 83 + }, + { + "epoch": 0.5846020008699434, + "grad_norm": 1.2059285045807202, + "learning_rate": 7.993127007386715e-05, + "loss": 0.4782, + "step": 84 + }, + { + "epoch": 0.5915615484993475, + "grad_norm": 1.0796995800628297, + "learning_rate": 7.991934180667333e-05, + "loss": 0.4642, + "step": 85 + }, + { + "epoch": 0.5985210961287516, + "grad_norm": 1.0316521924490913, + "learning_rate": 7.990646061099782e-05, + "loss": 0.4646, + "step": 86 + }, + { + "epoch": 0.6054806437581557, + "grad_norm": 0.8832150277973638, + "learning_rate": 7.989262679433211e-05, + "loss": 0.4626, + "step": 87 + }, + { + "epoch": 0.6124401913875598, + "grad_norm": 0.7634910217249218, + "learning_rate": 7.987784068690804e-05, + "loss": 0.4626, + "step": 88 + }, + { + "epoch": 0.6193997390169639, + "grad_norm": 1.1086418661133017, + "learning_rate": 7.986210264168991e-05, + "loss": 0.4521, + "step": 89 + }, + { + "epoch": 0.626359286646368, + "grad_norm": 0.6778528235443292, + "learning_rate": 7.98454130343661e-05, + "loss": 0.4606, + "step": 90 + }, + { + "epoch": 0.6333188342757721, + "grad_norm": 0.7098255147206154, + "learning_rate": 7.982777226334e-05, + "loss": 0.4546, + "step": 91 + }, + { + "epoch": 0.6402783819051762, + "grad_norm": 0.7512375219693761, + "learning_rate": 7.980918074972059e-05, + "loss": 0.4526, + "step": 92 + }, + { + "epoch": 0.6472379295345803, + "grad_norm": 0.4955536043933238, + "learning_rate": 7.978963893731235e-05, + "loss": 0.4514, + "step": 93 + }, + { + "epoch": 0.6541974771639844, + "grad_norm": 0.6854584128464718, + "learning_rate": 7.976914729260468e-05, + "loss": 
0.4656, + "step": 94 + }, + { + "epoch": 0.6611570247933884, + "grad_norm": 0.6020857806767794, + "learning_rate": 7.974770630476077e-05, + "loss": 0.4539, + "step": 95 + }, + { + "epoch": 0.6681165724227925, + "grad_norm": 0.5198959190719997, + "learning_rate": 7.972531648560587e-05, + "loss": 0.4522, + "step": 96 + }, + { + "epoch": 0.6750761200521966, + "grad_norm": 0.8318026218834386, + "learning_rate": 7.970197836961513e-05, + "loss": 0.4623, + "step": 97 + }, + { + "epoch": 0.6820356676816007, + "grad_norm": 0.9109802442285713, + "learning_rate": 7.967769251390083e-05, + "loss": 0.4559, + "step": 98 + }, + { + "epoch": 0.6889952153110048, + "grad_norm": 1.25243965937425, + "learning_rate": 7.96524594981991e-05, + "loss": 0.4626, + "step": 99 + }, + { + "epoch": 0.6959547629404089, + "grad_norm": 0.868516955234305, + "learning_rate": 7.9626279924856e-05, + "loss": 0.4569, + "step": 100 + }, + { + "epoch": 0.702914310569813, + "grad_norm": 0.48520097270425405, + "learning_rate": 7.959915441881322e-05, + "loss": 0.4515, + "step": 101 + }, + { + "epoch": 0.709873858199217, + "grad_norm": 0.5480517510335293, + "learning_rate": 7.957108362759316e-05, + "loss": 0.4544, + "step": 102 + }, + { + "epoch": 0.7168334058286211, + "grad_norm": 0.8911184240263139, + "learning_rate": 7.954206822128343e-05, + "loss": 0.4635, + "step": 103 + }, + { + "epoch": 0.7237929534580252, + "grad_norm": 0.8227526938281489, + "learning_rate": 7.951210889252088e-05, + "loss": 0.465, + "step": 104 + }, + { + "epoch": 0.7307525010874293, + "grad_norm": 0.5558210684070918, + "learning_rate": 7.948120635647503e-05, + "loss": 0.4487, + "step": 105 + }, + { + "epoch": 0.7377120487168334, + "grad_norm": 0.6355162909760532, + "learning_rate": 7.944936135083108e-05, + "loss": 0.4523, + "step": 106 + }, + { + "epoch": 0.7446715963462375, + "grad_norm": 0.6105345680130448, + "learning_rate": 7.941657463577225e-05, + "loss": 0.4575, + "step": 107 + }, + { + "epoch": 0.7516311439756416, + "grad_norm": 0.5678069745935661, + "learning_rate": 7.938284699396157e-05, + "loss": 0.4498, + "step": 108 + }, + { + "epoch": 0.7585906916050457, + "grad_norm": 0.5483024912339128, + "learning_rate": 7.934817923052331e-05, + "loss": 0.4549, + "step": 109 + }, + { + "epoch": 0.7655502392344498, + "grad_norm": 0.3929806004224007, + "learning_rate": 7.931257217302371e-05, + "loss": 0.4504, + "step": 110 + }, + { + "epoch": 0.7725097868638539, + "grad_norm": 0.5681787692060095, + "learning_rate": 7.927602667145121e-05, + "loss": 0.4477, + "step": 111 + }, + { + "epoch": 0.779469334493258, + "grad_norm": 0.556711524840673, + "learning_rate": 7.923854359819619e-05, + "loss": 0.4484, + "step": 112 + }, + { + "epoch": 0.786428882122662, + "grad_norm": 0.4138699309021785, + "learning_rate": 7.92001238480301e-05, + "loss": 0.447, + "step": 113 + }, + { + "epoch": 0.7933884297520661, + "grad_norm": 0.6357342110964699, + "learning_rate": 7.916076833808414e-05, + "loss": 0.4513, + "step": 114 + }, + { + "epoch": 0.8003479773814702, + "grad_norm": 0.8584704922958183, + "learning_rate": 7.91204780078274e-05, + "loss": 0.4427, + "step": 115 + }, + { + "epoch": 0.8073075250108743, + "grad_norm": 0.9871565259991888, + "learning_rate": 7.907925381904432e-05, + "loss": 0.4554, + "step": 116 + }, + { + "epoch": 0.8142670726402784, + "grad_norm": 1.0217097625637481, + "learning_rate": 7.903709675581185e-05, + "loss": 0.453, + "step": 117 + }, + { + "epoch": 0.8212266202696825, + "grad_norm": 0.7895770598500398, + "learning_rate": 7.899400782447591e-05, + "loss": 
0.4541, + "step": 118 + }, + { + "epoch": 0.8281861678990866, + "grad_norm": 0.5874040536712771, + "learning_rate": 7.894998805362737e-05, + "loss": 0.4423, + "step": 119 + }, + { + "epoch": 0.8351457155284906, + "grad_norm": 0.6690541560849889, + "learning_rate": 7.890503849407742e-05, + "loss": 0.4519, + "step": 120 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.6865319922768905, + "learning_rate": 7.885916021883268e-05, + "loss": 0.4455, + "step": 121 + }, + { + "epoch": 0.8490648107872988, + "grad_norm": 0.5200700598023363, + "learning_rate": 7.881235432306936e-05, + "loss": 0.4407, + "step": 122 + }, + { + "epoch": 0.8560243584167029, + "grad_norm": 0.4344010301700919, + "learning_rate": 7.876462192410727e-05, + "loss": 0.4465, + "step": 123 + }, + { + "epoch": 0.862983906046107, + "grad_norm": 0.6677648559236202, + "learning_rate": 7.871596416138312e-05, + "loss": 0.4497, + "step": 124 + }, + { + "epoch": 0.8699434536755111, + "grad_norm": 0.4959133192072294, + "learning_rate": 7.866638219642324e-05, + "loss": 0.4412, + "step": 125 + }, + { + "epoch": 0.8769030013049152, + "grad_norm": 0.3302086455708142, + "learning_rate": 7.861587721281598e-05, + "loss": 0.4341, + "step": 126 + }, + { + "epoch": 0.8838625489343193, + "grad_norm": 0.4127575574993749, + "learning_rate": 7.856445041618333e-05, + "loss": 0.4403, + "step": 127 + }, + { + "epoch": 0.8908220965637234, + "grad_norm": 0.4079306482235732, + "learning_rate": 7.851210303415225e-05, + "loss": 0.45, + "step": 128 + }, + { + "epoch": 0.8977816441931274, + "grad_norm": 0.3629235803836879, + "learning_rate": 7.845883631632527e-05, + "loss": 0.4371, + "step": 129 + }, + { + "epoch": 0.9047411918225315, + "grad_norm": 0.3281071553183999, + "learning_rate": 7.840465153425074e-05, + "loss": 0.4342, + "step": 130 + }, + { + "epoch": 0.9117007394519356, + "grad_norm": 0.3939672640729078, + "learning_rate": 7.83495499813924e-05, + "loss": 0.4393, + "step": 131 + }, + { + "epoch": 0.9186602870813397, + "grad_norm": 0.4669777553758287, + "learning_rate": 7.829353297309857e-05, + "loss": 0.4378, + "step": 132 + }, + { + "epoch": 0.9256198347107438, + "grad_norm": 0.5191309992385434, + "learning_rate": 7.823660184657075e-05, + "loss": 0.4419, + "step": 133 + }, + { + "epoch": 0.9325793823401479, + "grad_norm": 0.4371861252468821, + "learning_rate": 7.817875796083164e-05, + "loss": 0.4442, + "step": 134 + }, + { + "epoch": 0.939538929969552, + "grad_norm": 0.6968595662983466, + "learning_rate": 7.812000269669271e-05, + "loss": 0.4448, + "step": 135 + }, + { + "epoch": 0.9464984775989561, + "grad_norm": 0.9915151230783144, + "learning_rate": 7.806033745672132e-05, + "loss": 0.4459, + "step": 136 + }, + { + "epoch": 0.9534580252283602, + "grad_norm": 1.1614516148599872, + "learning_rate": 7.799976366520714e-05, + "loss": 0.4458, + "step": 137 + }, + { + "epoch": 0.9604175728577642, + "grad_norm": 0.7103915477953243, + "learning_rate": 7.793828276812819e-05, + "loss": 0.4413, + "step": 138 + }, + { + "epoch": 0.9673771204871683, + "grad_norm": 0.6607811748107033, + "learning_rate": 7.787589623311635e-05, + "loss": 0.4374, + "step": 139 + }, + { + "epoch": 0.9743366681165724, + "grad_norm": 0.669847981333871, + "learning_rate": 7.781260554942226e-05, + "loss": 0.4452, + "step": 140 + }, + { + "epoch": 0.9812962157459765, + "grad_norm": 0.47755885581992924, + "learning_rate": 7.774841222787983e-05, + "loss": 0.4439, + "step": 141 + }, + { + "epoch": 0.9882557633753806, + "grad_norm": 0.4723668370103968, + "learning_rate": 
7.768331780087017e-05, + "loss": 0.4462, + "step": 142 + }, + { + "epoch": 0.9952153110047847, + "grad_norm": 0.5073519260421897, + "learning_rate": 7.761732382228494e-05, + "loss": 0.4406, + "step": 143 + }, + { + "epoch": 1.0060896041757286, + "grad_norm": 0.4347621708894711, + "learning_rate": 7.755043186748936e-05, + "loss": 0.4218, + "step": 144 + }, + { + "epoch": 1.0130491518051328, + "grad_norm": 0.4797475573338107, + "learning_rate": 7.748264353328451e-05, + "loss": 0.4078, + "step": 145 + }, + { + "epoch": 1.0200086994345368, + "grad_norm": 0.5279297958588075, + "learning_rate": 7.741396043786929e-05, + "loss": 0.4191, + "step": 146 + }, + { + "epoch": 1.0269682470639407, + "grad_norm": 0.5909819052892469, + "learning_rate": 7.734438422080174e-05, + "loss": 0.4168, + "step": 147 + }, + { + "epoch": 1.033927794693345, + "grad_norm": 0.6495659004492463, + "learning_rate": 7.727391654295991e-05, + "loss": 0.4194, + "step": 148 + }, + { + "epoch": 1.040887342322749, + "grad_norm": 0.587326465562309, + "learning_rate": 7.720255908650222e-05, + "loss": 0.4212, + "step": 149 + }, + { + "epoch": 1.0478468899521531, + "grad_norm": 0.45607673329416626, + "learning_rate": 7.713031355482734e-05, + "loss": 0.4074, + "step": 150 + }, + { + "epoch": 1.0548064375815571, + "grad_norm": 0.3531037620398402, + "learning_rate": 7.705718167253345e-05, + "loss": 0.4136, + "step": 151 + }, + { + "epoch": 1.0617659852109613, + "grad_norm": 0.4297476754808921, + "learning_rate": 7.698316518537713e-05, + "loss": 0.417, + "step": 152 + }, + { + "epoch": 1.0687255328403653, + "grad_norm": 0.6072156953171759, + "learning_rate": 7.690826586023165e-05, + "loss": 0.4163, + "step": 153 + }, + { + "epoch": 1.0756850804697695, + "grad_norm": 0.7760172331652697, + "learning_rate": 7.683248548504486e-05, + "loss": 0.4159, + "step": 154 + }, + { + "epoch": 1.0826446280991735, + "grad_norm": 0.876708332241165, + "learning_rate": 7.675582586879641e-05, + "loss": 0.4192, + "step": 155 + }, + { + "epoch": 1.0896041757285777, + "grad_norm": 0.9107175938318952, + "learning_rate": 7.667828884145465e-05, + "loss": 0.4145, + "step": 156 + }, + { + "epoch": 1.0965637233579817, + "grad_norm": 0.8653660027489845, + "learning_rate": 7.65998762539329e-05, + "loss": 0.4218, + "step": 157 + }, + { + "epoch": 1.103523270987386, + "grad_norm": 0.7850041910149821, + "learning_rate": 7.652058997804532e-05, + "loss": 0.4192, + "step": 158 + }, + { + "epoch": 1.1104828186167899, + "grad_norm": 0.5313734478672961, + "learning_rate": 7.644043190646211e-05, + "loss": 0.4118, + "step": 159 + }, + { + "epoch": 1.117442366246194, + "grad_norm": 0.5056894621317538, + "learning_rate": 7.63594039526645e-05, + "loss": 0.4143, + "step": 160 + }, + { + "epoch": 1.124401913875598, + "grad_norm": 0.766156288599041, + "learning_rate": 7.627750805089888e-05, + "loss": 0.4202, + "step": 161 + }, + { + "epoch": 1.1313614615050023, + "grad_norm": 0.7173262253497903, + "learning_rate": 7.619474615613083e-05, + "loss": 0.4085, + "step": 162 + }, + { + "epoch": 1.1383210091344063, + "grad_norm": 0.3801147393208917, + "learning_rate": 7.611112024399829e-05, + "loss": 0.4098, + "step": 163 + }, + { + "epoch": 1.1452805567638102, + "grad_norm": 0.459213719396466, + "learning_rate": 7.602663231076445e-05, + "loss": 0.4215, + "step": 164 + }, + { + "epoch": 1.1522401043932144, + "grad_norm": 0.5419991575798397, + "learning_rate": 7.594128437327017e-05, + "loss": 0.4154, + "step": 165 + }, + { + "epoch": 1.1591996520226187, + "grad_norm": 0.4710423907727317, + 
"learning_rate": 7.58550784688857e-05, + "loss": 0.4102, + "step": 166 + }, + { + "epoch": 1.1661591996520226, + "grad_norm": 0.3269293538096053, + "learning_rate": 7.576801665546214e-05, + "loss": 0.4183, + "step": 167 + }, + { + "epoch": 1.1731187472814266, + "grad_norm": 0.354719558769914, + "learning_rate": 7.568010101128229e-05, + "loss": 0.4083, + "step": 168 + }, + { + "epoch": 1.1800782949108308, + "grad_norm": 0.41454295254538065, + "learning_rate": 7.559133363501107e-05, + "loss": 0.4073, + "step": 169 + }, + { + "epoch": 1.1870378425402348, + "grad_norm": 0.41584140556124005, + "learning_rate": 7.550171664564537e-05, + "loss": 0.4184, + "step": 170 + }, + { + "epoch": 1.193997390169639, + "grad_norm": 0.42301367871950313, + "learning_rate": 7.541125218246346e-05, + "loss": 0.4129, + "step": 171 + }, + { + "epoch": 1.200956937799043, + "grad_norm": 0.28966866299551014, + "learning_rate": 7.531994240497399e-05, + "loss": 0.4078, + "step": 172 + }, + { + "epoch": 1.2079164854284472, + "grad_norm": 0.27574788938129696, + "learning_rate": 7.52277894928644e-05, + "loss": 0.4122, + "step": 173 + }, + { + "epoch": 1.2148760330578512, + "grad_norm": 0.3134277334731802, + "learning_rate": 7.513479564594888e-05, + "loss": 0.4105, + "step": 174 + }, + { + "epoch": 1.2218355806872554, + "grad_norm": 0.31141216371808733, + "learning_rate": 7.504096308411587e-05, + "loss": 0.4101, + "step": 175 + }, + { + "epoch": 1.2287951283166594, + "grad_norm": 0.309174676739755, + "learning_rate": 7.494629404727506e-05, + "loss": 0.4099, + "step": 176 + }, + { + "epoch": 1.2357546759460636, + "grad_norm": 0.3804013842457314, + "learning_rate": 7.485079079530393e-05, + "loss": 0.4065, + "step": 177 + }, + { + "epoch": 1.2427142235754676, + "grad_norm": 0.4490200434000277, + "learning_rate": 7.47544556079938e-05, + "loss": 0.4178, + "step": 178 + }, + { + "epoch": 1.2496737712048718, + "grad_norm": 0.5056109077740427, + "learning_rate": 7.465729078499541e-05, + "loss": 0.4175, + "step": 179 + }, + { + "epoch": 1.2566333188342758, + "grad_norm": 0.5728814822805163, + "learning_rate": 7.455929864576402e-05, + "loss": 0.4003, + "step": 180 + }, + { + "epoch": 1.26359286646368, + "grad_norm": 0.5364301922656747, + "learning_rate": 7.4460481529504e-05, + "loss": 0.4126, + "step": 181 + }, + { + "epoch": 1.270552414093084, + "grad_norm": 0.48400372508161793, + "learning_rate": 7.436084179511315e-05, + "loss": 0.4111, + "step": 182 + }, + { + "epoch": 1.277511961722488, + "grad_norm": 0.48691704971428823, + "learning_rate": 7.426038182112613e-05, + "loss": 0.4192, + "step": 183 + }, + { + "epoch": 1.2844715093518921, + "grad_norm": 0.4326376870735781, + "learning_rate": 7.415910400565795e-05, + "loss": 0.4071, + "step": 184 + }, + { + "epoch": 1.2914310569812963, + "grad_norm": 0.3854589354965239, + "learning_rate": 7.405701076634649e-05, + "loss": 0.4132, + "step": 185 + }, + { + "epoch": 1.2983906046107003, + "grad_norm": 0.3589751690112129, + "learning_rate": 7.395410454029498e-05, + "loss": 0.4141, + "step": 186 + }, + { + "epoch": 1.3053501522401043, + "grad_norm": 0.3798549380531171, + "learning_rate": 7.385038778401367e-05, + "loss": 0.4109, + "step": 187 + }, + { + "epoch": 1.3123096998695085, + "grad_norm": 0.5063041641106548, + "learning_rate": 7.374586297336134e-05, + "loss": 0.4121, + "step": 188 + }, + { + "epoch": 1.3192692474989125, + "grad_norm": 0.5499801181881762, + "learning_rate": 7.364053260348603e-05, + "loss": 0.4131, + "step": 189 + }, + { + "epoch": 1.3262287951283167, + "grad_norm": 
0.538473535319422, + "learning_rate": 7.353439918876565e-05, + "loss": 0.4146, + "step": 190 + }, + { + "epoch": 1.3331883427577207, + "grad_norm": 0.5133206548316543, + "learning_rate": 7.342746526274779e-05, + "loss": 0.41, + "step": 191 + }, + { + "epoch": 1.340147890387125, + "grad_norm": 0.4474460367465246, + "learning_rate": 7.331973337808937e-05, + "loss": 0.4122, + "step": 192 + }, + { + "epoch": 1.3471074380165289, + "grad_norm": 0.44569022437909495, + "learning_rate": 7.321120610649567e-05, + "loss": 0.408, + "step": 193 + }, + { + "epoch": 1.354066985645933, + "grad_norm": 0.4721240876904583, + "learning_rate": 7.310188603865888e-05, + "loss": 0.4091, + "step": 194 + }, + { + "epoch": 1.361026533275337, + "grad_norm": 0.4670666827700711, + "learning_rate": 7.299177578419634e-05, + "loss": 0.4092, + "step": 195 + }, + { + "epoch": 1.3679860809047413, + "grad_norm": 0.3702037932288681, + "learning_rate": 7.288087797158822e-05, + "loss": 0.4097, + "step": 196 + }, + { + "epoch": 1.3749456285341453, + "grad_norm": 0.330387714721557, + "learning_rate": 7.276919524811472e-05, + "loss": 0.4104, + "step": 197 + }, + { + "epoch": 1.3819051761635492, + "grad_norm": 0.3875230383248599, + "learning_rate": 7.265673027979295e-05, + "loss": 0.4129, + "step": 198 + }, + { + "epoch": 1.3888647237929534, + "grad_norm": 0.3612694249655593, + "learning_rate": 7.254348575131328e-05, + "loss": 0.4106, + "step": 199 + }, + { + "epoch": 1.3958242714223577, + "grad_norm": 0.2763726751232723, + "learning_rate": 7.242946436597518e-05, + "loss": 0.4116, + "step": 200 + }, + { + "epoch": 1.4027838190517616, + "grad_norm": 0.37611767937513824, + "learning_rate": 7.231466884562275e-05, + "loss": 0.4086, + "step": 201 + }, + { + "epoch": 1.4097433666811656, + "grad_norm": 0.5071472574190108, + "learning_rate": 7.21991019305798e-05, + "loss": 0.411, + "step": 202 + }, + { + "epoch": 1.4167029143105698, + "grad_norm": 0.4340052816154503, + "learning_rate": 7.20827663795843e-05, + "loss": 0.4079, + "step": 203 + }, + { + "epoch": 1.423662461939974, + "grad_norm": 0.3712380468999662, + "learning_rate": 7.19656649697226e-05, + "loss": 0.4056, + "step": 204 + }, + { + "epoch": 1.430622009569378, + "grad_norm": 0.3225866140544311, + "learning_rate": 7.184780049636318e-05, + "loss": 0.4062, + "step": 205 + }, + { + "epoch": 1.437581557198782, + "grad_norm": 0.3106099934478539, + "learning_rate": 7.172917577308984e-05, + "loss": 0.4062, + "step": 206 + }, + { + "epoch": 1.4445411048281862, + "grad_norm": 0.4355158685775427, + "learning_rate": 7.160979363163456e-05, + "loss": 0.4142, + "step": 207 + }, + { + "epoch": 1.4515006524575902, + "grad_norm": 0.4793783049280065, + "learning_rate": 7.148965692180994e-05, + "loss": 0.399, + "step": 208 + }, + { + "epoch": 1.4584602000869944, + "grad_norm": 0.3726750358936686, + "learning_rate": 7.136876851144113e-05, + "loss": 0.4132, + "step": 209 + }, + { + "epoch": 1.4654197477163984, + "grad_norm": 0.31513705424244864, + "learning_rate": 7.124713128629739e-05, + "loss": 0.4058, + "step": 210 + }, + { + "epoch": 1.4723792953458026, + "grad_norm": 0.3159413572790412, + "learning_rate": 7.11247481500232e-05, + "loss": 0.4041, + "step": 211 + }, + { + "epoch": 1.4793388429752066, + "grad_norm": 0.33778666639216565, + "learning_rate": 7.100162202406891e-05, + "loss": 0.4147, + "step": 212 + }, + { + "epoch": 1.4862983906046108, + "grad_norm": 0.3268790661878625, + "learning_rate": 7.08777558476211e-05, + "loss": 0.4086, + "step": 213 + }, + { + "epoch": 1.4932579382340148, + 
"grad_norm": 0.32242776651704286, + "learning_rate": 7.075315257753229e-05, + "loss": 0.4148, + "step": 214 + }, + { + "epoch": 1.500217485863419, + "grad_norm": 0.31567118569033087, + "learning_rate": 7.062781518825047e-05, + "loss": 0.4137, + "step": 215 + }, + { + "epoch": 1.507177033492823, + "grad_norm": 0.3612184397000591, + "learning_rate": 7.050174667174799e-05, + "loss": 0.4097, + "step": 216 + }, + { + "epoch": 1.514136581122227, + "grad_norm": 0.4218662687852862, + "learning_rate": 7.037495003745024e-05, + "loss": 0.4084, + "step": 217 + }, + { + "epoch": 1.5210961287516311, + "grad_norm": 0.45144908695802316, + "learning_rate": 7.024742831216374e-05, + "loss": 0.4123, + "step": 218 + }, + { + "epoch": 1.5280556763810353, + "grad_norm": 0.4338699229225814, + "learning_rate": 7.011918454000391e-05, + "loss": 0.41, + "step": 219 + }, + { + "epoch": 1.5350152240104393, + "grad_norm": 0.4218475334427419, + "learning_rate": 6.99902217823224e-05, + "loss": 0.4099, + "step": 220 + }, + { + "epoch": 1.5419747716398433, + "grad_norm": 0.5384078834636578, + "learning_rate": 6.986054311763402e-05, + "loss": 0.4115, + "step": 221 + }, + { + "epoch": 1.5489343192692475, + "grad_norm": 0.5695524389564858, + "learning_rate": 6.973015164154326e-05, + "loss": 0.4057, + "step": 222 + }, + { + "epoch": 1.5558938668986517, + "grad_norm": 0.5296732996163314, + "learning_rate": 6.959905046667035e-05, + "loss": 0.4163, + "step": 223 + }, + { + "epoch": 1.5628534145280557, + "grad_norm": 0.5381554961428682, + "learning_rate": 6.946724272257699e-05, + "loss": 0.4125, + "step": 224 + }, + { + "epoch": 1.5698129621574597, + "grad_norm": 0.5541446269357019, + "learning_rate": 6.933473155569165e-05, + "loss": 0.4166, + "step": 225 + }, + { + "epoch": 1.576772509786864, + "grad_norm": 0.5249105733906559, + "learning_rate": 6.920152012923446e-05, + "loss": 0.4159, + "step": 226 + }, + { + "epoch": 1.583732057416268, + "grad_norm": 0.4702525223746907, + "learning_rate": 6.906761162314165e-05, + "loss": 0.4081, + "step": 227 + }, + { + "epoch": 1.590691605045672, + "grad_norm": 0.4018848611353379, + "learning_rate": 6.893300923398974e-05, + "loss": 0.4095, + "step": 228 + }, + { + "epoch": 1.597651152675076, + "grad_norm": 0.4387137463620228, + "learning_rate": 6.879771617491912e-05, + "loss": 0.4038, + "step": 229 + }, + { + "epoch": 1.6046107003044803, + "grad_norm": 0.5018137502153527, + "learning_rate": 6.866173567555743e-05, + "loss": 0.4007, + "step": 230 + }, + { + "epoch": 1.6115702479338843, + "grad_norm": 0.4243706106137811, + "learning_rate": 6.852507098194242e-05, + "loss": 0.4087, + "step": 231 + }, + { + "epoch": 1.6185297955632882, + "grad_norm": 0.3061005646641186, + "learning_rate": 6.838772535644451e-05, + "loss": 0.4062, + "step": 232 + }, + { + "epoch": 1.6254893431926924, + "grad_norm": 0.3094567455663344, + "learning_rate": 6.824970207768882e-05, + "loss": 0.4056, + "step": 233 + }, + { + "epoch": 1.6324488908220967, + "grad_norm": 0.27807602474077575, + "learning_rate": 6.811100444047704e-05, + "loss": 0.4026, + "step": 234 + }, + { + "epoch": 1.6394084384515006, + "grad_norm": 0.23900746560867425, + "learning_rate": 6.797163575570866e-05, + "loss": 0.4087, + "step": 235 + }, + { + "epoch": 1.6463679860809046, + "grad_norm": 0.29199115797045505, + "learning_rate": 6.783159935030197e-05, + "loss": 0.4027, + "step": 236 + }, + { + "epoch": 1.6533275337103088, + "grad_norm": 0.3184667675259614, + "learning_rate": 6.76908985671147e-05, + "loss": 0.4041, + "step": 237 + }, + { + "epoch": 
1.660287081339713, + "grad_norm": 0.29291241249140837, + "learning_rate": 6.754953676486415e-05, + "loss": 0.4079, + "step": 238 + }, + { + "epoch": 1.667246628969117, + "grad_norm": 0.2840221245598121, + "learning_rate": 6.740751731804699e-05, + "loss": 0.4011, + "step": 239 + }, + { + "epoch": 1.674206176598521, + "grad_norm": 0.27939828055160454, + "learning_rate": 6.726484361685882e-05, + "loss": 0.4019, + "step": 240 + }, + { + "epoch": 1.6811657242279252, + "grad_norm": 0.256609914729704, + "learning_rate": 6.712151906711314e-05, + "loss": 0.4048, + "step": 241 + }, + { + "epoch": 1.6881252718573294, + "grad_norm": 0.23745107974586638, + "learning_rate": 6.697754709016009e-05, + "loss": 0.4058, + "step": 242 + }, + { + "epoch": 1.6950848194867334, + "grad_norm": 0.2482380115074257, + "learning_rate": 6.683293112280475e-05, + "loss": 0.3967, + "step": 243 + }, + { + "epoch": 1.7020443671161374, + "grad_norm": 0.28622361734552443, + "learning_rate": 6.668767461722518e-05, + "loss": 0.4061, + "step": 244 + }, + { + "epoch": 1.7090039147455416, + "grad_norm": 0.32770363568621996, + "learning_rate": 6.654178104088987e-05, + "loss": 0.4033, + "step": 245 + }, + { + "epoch": 1.7159634623749458, + "grad_norm": 0.3279702359191176, + "learning_rate": 6.639525387647508e-05, + "loss": 0.4059, + "step": 246 + }, + { + "epoch": 1.7229230100043496, + "grad_norm": 0.3773747339194689, + "learning_rate": 6.62480966217817e-05, + "loss": 0.407, + "step": 247 + }, + { + "epoch": 1.7298825576337538, + "grad_norm": 0.2998244869603879, + "learning_rate": 6.610031278965168e-05, + "loss": 0.4064, + "step": 248 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.28510557083266236, + "learning_rate": 6.595190590788424e-05, + "loss": 0.4005, + "step": 249 + }, + { + "epoch": 1.743801652892562, + "grad_norm": 0.2932273577434916, + "learning_rate": 6.580287951915163e-05, + "loss": 0.4021, + "step": 250 + }, + { + "epoch": 1.750761200521966, + "grad_norm": 0.36298879202671214, + "learning_rate": 6.565323718091459e-05, + "loss": 0.4004, + "step": 251 + }, + { + "epoch": 1.7577207481513701, + "grad_norm": 0.3557701177490779, + "learning_rate": 6.550298246533735e-05, + "loss": 0.4071, + "step": 252 + }, + { + "epoch": 1.7646802957807743, + "grad_norm": 0.28338303699075845, + "learning_rate": 6.535211895920247e-05, + "loss": 0.4066, + "step": 253 + }, + { + "epoch": 1.7716398434101783, + "grad_norm": 0.24886657367644255, + "learning_rate": 6.520065026382511e-05, + "loss": 0.3955, + "step": 254 + }, + { + "epoch": 1.7785993910395823, + "grad_norm": 0.24062443339702383, + "learning_rate": 6.504857999496718e-05, + "loss": 0.406, + "step": 255 + }, + { + "epoch": 1.7855589386689865, + "grad_norm": 0.26443097823193684, + "learning_rate": 6.489591178275087e-05, + "loss": 0.4044, + "step": 256 + }, + { + "epoch": 1.7925184862983907, + "grad_norm": 0.3060220331852048, + "learning_rate": 6.474264927157216e-05, + "loss": 0.404, + "step": 257 + }, + { + "epoch": 1.7994780339277947, + "grad_norm": 0.32526179480394124, + "learning_rate": 6.45887961200137e-05, + "loss": 0.4009, + "step": 258 + }, + { + "epoch": 1.8064375815571987, + "grad_norm": 0.321679972746841, + "learning_rate": 6.443435600075757e-05, + "loss": 0.4056, + "step": 259 + }, + { + "epoch": 1.813397129186603, + "grad_norm": 0.3488328000864295, + "learning_rate": 6.42793326004975e-05, + "loss": 0.4049, + "step": 260 + }, + { + "epoch": 1.820356676816007, + "grad_norm": 0.34796885320723775, + "learning_rate": 6.412372961985097e-05, + "loss": 0.4048, + "step": 261 + 
}, + { + "epoch": 1.827316224445411, + "grad_norm": 0.36888125998311183, + "learning_rate": 6.396755077327081e-05, + "loss": 0.4132, + "step": 262 + }, + { + "epoch": 1.834275772074815, + "grad_norm": 0.42541614404670686, + "learning_rate": 6.381079978895654e-05, + "loss": 0.4026, + "step": 263 + }, + { + "epoch": 1.8412353197042193, + "grad_norm": 0.47952057555190714, + "learning_rate": 6.365348040876538e-05, + "loss": 0.4081, + "step": 264 + }, + { + "epoch": 1.8481948673336233, + "grad_norm": 0.456755628226278, + "learning_rate": 6.349559638812292e-05, + "loss": 0.4002, + "step": 265 + }, + { + "epoch": 1.8551544149630272, + "grad_norm": 0.3949978072972249, + "learning_rate": 6.333715149593351e-05, + "loss": 0.4048, + "step": 266 + }, + { + "epoch": 1.8621139625924314, + "grad_norm": 0.3684718285439735, + "learning_rate": 6.317814951449022e-05, + "loss": 0.4044, + "step": 267 + }, + { + "epoch": 1.8690735102218357, + "grad_norm": 0.28515408422430655, + "learning_rate": 6.301859423938463e-05, + "loss": 0.4021, + "step": 268 + }, + { + "epoch": 1.8760330578512396, + "grad_norm": 0.31596820921849417, + "learning_rate": 6.285848947941612e-05, + "loss": 0.3998, + "step": 269 + }, + { + "epoch": 1.8829926054806436, + "grad_norm": 0.3692222807296291, + "learning_rate": 6.26978390565011e-05, + "loss": 0.4061, + "step": 270 + }, + { + "epoch": 1.8899521531100478, + "grad_norm": 0.36126506035909883, + "learning_rate": 6.253664680558164e-05, + "loss": 0.4081, + "step": 271 + }, + { + "epoch": 1.896911700739452, + "grad_norm": 0.2273722979571688, + "learning_rate": 6.237491657453396e-05, + "loss": 0.4013, + "step": 272 + }, + { + "epoch": 1.903871248368856, + "grad_norm": 0.2249012460180903, + "learning_rate": 6.221265222407663e-05, + "loss": 0.4073, + "step": 273 + }, + { + "epoch": 1.91083079599826, + "grad_norm": 0.26278543615349254, + "learning_rate": 6.204985762767835e-05, + "loss": 0.3981, + "step": 274 + }, + { + "epoch": 1.9177903436276642, + "grad_norm": 0.23928206927128529, + "learning_rate": 6.188653667146551e-05, + "loss": 0.4005, + "step": 275 + }, + { + "epoch": 1.9247498912570684, + "grad_norm": 0.28051087087380006, + "learning_rate": 6.172269325412941e-05, + "loss": 0.4047, + "step": 276 + }, + { + "epoch": 1.9317094388864724, + "grad_norm": 0.2991219685083212, + "learning_rate": 6.15583312868332e-05, + "loss": 0.4093, + "step": 277 + }, + { + "epoch": 1.9386689865158764, + "grad_norm": 0.3242054335075169, + "learning_rate": 6.139345469311855e-05, + "loss": 0.4114, + "step": 278 + }, + { + "epoch": 1.9456285341452806, + "grad_norm": 0.3988144729091857, + "learning_rate": 6.122806740881191e-05, + "loss": 0.4081, + "step": 279 + }, + { + "epoch": 1.9525880817746848, + "grad_norm": 0.5183725945183102, + "learning_rate": 6.10621733819306e-05, + "loss": 0.4048, + "step": 280 + }, + { + "epoch": 1.9595476294040888, + "grad_norm": 0.6027727793849302, + "learning_rate": 6.089577657258863e-05, + "loss": 0.3972, + "step": 281 + }, + { + "epoch": 1.9665071770334928, + "grad_norm": 0.6004923341243484, + "learning_rate": 6.0728880952902056e-05, + "loss": 0.3993, + "step": 282 + }, + { + "epoch": 1.973466724662897, + "grad_norm": 0.5380310616547116, + "learning_rate": 6.056149050689419e-05, + "loss": 0.3982, + "step": 283 + }, + { + "epoch": 1.980426272292301, + "grad_norm": 0.3634273851516844, + "learning_rate": 6.039360923040059e-05, + "loss": 0.4051, + "step": 284 + }, + { + "epoch": 1.987385819921705, + "grad_norm": 0.24841398249046384, + "learning_rate": 6.0225241130973506e-05, + "loss": 
0.4044, + "step": 285 + }, + { + "epoch": 1.9943453675511091, + "grad_norm": 0.31467866229451, + "learning_rate": 6.0056390227786366e-05, + "loss": 0.4052, + "step": 286 + }, + { + "epoch": 2.005219660722053, + "grad_norm": 0.3996733942417249, + "learning_rate": 5.9887060551537774e-05, + "loss": 0.3765, + "step": 287 + }, + { + "epoch": 2.012179208351457, + "grad_norm": 0.3755337827846155, + "learning_rate": 5.971725614435529e-05, + "loss": 0.367, + "step": 288 + }, + { + "epoch": 2.0191387559808613, + "grad_norm": 0.37714284296853545, + "learning_rate": 5.95469810596989e-05, + "loss": 0.372, + "step": 289 + }, + { + "epoch": 2.0260983036102655, + "grad_norm": 0.34335888494135713, + "learning_rate": 5.937623936226435e-05, + "loss": 0.3655, + "step": 290 + }, + { + "epoch": 2.0330578512396693, + "grad_norm": 0.3214731030718956, + "learning_rate": 5.9205035127886026e-05, + "loss": 0.3596, + "step": 291 + }, + { + "epoch": 2.0400173988690735, + "grad_norm": 0.3037344199720621, + "learning_rate": 5.903337244343972e-05, + "loss": 0.365, + "step": 292 + }, + { + "epoch": 2.0469769464984777, + "grad_norm": 0.29558276490238256, + "learning_rate": 5.8861255406745e-05, + "loss": 0.3655, + "step": 293 + }, + { + "epoch": 2.0539364941278815, + "grad_norm": 0.3239986635139212, + "learning_rate": 5.8688688126467514e-05, + "loss": 0.3737, + "step": 294 + }, + { + "epoch": 2.0608960417572857, + "grad_norm": 0.5323428158934608, + "learning_rate": 5.8515674722020745e-05, + "loss": 0.3691, + "step": 295 + }, + { + "epoch": 2.06785558938669, + "grad_norm": 0.6310404829026035, + "learning_rate": 5.834221932346781e-05, + "loss": 0.3742, + "step": 296 + }, + { + "epoch": 2.074815137016094, + "grad_norm": 0.4806664035734928, + "learning_rate": 5.8168326071422815e-05, + "loss": 0.3655, + "step": 297 + }, + { + "epoch": 2.081774684645498, + "grad_norm": 0.3081849802706308, + "learning_rate": 5.799399911695201e-05, + "loss": 0.3759, + "step": 298 + }, + { + "epoch": 2.088734232274902, + "grad_norm": 0.384198968174172, + "learning_rate": 5.781924262147471e-05, + "loss": 0.3618, + "step": 299 + }, + { + "epoch": 2.0956937799043063, + "grad_norm": 0.3402446023172264, + "learning_rate": 5.7644060756663954e-05, + "loss": 0.3706, + "step": 300 + }, + { + "epoch": 2.1026533275337105, + "grad_norm": 0.2804847723920022, + "learning_rate": 5.746845770434692e-05, + "loss": 0.3645, + "step": 301 + }, + { + "epoch": 2.1096128751631142, + "grad_norm": 0.33735637577255784, + "learning_rate": 5.7292437656405094e-05, + "loss": 0.3694, + "step": 302 + }, + { + "epoch": 2.1165724227925184, + "grad_norm": 0.29830282676341935, + "learning_rate": 5.711600481467422e-05, + "loss": 0.3661, + "step": 303 + }, + { + "epoch": 2.1235319704219227, + "grad_norm": 0.26892805785374885, + "learning_rate": 5.693916339084397e-05, + "loss": 0.365, + "step": 304 + }, + { + "epoch": 2.130491518051327, + "grad_norm": 0.2924956628530246, + "learning_rate": 5.676191760635744e-05, + "loss": 0.3682, + "step": 305 + }, + { + "epoch": 2.1374510656807306, + "grad_norm": 0.3322840841302584, + "learning_rate": 5.6584271692310345e-05, + "loss": 0.3591, + "step": 306 + }, + { + "epoch": 2.144410613310135, + "grad_norm": 0.31939252409902796, + "learning_rate": 5.640622988935006e-05, + "loss": 0.366, + "step": 307 + }, + { + "epoch": 2.151370160939539, + "grad_norm": 0.3675500408960479, + "learning_rate": 5.6227796447574296e-05, + "loss": 0.3721, + "step": 308 + }, + { + "epoch": 2.1583297085689432, + "grad_norm": 0.4269918712695122, + "learning_rate": 
5.604897562642979e-05, + "loss": 0.3691, + "step": 309 + }, + { + "epoch": 2.165289256198347, + "grad_norm": 0.32932381192451604, + "learning_rate": 5.58697716946105e-05, + "loss": 0.3695, + "step": 310 + }, + { + "epoch": 2.172248803827751, + "grad_norm": 0.23672816365722077, + "learning_rate": 5.5690188929955756e-05, + "loss": 0.3718, + "step": 311 + }, + { + "epoch": 2.1792083514571554, + "grad_norm": 0.2685885863825375, + "learning_rate": 5.5510231619348154e-05, + "loss": 0.3626, + "step": 312 + }, + { + "epoch": 2.186167899086559, + "grad_norm": 0.31894739343841294, + "learning_rate": 5.5329904058611195e-05, + "loss": 0.3696, + "step": 313 + }, + { + "epoch": 2.1931274467159634, + "grad_norm": 0.2690402159463182, + "learning_rate": 5.514921055240674e-05, + "loss": 0.3664, + "step": 314 + }, + { + "epoch": 2.2000869943453676, + "grad_norm": 0.36597319341247037, + "learning_rate": 5.4968155414132294e-05, + "loss": 0.3661, + "step": 315 + }, + { + "epoch": 2.207046541974772, + "grad_norm": 0.2509160139038948, + "learning_rate": 5.4786742965817964e-05, + "loss": 0.3737, + "step": 316 + }, + { + "epoch": 2.2140060896041756, + "grad_norm": 0.22135896086783516, + "learning_rate": 5.4604977538023375e-05, + "loss": 0.3651, + "step": 317 + }, + { + "epoch": 2.2209656372335798, + "grad_norm": 0.21117176336652171, + "learning_rate": 5.442286346973419e-05, + "loss": 0.3694, + "step": 318 + }, + { + "epoch": 2.227925184862984, + "grad_norm": 0.21238366241650145, + "learning_rate": 5.424040510825867e-05, + "loss": 0.3724, + "step": 319 + }, + { + "epoch": 2.234884732492388, + "grad_norm": 0.18338158702931512, + "learning_rate": 5.405760680912374e-05, + "loss": 0.3706, + "step": 320 + }, + { + "epoch": 2.241844280121792, + "grad_norm": 0.21207069234590878, + "learning_rate": 5.387447293597113e-05, + "loss": 0.3612, + "step": 321 + }, + { + "epoch": 2.248803827751196, + "grad_norm": 0.2196618454401931, + "learning_rate": 5.3691007860453185e-05, + "loss": 0.3706, + "step": 322 + }, + { + "epoch": 2.2557633753806003, + "grad_norm": 0.28738964583045895, + "learning_rate": 5.3507215962128485e-05, + "loss": 0.3665, + "step": 323 + }, + { + "epoch": 2.2627229230100045, + "grad_norm": 0.2285542763691958, + "learning_rate": 5.332310162835729e-05, + "loss": 0.371, + "step": 324 + }, + { + "epoch": 2.2696824706394083, + "grad_norm": 0.25316534475803026, + "learning_rate": 5.313866925419685e-05, + "loss": 0.368, + "step": 325 + }, + { + "epoch": 2.2766420182688125, + "grad_norm": 0.30304672804412763, + "learning_rate": 5.295392324229648e-05, + "loss": 0.3681, + "step": 326 + }, + { + "epoch": 2.2836015658982167, + "grad_norm": 0.31451674332345964, + "learning_rate": 5.276886800279243e-05, + "loss": 0.367, + "step": 327 + }, + { + "epoch": 2.2905611135276205, + "grad_norm": 0.3565352812109271, + "learning_rate": 5.2583507953202654e-05, + "loss": 0.3689, + "step": 328 + }, + { + "epoch": 2.2975206611570247, + "grad_norm": 0.2608640444128353, + "learning_rate": 5.239784751832128e-05, + "loss": 0.3708, + "step": 329 + }, + { + "epoch": 2.304480208786429, + "grad_norm": 0.17931678523987182, + "learning_rate": 5.221189113011309e-05, + "loss": 0.3681, + "step": 330 + }, + { + "epoch": 2.311439756415833, + "grad_norm": 0.24872188995030328, + "learning_rate": 5.2025643227607656e-05, + "loss": 0.366, + "step": 331 + }, + { + "epoch": 2.3183993040452373, + "grad_norm": 0.2145763484568399, + "learning_rate": 5.18391082567934e-05, + "loss": 0.3608, + "step": 332 + }, + { + "epoch": 2.325358851674641, + "grad_norm": 
0.2164912354682701, + "learning_rate": 5.1652290670511396e-05, + "loss": 0.3715, + "step": 333 + }, + { + "epoch": 2.3323183993040453, + "grad_norm": 0.2248326733820592, + "learning_rate": 5.1465194928349215e-05, + "loss": 0.3723, + "step": 334 + }, + { + "epoch": 2.3392779469334495, + "grad_norm": 0.23611123081541333, + "learning_rate": 5.127782549653431e-05, + "loss": 0.368, + "step": 335 + }, + { + "epoch": 2.3462374945628532, + "grad_norm": 0.27552200066945187, + "learning_rate": 5.1090186847827535e-05, + "loss": 0.3681, + "step": 336 + }, + { + "epoch": 2.3531970421922574, + "grad_norm": 0.2473752934953281, + "learning_rate": 5.090228346141626e-05, + "loss": 0.3705, + "step": 337 + }, + { + "epoch": 2.3601565898216617, + "grad_norm": 0.1964697404165815, + "learning_rate": 5.071411982280754e-05, + "loss": 0.3694, + "step": 338 + }, + { + "epoch": 2.367116137451066, + "grad_norm": 0.18565136863582019, + "learning_rate": 5.0525700423720964e-05, + "loss": 0.3676, + "step": 339 + }, + { + "epoch": 2.3740756850804696, + "grad_norm": 0.2127557318833584, + "learning_rate": 5.033702976198154e-05, + "loss": 0.3652, + "step": 340 + }, + { + "epoch": 2.381035232709874, + "grad_norm": 0.22768416565210678, + "learning_rate": 5.0148112341412155e-05, + "loss": 0.3627, + "step": 341 + }, + { + "epoch": 2.387994780339278, + "grad_norm": 0.20405791268703083, + "learning_rate": 4.9958952671726214e-05, + "loss": 0.3645, + "step": 342 + }, + { + "epoch": 2.394954327968682, + "grad_norm": 0.18709352719221842, + "learning_rate": 4.976955526841995e-05, + "loss": 0.3744, + "step": 343 + }, + { + "epoch": 2.401913875598086, + "grad_norm": 0.2257696533468009, + "learning_rate": 4.9579924652664624e-05, + "loss": 0.3659, + "step": 344 + }, + { + "epoch": 2.40887342322749, + "grad_norm": 0.22579970351795095, + "learning_rate": 4.939006535119851e-05, + "loss": 0.3721, + "step": 345 + }, + { + "epoch": 2.4158329708568944, + "grad_norm": 0.1695044273561281, + "learning_rate": 4.919998189621902e-05, + "loss": 0.3717, + "step": 346 + }, + { + "epoch": 2.4227925184862986, + "grad_norm": 0.1868306496768378, + "learning_rate": 4.9009678825274344e-05, + "loss": 0.37, + "step": 347 + }, + { + "epoch": 2.4297520661157024, + "grad_norm": 0.203208513247505, + "learning_rate": 4.8819160681155245e-05, + "loss": 0.3687, + "step": 348 + }, + { + "epoch": 2.4367116137451066, + "grad_norm": 0.18225921597742265, + "learning_rate": 4.8628432011786536e-05, + "loss": 0.3722, + "step": 349 + }, + { + "epoch": 2.443671161374511, + "grad_norm": 0.19704873995226552, + "learning_rate": 4.843749737011858e-05, + "loss": 0.3767, + "step": 350 + }, + { + "epoch": 2.4506307090039146, + "grad_norm": 0.17968261581935796, + "learning_rate": 4.8246361314018566e-05, + "loss": 0.3674, + "step": 351 + }, + { + "epoch": 2.4575902566333188, + "grad_norm": 0.16849328618057585, + "learning_rate": 4.805502840616171e-05, + "loss": 0.3676, + "step": 352 + }, + { + "epoch": 2.464549804262723, + "grad_norm": 0.17392306484481443, + "learning_rate": 4.786350321392237e-05, + "loss": 0.3598, + "step": 353 + }, + { + "epoch": 2.471509351892127, + "grad_norm": 0.15047667342669488, + "learning_rate": 4.767179030926492e-05, + "loss": 0.3626, + "step": 354 + }, + { + "epoch": 2.478468899521531, + "grad_norm": 0.18466737228109523, + "learning_rate": 4.7479894268634794e-05, + "loss": 0.3644, + "step": 355 + }, + { + "epoch": 2.485428447150935, + "grad_norm": 0.19259024117568577, + "learning_rate": 4.728781967284904e-05, + "loss": 0.3666, + "step": 356 + }, + { + "epoch": 
2.4923879947803393, + "grad_norm": 0.1658133386266871, + "learning_rate": 4.7095571106987096e-05, + "loss": 0.3706, + "step": 357 + }, + { + "epoch": 2.4993475424097435, + "grad_norm": 0.1857293176933773, + "learning_rate": 4.6903153160281266e-05, + "loss": 0.3658, + "step": 358 + }, + { + "epoch": 2.5063070900391473, + "grad_norm": 0.1780806265365307, + "learning_rate": 4.671057042600728e-05, + "loss": 0.37, + "step": 359 + }, + { + "epoch": 2.5132666376685515, + "grad_norm": 0.18451738808389523, + "learning_rate": 4.6517827501374466e-05, + "loss": 0.367, + "step": 360 + }, + { + "epoch": 2.5202261852979557, + "grad_norm": 0.19964181415078697, + "learning_rate": 4.632492898741619e-05, + "loss": 0.3679, + "step": 361 + }, + { + "epoch": 2.52718573292736, + "grad_norm": 0.20121807599019395, + "learning_rate": 4.61318794888799e-05, + "loss": 0.3664, + "step": 362 + }, + { + "epoch": 2.5341452805567637, + "grad_norm": 0.1711607561791567, + "learning_rate": 4.593868361411729e-05, + "loss": 0.3719, + "step": 363 + }, + { + "epoch": 2.541104828186168, + "grad_norm": 0.20920063909952305, + "learning_rate": 4.57453459749742e-05, + "loss": 0.3677, + "step": 364 + }, + { + "epoch": 2.548064375815572, + "grad_norm": 0.20148754034737523, + "learning_rate": 4.555187118668064e-05, + "loss": 0.3715, + "step": 365 + }, + { + "epoch": 2.555023923444976, + "grad_norm": 0.16950935825604083, + "learning_rate": 4.53582638677405e-05, + "loss": 0.3668, + "step": 366 + }, + { + "epoch": 2.56198347107438, + "grad_norm": 0.18835333573041302, + "learning_rate": 4.516452863982138e-05, + "loss": 0.3642, + "step": 367 + }, + { + "epoch": 2.5689430187037843, + "grad_norm": 0.19215610553252904, + "learning_rate": 4.497067012764423e-05, + "loss": 0.3691, + "step": 368 + }, + { + "epoch": 2.5759025663331885, + "grad_norm": 0.14122968503331657, + "learning_rate": 4.477669295887299e-05, + "loss": 0.3682, + "step": 369 + }, + { + "epoch": 2.5828621139625927, + "grad_norm": 0.21501799625798765, + "learning_rate": 4.458260176400404e-05, + "loss": 0.3778, + "step": 370 + }, + { + "epoch": 2.5898216615919964, + "grad_norm": 0.19997905382170258, + "learning_rate": 4.4388401176255765e-05, + "loss": 0.367, + "step": 371 + }, + { + "epoch": 2.5967812092214007, + "grad_norm": 0.16385556465949214, + "learning_rate": 4.419409583145787e-05, + "loss": 0.3671, + "step": 372 + }, + { + "epoch": 2.603740756850805, + "grad_norm": 0.21677818126974885, + "learning_rate": 4.3999690367940796e-05, + "loss": 0.3685, + "step": 373 + }, + { + "epoch": 2.6107003044802086, + "grad_norm": 0.20763772842399864, + "learning_rate": 4.3805189426424895e-05, + "loss": 0.3637, + "step": 374 + }, + { + "epoch": 2.617659852109613, + "grad_norm": 0.15620005918797172, + "learning_rate": 4.361059764990977e-05, + "loss": 0.3612, + "step": 375 + }, + { + "epoch": 2.624619399739017, + "grad_norm": 0.21129929527164582, + "learning_rate": 4.341591968356332e-05, + "loss": 0.36, + "step": 376 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.2263848905453181, + "learning_rate": 4.322116017461094e-05, + "loss": 0.367, + "step": 377 + }, + { + "epoch": 2.638538494997825, + "grad_norm": 0.19202782447376043, + "learning_rate": 4.3026323772224564e-05, + "loss": 0.3741, + "step": 378 + }, + { + "epoch": 2.645498042627229, + "grad_norm": 0.21118865625881889, + "learning_rate": 4.283141512741168e-05, + "loss": 0.3613, + "step": 379 + }, + { + "epoch": 2.6524575902566334, + "grad_norm": 0.22052270512412744, + "learning_rate": 4.263643889290425e-05, + "loss": 0.3772, + 
"step": 380 + }, + { + "epoch": 2.659417137886037, + "grad_norm": 0.22167433442606896, + "learning_rate": 4.244139972304775e-05, + "loss": 0.3679, + "step": 381 + }, + { + "epoch": 2.6663766855154414, + "grad_norm": 0.1649098717811062, + "learning_rate": 4.224630227368998e-05, + "loss": 0.37, + "step": 382 + }, + { + "epoch": 2.6733362331448456, + "grad_norm": 0.16002894116004632, + "learning_rate": 4.2051151202069976e-05, + "loss": 0.3687, + "step": 383 + }, + { + "epoch": 2.68029578077425, + "grad_norm": 0.20116234602993452, + "learning_rate": 4.1855951166706783e-05, + "loss": 0.3658, + "step": 384 + }, + { + "epoch": 2.687255328403654, + "grad_norm": 0.1867737945734333, + "learning_rate": 4.166070682728826e-05, + "loss": 0.3636, + "step": 385 + }, + { + "epoch": 2.6942148760330578, + "grad_norm": 0.20649674616661776, + "learning_rate": 4.1465422844559914e-05, + "loss": 0.369, + "step": 386 + }, + { + "epoch": 2.701174423662462, + "grad_norm": 0.2019593756587488, + "learning_rate": 4.127010388021355e-05, + "loss": 0.3707, + "step": 387 + }, + { + "epoch": 2.708133971291866, + "grad_norm": 0.18670876706954212, + "learning_rate": 4.1074754596776076e-05, + "loss": 0.3723, + "step": 388 + }, + { + "epoch": 2.71509351892127, + "grad_norm": 0.1760226934500404, + "learning_rate": 4.087937965749808e-05, + "loss": 0.3652, + "step": 389 + }, + { + "epoch": 2.722053066550674, + "grad_norm": 0.19645887620326913, + "learning_rate": 4.068398372624268e-05, + "loss": 0.3599, + "step": 390 + }, + { + "epoch": 2.7290126141800783, + "grad_norm": 0.19578274938214618, + "learning_rate": 4.0488571467374035e-05, + "loss": 0.3581, + "step": 391 + }, + { + "epoch": 2.7359721618094826, + "grad_norm": 0.18492474565139713, + "learning_rate": 4.02931475456461e-05, + "loss": 0.3685, + "step": 392 + }, + { + "epoch": 2.7429317094388863, + "grad_norm": 0.19837302215210767, + "learning_rate": 4.009771662609122e-05, + "loss": 0.3672, + "step": 393 + }, + { + "epoch": 2.7498912570682905, + "grad_norm": 0.19983845339321854, + "learning_rate": 3.990228337390879e-05, + "loss": 0.37, + "step": 394 + }, + { + "epoch": 2.7568508046976947, + "grad_norm": 0.21784022283680937, + "learning_rate": 3.970685245435391e-05, + "loss": 0.3654, + "step": 395 + }, + { + "epoch": 2.7638103523270985, + "grad_norm": 0.1710051588696074, + "learning_rate": 3.951142853262598e-05, + "loss": 0.3672, + "step": 396 + }, + { + "epoch": 2.7707698999565027, + "grad_norm": 0.1838813552978722, + "learning_rate": 3.931601627375733e-05, + "loss": 0.3657, + "step": 397 + }, + { + "epoch": 2.777729447585907, + "grad_norm": 0.16862615564852854, + "learning_rate": 3.9120620342501934e-05, + "loss": 0.3638, + "step": 398 + }, + { + "epoch": 2.784688995215311, + "grad_norm": 0.1799106397970807, + "learning_rate": 3.8925245403223944e-05, + "loss": 0.3643, + "step": 399 + }, + { + "epoch": 2.7916485428447153, + "grad_norm": 0.15331361673144303, + "learning_rate": 3.872989611978644e-05, + "loss": 0.3629, + "step": 400 + }, + { + "epoch": 2.798608090474119, + "grad_norm": 0.15271887998704453, + "learning_rate": 3.85345771554401e-05, + "loss": 0.3652, + "step": 401 + }, + { + "epoch": 2.8055676381035233, + "grad_norm": 0.15620132253760882, + "learning_rate": 3.833929317271175e-05, + "loss": 0.3602, + "step": 402 + }, + { + "epoch": 2.8125271857329275, + "grad_norm": 0.15541759415132128, + "learning_rate": 3.814404883329324e-05, + "loss": 0.3696, + "step": 403 + }, + { + "epoch": 2.8194867333623312, + "grad_norm": 0.1614291271877618, + "learning_rate": 
3.794884879793004e-05, + "loss": 0.3657, + "step": 404 + }, + { + "epoch": 2.8264462809917354, + "grad_norm": 0.13456071859787921, + "learning_rate": 3.7753697726310026e-05, + "loss": 0.3646, + "step": 405 + }, + { + "epoch": 2.8334058286211397, + "grad_norm": 0.1451353124821423, + "learning_rate": 3.755860027695225e-05, + "loss": 0.3706, + "step": 406 + }, + { + "epoch": 2.840365376250544, + "grad_norm": 0.14928014331184716, + "learning_rate": 3.7363561107095765e-05, + "loss": 0.3677, + "step": 407 + }, + { + "epoch": 2.847324923879948, + "grad_norm": 0.13937655853284261, + "learning_rate": 3.7168584872588336e-05, + "loss": 0.3642, + "step": 408 + }, + { + "epoch": 2.854284471509352, + "grad_norm": 0.16336855650022625, + "learning_rate": 3.697367622777545e-05, + "loss": 0.3632, + "step": 409 + }, + { + "epoch": 2.861244019138756, + "grad_norm": 0.1545871776685994, + "learning_rate": 3.677883982538907e-05, + "loss": 0.3703, + "step": 410 + }, + { + "epoch": 2.86820356676816, + "grad_norm": 0.14861237645334818, + "learning_rate": 3.6584080316436696e-05, + "loss": 0.3632, + "step": 411 + }, + { + "epoch": 2.875163114397564, + "grad_norm": 0.1386742246058214, + "learning_rate": 3.638940235009025e-05, + "loss": 0.3691, + "step": 412 + }, + { + "epoch": 2.882122662026968, + "grad_norm": 0.1496262020436169, + "learning_rate": 3.619481057357511e-05, + "loss": 0.3649, + "step": 413 + }, + { + "epoch": 2.8890822096563724, + "grad_norm": 0.1274326314422379, + "learning_rate": 3.600030963205922e-05, + "loss": 0.3702, + "step": 414 + }, + { + "epoch": 2.8960417572857766, + "grad_norm": 0.14829018850055273, + "learning_rate": 3.580590416854214e-05, + "loss": 0.3641, + "step": 415 + }, + { + "epoch": 2.9030013049151804, + "grad_norm": 0.15397049410838218, + "learning_rate": 3.561159882374425e-05, + "loss": 0.3655, + "step": 416 + }, + { + "epoch": 2.9099608525445846, + "grad_norm": 0.149564207070461, + "learning_rate": 3.541739823599598e-05, + "loss": 0.3638, + "step": 417 + }, + { + "epoch": 2.916920400173989, + "grad_norm": 0.14607395090587116, + "learning_rate": 3.5223307041127025e-05, + "loss": 0.3675, + "step": 418 + }, + { + "epoch": 2.9238799478033926, + "grad_norm": 0.17923632050969646, + "learning_rate": 3.502932987235577e-05, + "loss": 0.369, + "step": 419 + }, + { + "epoch": 2.9308394954327968, + "grad_norm": 0.1538306766022074, + "learning_rate": 3.4835471360178626e-05, + "loss": 0.369, + "step": 420 + }, + { + "epoch": 2.937799043062201, + "grad_norm": 0.17364066496444236, + "learning_rate": 3.464173613225951e-05, + "loss": 0.3678, + "step": 421 + }, + { + "epoch": 2.944758590691605, + "grad_norm": 0.14792158024656513, + "learning_rate": 3.4448128813319365e-05, + "loss": 0.3706, + "step": 422 + }, + { + "epoch": 2.9517181383210094, + "grad_norm": 0.16703857041888837, + "learning_rate": 3.425465402502581e-05, + "loss": 0.3668, + "step": 423 + }, + { + "epoch": 2.958677685950413, + "grad_norm": 0.16247636991051975, + "learning_rate": 3.406131638588273e-05, + "loss": 0.3613, + "step": 424 + }, + { + "epoch": 2.9656372335798173, + "grad_norm": 0.14211396104389656, + "learning_rate": 3.386812051112011e-05, + "loss": 0.3678, + "step": 425 + }, + { + "epoch": 2.9725967812092216, + "grad_norm": 0.1664600329507516, + "learning_rate": 3.367507101258382e-05, + "loss": 0.359, + "step": 426 + }, + { + "epoch": 2.9795563288386253, + "grad_norm": 0.15867128125681806, + "learning_rate": 3.348217249862555e-05, + "loss": 0.3749, + "step": 427 + }, + { + "epoch": 2.9865158764680295, + "grad_norm": 
0.12942815057080284, + "learning_rate": 3.328942957399274e-05, + "loss": 0.3692, + "step": 428 + }, + { + "epoch": 2.9934754240974337, + "grad_norm": 0.14130545610018397, + "learning_rate": 3.309684683971874e-05, + "loss": 0.3673, + "step": 429 + }, + { + "epoch": 3.0043497172683775, + "grad_norm": 0.16060728870914467, + "learning_rate": 3.2904428893012924e-05, + "loss": 0.3474, + "step": 430 + }, + { + "epoch": 3.0113092648977817, + "grad_norm": 0.1671503265033675, + "learning_rate": 3.2712180327150965e-05, + "loss": 0.3352, + "step": 431 + }, + { + "epoch": 3.018268812527186, + "grad_norm": 0.17602527650846453, + "learning_rate": 3.252010573136521e-05, + "loss": 0.3334, + "step": 432 + }, + { + "epoch": 3.0252283601565897, + "grad_norm": 0.19388540072595992, + "learning_rate": 3.2328209690735085e-05, + "loss": 0.3368, + "step": 433 + }, + { + "epoch": 3.032187907785994, + "grad_norm": 0.18871272314861076, + "learning_rate": 3.213649678607765e-05, + "loss": 0.3276, + "step": 434 + }, + { + "epoch": 3.039147455415398, + "grad_norm": 0.1825751881244417, + "learning_rate": 3.19449715938383e-05, + "loss": 0.3264, + "step": 435 + }, + { + "epoch": 3.046107003044802, + "grad_norm": 0.20536550174110904, + "learning_rate": 3.175363868598145e-05, + "loss": 0.3336, + "step": 436 + }, + { + "epoch": 3.053066550674206, + "grad_norm": 0.19028693715972822, + "learning_rate": 3.1562502629881435e-05, + "loss": 0.3361, + "step": 437 + }, + { + "epoch": 3.0600260983036103, + "grad_norm": 0.19144645634840593, + "learning_rate": 3.137156798821347e-05, + "loss": 0.3295, + "step": 438 + }, + { + "epoch": 3.0669856459330145, + "grad_norm": 0.15458500965875127, + "learning_rate": 3.118083931884477e-05, + "loss": 0.3325, + "step": 439 + }, + { + "epoch": 3.0739451935624182, + "grad_norm": 0.16190332070850957, + "learning_rate": 3.099032117472567e-05, + "loss": 0.324, + "step": 440 + }, + { + "epoch": 3.0809047411918224, + "grad_norm": 0.1931582699954326, + "learning_rate": 3.0800018103780997e-05, + "loss": 0.3319, + "step": 441 + }, + { + "epoch": 3.0878642888212267, + "grad_norm": 0.15544825033730889, + "learning_rate": 3.060993464880151e-05, + "loss": 0.3312, + "step": 442 + }, + { + "epoch": 3.094823836450631, + "grad_norm": 0.19712552027556793, + "learning_rate": 3.0420075347335403e-05, + "loss": 0.3358, + "step": 443 + }, + { + "epoch": 3.1017833840800346, + "grad_norm": 0.16525888661776322, + "learning_rate": 3.023044473158004e-05, + "loss": 0.3286, + "step": 444 + }, + { + "epoch": 3.108742931709439, + "grad_norm": 0.16251875749806674, + "learning_rate": 3.0041047328273786e-05, + "loss": 0.3371, + "step": 445 + }, + { + "epoch": 3.115702479338843, + "grad_norm": 0.14251069293212443, + "learning_rate": 2.9851887658587865e-05, + "loss": 0.3323, + "step": 446 + }, + { + "epoch": 3.1226620269682472, + "grad_norm": 0.14225759957337078, + "learning_rate": 2.9662970238018472e-05, + "loss": 0.3323, + "step": 447 + }, + { + "epoch": 3.129621574597651, + "grad_norm": 0.12972258276234128, + "learning_rate": 2.947429957627904e-05, + "loss": 0.3289, + "step": 448 + }, + { + "epoch": 3.136581122227055, + "grad_norm": 0.13682940985467507, + "learning_rate": 2.9285880177192475e-05, + "loss": 0.3265, + "step": 449 + }, + { + "epoch": 3.1435406698564594, + "grad_norm": 0.12277970656857606, + "learning_rate": 2.9097716538583746e-05, + "loss": 0.3282, + "step": 450 + }, + { + "epoch": 3.1505002174858636, + "grad_norm": 0.1543558782856671, + "learning_rate": 2.8909813152172472e-05, + "loss": 0.3335, + "step": 451 + }, + { + 
"epoch": 3.1574597651152674, + "grad_norm": 0.14314974220458332, + "learning_rate": 2.8722174503465697e-05, + "loss": 0.3367, + "step": 452 + }, + { + "epoch": 3.1644193127446716, + "grad_norm": 0.133141477333654, + "learning_rate": 2.8534805071650802e-05, + "loss": 0.3306, + "step": 453 + }, + { + "epoch": 3.171378860374076, + "grad_norm": 0.15733593589203457, + "learning_rate": 2.834770932948862e-05, + "loss": 0.334, + "step": 454 + }, + { + "epoch": 3.17833840800348, + "grad_norm": 0.14584273626489633, + "learning_rate": 2.816089174320663e-05, + "loss": 0.3325, + "step": 455 + }, + { + "epoch": 3.1852979556328838, + "grad_norm": 0.14481726585440186, + "learning_rate": 2.7974356772392347e-05, + "loss": 0.3381, + "step": 456 + }, + { + "epoch": 3.192257503262288, + "grad_norm": 0.16331363108391558, + "learning_rate": 2.7788108869886917e-05, + "loss": 0.334, + "step": 457 + }, + { + "epoch": 3.199217050891692, + "grad_norm": 0.11664377872886357, + "learning_rate": 2.7602152481678726e-05, + "loss": 0.3308, + "step": 458 + }, + { + "epoch": 3.206176598521096, + "grad_norm": 0.14495158600624636, + "learning_rate": 2.741649204679736e-05, + "loss": 0.3336, + "step": 459 + }, + { + "epoch": 3.2131361461505, + "grad_norm": 0.1229178497126909, + "learning_rate": 2.723113199720757e-05, + "loss": 0.3386, + "step": 460 + }, + { + "epoch": 3.2200956937799043, + "grad_norm": 0.12926743116724196, + "learning_rate": 2.7046076757703524e-05, + "loss": 0.3358, + "step": 461 + }, + { + "epoch": 3.2270552414093086, + "grad_norm": 0.1313312996235397, + "learning_rate": 2.6861330745803167e-05, + "loss": 0.3397, + "step": 462 + }, + { + "epoch": 3.2340147890387123, + "grad_norm": 0.1309779972337031, + "learning_rate": 2.6676898371642726e-05, + "loss": 0.3338, + "step": 463 + }, + { + "epoch": 3.2409743366681165, + "grad_norm": 0.11811653221059722, + "learning_rate": 2.6492784037871532e-05, + "loss": 0.3316, + "step": 464 + }, + { + "epoch": 3.2479338842975207, + "grad_norm": 0.1434333839230034, + "learning_rate": 2.6308992139546825e-05, + "loss": 0.3348, + "step": 465 + }, + { + "epoch": 3.254893431926925, + "grad_norm": 0.11122333012963419, + "learning_rate": 2.6125527064028874e-05, + "loss": 0.3351, + "step": 466 + }, + { + "epoch": 3.2618529795563287, + "grad_norm": 0.1403831175014317, + "learning_rate": 2.5942393190876268e-05, + "loss": 0.3301, + "step": 467 + }, + { + "epoch": 3.268812527185733, + "grad_norm": 0.11875350967381776, + "learning_rate": 2.5759594891741345e-05, + "loss": 0.3361, + "step": 468 + }, + { + "epoch": 3.275772074815137, + "grad_norm": 0.1352653946139823, + "learning_rate": 2.55771365302658e-05, + "loss": 0.3293, + "step": 469 + }, + { + "epoch": 3.2827316224445413, + "grad_norm": 0.12259046772867968, + "learning_rate": 2.539502246197663e-05, + "loss": 0.3317, + "step": 470 + }, + { + "epoch": 3.289691170073945, + "grad_norm": 0.1341784005406857, + "learning_rate": 2.5213257034182042e-05, + "loss": 0.3336, + "step": 471 + }, + { + "epoch": 3.2966507177033493, + "grad_norm": 0.13864288710375855, + "learning_rate": 2.503184458586772e-05, + "loss": 0.3368, + "step": 472 + }, + { + "epoch": 3.3036102653327535, + "grad_norm": 0.13118987129584506, + "learning_rate": 2.4850789447593276e-05, + "loss": 0.3367, + "step": 473 + }, + { + "epoch": 3.3105698129621572, + "grad_norm": 0.1377873050994517, + "learning_rate": 2.4670095941388822e-05, + "loss": 0.3388, + "step": 474 + }, + { + "epoch": 3.3175293605915614, + "grad_norm": 0.12322175327795074, + "learning_rate": 2.4489768380651856e-05, + 
"loss": 0.3333, + "step": 475 + }, + { + "epoch": 3.3244889082209657, + "grad_norm": 0.13741800125839718, + "learning_rate": 2.4309811070044247e-05, + "loss": 0.3327, + "step": 476 + }, + { + "epoch": 3.33144845585037, + "grad_norm": 0.11467772477556636, + "learning_rate": 2.4130228305389514e-05, + "loss": 0.329, + "step": 477 + }, + { + "epoch": 3.3384080034797736, + "grad_norm": 0.13098647128626187, + "learning_rate": 2.3951024373570214e-05, + "loss": 0.3373, + "step": 478 + }, + { + "epoch": 3.345367551109178, + "grad_norm": 0.10605964176743833, + "learning_rate": 2.3772203552425717e-05, + "loss": 0.3276, + "step": 479 + }, + { + "epoch": 3.352327098738582, + "grad_norm": 0.1369975822039289, + "learning_rate": 2.3593770110649966e-05, + "loss": 0.3287, + "step": 480 + }, + { + "epoch": 3.3592866463679862, + "grad_norm": 0.10746583370109318, + "learning_rate": 2.341572830768965e-05, + "loss": 0.3247, + "step": 481 + }, + { + "epoch": 3.36624619399739, + "grad_norm": 0.12619643852285148, + "learning_rate": 2.323808239364256e-05, + "loss": 0.3334, + "step": 482 + }, + { + "epoch": 3.373205741626794, + "grad_norm": 0.11299951719427818, + "learning_rate": 2.306083660915604e-05, + "loss": 0.3314, + "step": 483 + }, + { + "epoch": 3.3801652892561984, + "grad_norm": 0.12698843914556834, + "learning_rate": 2.2883995185325797e-05, + "loss": 0.3269, + "step": 484 + }, + { + "epoch": 3.3871248368856026, + "grad_norm": 0.11590705576436211, + "learning_rate": 2.2707562343594916e-05, + "loss": 0.3378, + "step": 485 + }, + { + "epoch": 3.3940843845150064, + "grad_norm": 0.12503790075980037, + "learning_rate": 2.2531542295653094e-05, + "loss": 0.336, + "step": 486 + }, + { + "epoch": 3.4010439321444106, + "grad_norm": 0.11611542174394521, + "learning_rate": 2.235593924333607e-05, + "loss": 0.3347, + "step": 487 + }, + { + "epoch": 3.408003479773815, + "grad_norm": 0.12221901254166186, + "learning_rate": 2.21807573785253e-05, + "loss": 0.3333, + "step": 488 + }, + { + "epoch": 3.4149630274032186, + "grad_norm": 0.12496755906276005, + "learning_rate": 2.2006000883048008e-05, + "loss": 0.331, + "step": 489 + }, + { + "epoch": 3.4219225750326228, + "grad_norm": 0.1160790840797104, + "learning_rate": 2.183167392857719e-05, + "loss": 0.3347, + "step": 490 + }, + { + "epoch": 3.428882122662027, + "grad_norm": 0.12271399113397469, + "learning_rate": 2.1657780676532205e-05, + "loss": 0.3371, + "step": 491 + }, + { + "epoch": 3.435841670291431, + "grad_norm": 0.11513590243752209, + "learning_rate": 2.1484325277979278e-05, + "loss": 0.3336, + "step": 492 + }, + { + "epoch": 3.4428012179208354, + "grad_norm": 0.13194006381727105, + "learning_rate": 2.1311311873532502e-05, + "loss": 0.3346, + "step": 493 + }, + { + "epoch": 3.449760765550239, + "grad_norm": 0.10982920455618568, + "learning_rate": 2.1138744593254997e-05, + "loss": 0.3304, + "step": 494 + }, + { + "epoch": 3.4567203131796433, + "grad_norm": 0.1485401459853815, + "learning_rate": 2.09666275565603e-05, + "loss": 0.3296, + "step": 495 + }, + { + "epoch": 3.4636798608090476, + "grad_norm": 0.12077637074742696, + "learning_rate": 2.0794964872113987e-05, + "loss": 0.3354, + "step": 496 + }, + { + "epoch": 3.4706394084384513, + "grad_norm": 0.12161585081850207, + "learning_rate": 2.062376063773567e-05, + "loss": 0.3273, + "step": 497 + }, + { + "epoch": 3.4775989560678555, + "grad_norm": 0.11631137052693763, + "learning_rate": 2.045301894030111e-05, + "loss": 0.3358, + "step": 498 + }, + { + "epoch": 3.4845585036972597, + "grad_norm": 0.12349941669465997, + 
"learning_rate": 2.0282743855644727e-05, + "loss": 0.3297, + "step": 499 + }, + { + "epoch": 3.491518051326664, + "grad_norm": 0.10080665328455962, + "learning_rate": 2.011293944846222e-05, + "loss": 0.3322, + "step": 500 + }, + { + "epoch": 3.4984775989560677, + "grad_norm": 0.12452596754190881, + "learning_rate": 1.994360977221364e-05, + "loss": 0.3378, + "step": 501 + }, + { + "epoch": 3.505437146585472, + "grad_norm": 0.10178078398456272, + "learning_rate": 1.97747588690265e-05, + "loss": 0.3254, + "step": 502 + }, + { + "epoch": 3.512396694214876, + "grad_norm": 0.11961132752342465, + "learning_rate": 1.9606390769599426e-05, + "loss": 0.3325, + "step": 503 + }, + { + "epoch": 3.51935624184428, + "grad_norm": 0.10380768215954524, + "learning_rate": 1.9438509493105816e-05, + "loss": 0.3301, + "step": 504 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.10192138810793511, + "learning_rate": 1.9271119047097967e-05, + "loss": 0.3343, + "step": 505 + }, + { + "epoch": 3.5332753371030883, + "grad_norm": 0.11375055774791937, + "learning_rate": 1.910422342741136e-05, + "loss": 0.3354, + "step": 506 + }, + { + "epoch": 3.5402348847324925, + "grad_norm": 0.09713201709606016, + "learning_rate": 1.8937826618069396e-05, + "loss": 0.3326, + "step": 507 + }, + { + "epoch": 3.5471944323618967, + "grad_norm": 0.11482174079205225, + "learning_rate": 1.8771932591188106e-05, + "loss": 0.3358, + "step": 508 + }, + { + "epoch": 3.5541539799913004, + "grad_norm": 0.1085601568655917, + "learning_rate": 1.860654530688147e-05, + "loss": 0.3316, + "step": 509 + }, + { + "epoch": 3.5611135276207047, + "grad_norm": 0.11763399705418416, + "learning_rate": 1.84416687131668e-05, + "loss": 0.3296, + "step": 510 + }, + { + "epoch": 3.568073075250109, + "grad_norm": 0.11136798232627028, + "learning_rate": 1.8277306745870605e-05, + "loss": 0.3328, + "step": 511 + }, + { + "epoch": 3.5750326228795126, + "grad_norm": 0.12672402635855642, + "learning_rate": 1.811346332853451e-05, + "loss": 0.332, + "step": 512 + }, + { + "epoch": 3.581992170508917, + "grad_norm": 0.11623196365691729, + "learning_rate": 1.7950142372321658e-05, + "loss": 0.332, + "step": 513 + }, + { + "epoch": 3.588951718138321, + "grad_norm": 0.107315476806856, + "learning_rate": 1.778734777592337e-05, + "loss": 0.3317, + "step": 514 + }, + { + "epoch": 3.5959112657677252, + "grad_norm": 0.11801835716954666, + "learning_rate": 1.7625083425466044e-05, + "loss": 0.3339, + "step": 515 + }, + { + "epoch": 3.6028708133971294, + "grad_norm": 0.0966733318424344, + "learning_rate": 1.746335319441838e-05, + "loss": 0.3254, + "step": 516 + }, + { + "epoch": 3.609830361026533, + "grad_norm": 0.11771321695825486, + "learning_rate": 1.7302160943498916e-05, + "loss": 0.3354, + "step": 517 + }, + { + "epoch": 3.6167899086559374, + "grad_norm": 0.10460865568553186, + "learning_rate": 1.7141510520583887e-05, + "loss": 0.3305, + "step": 518 + }, + { + "epoch": 3.6237494562853416, + "grad_norm": 0.11996000392740622, + "learning_rate": 1.698140576061538e-05, + "loss": 0.339, + "step": 519 + }, + { + "epoch": 3.6307090039147454, + "grad_norm": 0.09741048167997303, + "learning_rate": 1.6821850485509784e-05, + "loss": 0.3366, + "step": 520 + }, + { + "epoch": 3.6376685515441496, + "grad_norm": 0.1142703570198178, + "learning_rate": 1.6662848504066502e-05, + "loss": 0.3337, + "step": 521 + }, + { + "epoch": 3.644628099173554, + "grad_norm": 0.09709378850935774, + "learning_rate": 1.6504403611877098e-05, + "loss": 0.3322, + "step": 522 + }, + { + "epoch": 3.651587646802958, 
+ "grad_norm": 0.10615913057072311, + "learning_rate": 1.6346519591234637e-05, + "loss": 0.3325, + "step": 523 + }, + { + "epoch": 3.6585471944323618, + "grad_norm": 0.1156860445622765, + "learning_rate": 1.6189200211043484e-05, + "loss": 0.3347, + "step": 524 + }, + { + "epoch": 3.665506742061766, + "grad_norm": 0.10972351654483692, + "learning_rate": 1.6032449226729195e-05, + "loss": 0.3354, + "step": 525 + }, + { + "epoch": 3.67246628969117, + "grad_norm": 0.12167997828635133, + "learning_rate": 1.5876270380149038e-05, + "loss": 0.3371, + "step": 526 + }, + { + "epoch": 3.679425837320574, + "grad_norm": 0.1227635833803301, + "learning_rate": 1.57206673995025e-05, + "loss": 0.3303, + "step": 527 + }, + { + "epoch": 3.686385384949978, + "grad_norm": 0.10671146343573032, + "learning_rate": 1.556564399924244e-05, + "loss": 0.3301, + "step": 528 + }, + { + "epoch": 3.6933449325793823, + "grad_norm": 0.11109924997469728, + "learning_rate": 1.541120387998631e-05, + "loss": 0.3295, + "step": 529 + }, + { + "epoch": 3.7003044802087866, + "grad_norm": 0.10312763393789623, + "learning_rate": 1.5257350728427862e-05, + "loss": 0.3361, + "step": 530 + }, + { + "epoch": 3.7072640278381908, + "grad_norm": 0.10629651651459636, + "learning_rate": 1.5104088217249132e-05, + "loss": 0.3321, + "step": 531 + }, + { + "epoch": 3.7142235754675945, + "grad_norm": 0.09725532964037208, + "learning_rate": 1.4951420005032828e-05, + "loss": 0.3379, + "step": 532 + }, + { + "epoch": 3.7211831230969987, + "grad_norm": 0.11511675877236498, + "learning_rate": 1.4799349736174891e-05, + "loss": 0.3307, + "step": 533 + }, + { + "epoch": 3.728142670726403, + "grad_norm": 0.09824052462354282, + "learning_rate": 1.4647881040797547e-05, + "loss": 0.3273, + "step": 534 + }, + { + "epoch": 3.7351022183558067, + "grad_norm": 0.10324613912974012, + "learning_rate": 1.4497017534662651e-05, + "loss": 0.3344, + "step": 535 + }, + { + "epoch": 3.742061765985211, + "grad_norm": 0.10581522915369096, + "learning_rate": 1.4346762819085424e-05, + "loss": 0.3342, + "step": 536 + }, + { + "epoch": 3.749021313614615, + "grad_norm": 0.10101793752872175, + "learning_rate": 1.4197120480848381e-05, + "loss": 0.3348, + "step": 537 + }, + { + "epoch": 3.7559808612440193, + "grad_norm": 0.10390774039864187, + "learning_rate": 1.4048094092115774e-05, + "loss": 0.3301, + "step": 538 + }, + { + "epoch": 3.762940408873423, + "grad_norm": 0.10773321694134558, + "learning_rate": 1.389968721034833e-05, + "loss": 0.3353, + "step": 539 + }, + { + "epoch": 3.7698999565028273, + "grad_norm": 0.09221835790298968, + "learning_rate": 1.3751903378218315e-05, + "loss": 0.3329, + "step": 540 + }, + { + "epoch": 3.7768595041322315, + "grad_norm": 0.10820822350883358, + "learning_rate": 1.3604746123524932e-05, + "loss": 0.3278, + "step": 541 + }, + { + "epoch": 3.7838190517616352, + "grad_norm": 0.10470082245347596, + "learning_rate": 1.3458218959110152e-05, + "loss": 0.3371, + "step": 542 + }, + { + "epoch": 3.7907785993910394, + "grad_norm": 0.10207051909220886, + "learning_rate": 1.3312325382774827e-05, + "loss": 0.3371, + "step": 543 + }, + { + "epoch": 3.7977381470204437, + "grad_norm": 0.10101453592738402, + "learning_rate": 1.3167068877195237e-05, + "loss": 0.3265, + "step": 544 + }, + { + "epoch": 3.804697694649848, + "grad_norm": 0.09285174719990112, + "learning_rate": 1.3022452909839918e-05, + "loss": 0.3277, + "step": 545 + }, + { + "epoch": 3.811657242279252, + "grad_norm": 0.1059908590081114, + "learning_rate": 1.2878480932886874e-05, + "loss": 0.334, + 
"step": 546 + }, + { + "epoch": 3.818616789908656, + "grad_norm": 0.10635953000158242, + "learning_rate": 1.2735156383141187e-05, + "loss": 0.3325, + "step": 547 + }, + { + "epoch": 3.82557633753806, + "grad_norm": 0.0985299908106167, + "learning_rate": 1.2592482681953025e-05, + "loss": 0.3317, + "step": 548 + }, + { + "epoch": 3.8325358851674642, + "grad_norm": 0.10588236607635328, + "learning_rate": 1.2450463235135874e-05, + "loss": 0.34, + "step": 549 + }, + { + "epoch": 3.839495432796868, + "grad_norm": 0.10316866804543555, + "learning_rate": 1.2309101432885302e-05, + "loss": 0.3347, + "step": 550 + }, + { + "epoch": 3.846454980426272, + "grad_norm": 0.09961037740811082, + "learning_rate": 1.2168400649698039e-05, + "loss": 0.3351, + "step": 551 + }, + { + "epoch": 3.8534145280556764, + "grad_norm": 0.09817204131281733, + "learning_rate": 1.202836424429135e-05, + "loss": 0.3365, + "step": 552 + }, + { + "epoch": 3.8603740756850806, + "grad_norm": 0.09481914220388948, + "learning_rate": 1.1888995559522974e-05, + "loss": 0.3292, + "step": 553 + }, + { + "epoch": 3.867333623314485, + "grad_norm": 0.10082172771379438, + "learning_rate": 1.1750297922311193e-05, + "loss": 0.3335, + "step": 554 + }, + { + "epoch": 3.8742931709438886, + "grad_norm": 0.08986399832050056, + "learning_rate": 1.1612274643555504e-05, + "loss": 0.3284, + "step": 555 + }, + { + "epoch": 3.881252718573293, + "grad_norm": 0.08986088415045988, + "learning_rate": 1.1474929018057574e-05, + "loss": 0.3345, + "step": 556 + }, + { + "epoch": 3.8882122662026966, + "grad_norm": 0.09403102777418895, + "learning_rate": 1.1338264324442573e-05, + "loss": 0.3315, + "step": 557 + }, + { + "epoch": 3.8951718138321008, + "grad_norm": 0.09262927753397106, + "learning_rate": 1.1202283825080884e-05, + "loss": 0.3282, + "step": 558 + }, + { + "epoch": 3.902131361461505, + "grad_norm": 0.0934685977047065, + "learning_rate": 1.1066990766010274e-05, + "loss": 0.3337, + "step": 559 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.09165216810312578, + "learning_rate": 1.093238837685835e-05, + "loss": 0.3318, + "step": 560 + }, + { + "epoch": 3.9160504567203134, + "grad_norm": 0.09756546339699468, + "learning_rate": 1.0798479870765558e-05, + "loss": 0.3282, + "step": 561 + }, + { + "epoch": 3.923010004349717, + "grad_norm": 0.09062420344108967, + "learning_rate": 1.0665268444308366e-05, + "loss": 0.3305, + "step": 562 + }, + { + "epoch": 3.9299695519791213, + "grad_norm": 0.09673675762215994, + "learning_rate": 1.0532757277423019e-05, + "loss": 0.3291, + "step": 563 + }, + { + "epoch": 3.9369290996085256, + "grad_norm": 0.0933424618907197, + "learning_rate": 1.0400949533329653e-05, + "loss": 0.3414, + "step": 564 + }, + { + "epoch": 3.9438886472379293, + "grad_norm": 0.08961372593224715, + "learning_rate": 1.0269848358456743e-05, + "loss": 0.3262, + "step": 565 + }, + { + "epoch": 3.9508481948673335, + "grad_norm": 0.1180626958636412, + "learning_rate": 1.0139456882365981e-05, + "loss": 0.3379, + "step": 566 + }, + { + "epoch": 3.9578077424967377, + "grad_norm": 0.08706924109298092, + "learning_rate": 1.0009778217677617e-05, + "loss": 0.3356, + "step": 567 + }, + { + "epoch": 3.964767290126142, + "grad_norm": 0.09192179644041273, + "learning_rate": 9.880815459996102e-06, + "loss": 0.3353, + "step": 568 + }, + { + "epoch": 3.971726837755546, + "grad_norm": 0.08744584106099203, + "learning_rate": 9.752571687836267e-06, + "loss": 0.3275, + "step": 569 + }, + { + "epoch": 3.97868638538495, + "grad_norm": 0.09171104289414916, + 
"learning_rate": 9.625049962549768e-06, + "loss": 0.3334, + "step": 570 + }, + { + "epoch": 3.985645933014354, + "grad_norm": 0.09202503387494353, + "learning_rate": 9.498253328252023e-06, + "loss": 0.3311, + "step": 571 + }, + { + "epoch": 3.9926054806437583, + "grad_norm": 0.09047931089059162, + "learning_rate": 9.372184811749544e-06, + "loss": 0.3316, + "step": 572 + }, + { + "epoch": 4.003479773814702, + "grad_norm": 0.11221100584378, + "learning_rate": 9.246847422467718e-06, + "loss": 0.3252, + "step": 573 + }, + { + "epoch": 4.010439321444106, + "grad_norm": 0.14511168880726213, + "learning_rate": 9.122244152378919e-06, + "loss": 0.3121, + "step": 574 + }, + { + "epoch": 4.01739886907351, + "grad_norm": 0.11527962568493744, + "learning_rate": 8.998377975931096e-06, + "loss": 0.3038, + "step": 575 + }, + { + "epoch": 4.024358416702914, + "grad_norm": 0.10476073099305397, + "learning_rate": 8.875251849976823e-06, + "loss": 0.3086, + "step": 576 + }, + { + "epoch": 4.0313179643323185, + "grad_norm": 0.12889927756242409, + "learning_rate": 8.752868713702617e-06, + "loss": 0.3109, + "step": 577 + }, + { + "epoch": 4.038277511961723, + "grad_norm": 0.1290059820672822, + "learning_rate": 8.63123148855888e-06, + "loss": 0.3054, + "step": 578 + }, + { + "epoch": 4.045237059591127, + "grad_norm": 0.12434412735412093, + "learning_rate": 8.510343078190075e-06, + "loss": 0.3147, + "step": 579 + }, + { + "epoch": 4.052196607220531, + "grad_norm": 0.10748206301685143, + "learning_rate": 8.39020636836545e-06, + "loss": 0.3075, + "step": 580 + }, + { + "epoch": 4.059156154849934, + "grad_norm": 0.10972499765580897, + "learning_rate": 8.270824226910163e-06, + "loss": 0.3078, + "step": 581 + }, + { + "epoch": 4.066115702479339, + "grad_norm": 0.11672965349240796, + "learning_rate": 8.152199503636819e-06, + "loss": 0.3108, + "step": 582 + }, + { + "epoch": 4.073075250108743, + "grad_norm": 0.11513790902472179, + "learning_rate": 8.034335030277406e-06, + "loss": 0.3034, + "step": 583 + }, + { + "epoch": 4.080034797738147, + "grad_norm": 0.10714140673258672, + "learning_rate": 7.917233620415716e-06, + "loss": 0.3101, + "step": 584 + }, + { + "epoch": 4.086994345367551, + "grad_norm": 0.10443849628188653, + "learning_rate": 7.800898069420203e-06, + "loss": 0.3119, + "step": 585 + }, + { + "epoch": 4.0939538929969554, + "grad_norm": 0.10270537031965189, + "learning_rate": 7.685331154377254e-06, + "loss": 0.3108, + "step": 586 + }, + { + "epoch": 4.10091344062636, + "grad_norm": 0.10002934787588738, + "learning_rate": 7.570535634024847e-06, + "loss": 0.3116, + "step": 587 + }, + { + "epoch": 4.107872988255763, + "grad_norm": 0.1020168199335099, + "learning_rate": 7.456514248686737e-06, + "loss": 0.313, + "step": 588 + }, + { + "epoch": 4.114832535885167, + "grad_norm": 0.09475662900974856, + "learning_rate": 7.343269720207051e-06, + "loss": 0.3187, + "step": 589 + }, + { + "epoch": 4.121792083514571, + "grad_norm": 0.09367174029086878, + "learning_rate": 7.2308047518852895e-06, + "loss": 0.3054, + "step": 590 + }, + { + "epoch": 4.128751631143976, + "grad_norm": 0.09991070813606238, + "learning_rate": 7.119122028411798e-06, + "loss": 0.3094, + "step": 591 + }, + { + "epoch": 4.13571117877338, + "grad_norm": 0.0971805873557556, + "learning_rate": 7.008224215803672e-06, + "loss": 0.3149, + "step": 592 + }, + { + "epoch": 4.142670726402784, + "grad_norm": 0.0921029731957292, + "learning_rate": 6.898113961341128e-06, + "loss": 0.3101, + "step": 593 + }, + { + "epoch": 4.149630274032188, + "grad_norm": 
0.09523088486551998, + "learning_rate": 6.788793893504335e-06, + "loss": 0.3052, + "step": 594 + }, + { + "epoch": 4.156589821661592, + "grad_norm": 0.09591717117697106, + "learning_rate": 6.680266621910632e-06, + "loss": 0.3096, + "step": 595 + }, + { + "epoch": 4.163549369290996, + "grad_norm": 0.09084622178996993, + "learning_rate": 6.5725347372522204e-06, + "loss": 0.3137, + "step": 596 + }, + { + "epoch": 4.1705089169204, + "grad_norm": 0.08687253646904729, + "learning_rate": 6.465600811234356e-06, + "loss": 0.3108, + "step": 597 + }, + { + "epoch": 4.177468464549804, + "grad_norm": 0.09181114682155883, + "learning_rate": 6.3594673965139675e-06, + "loss": 0.3079, + "step": 598 + }, + { + "epoch": 4.184428012179208, + "grad_norm": 0.09294790321001072, + "learning_rate": 6.254137026638676e-06, + "loss": 0.3063, + "step": 599 + }, + { + "epoch": 4.1913875598086126, + "grad_norm": 0.08710789978827478, + "learning_rate": 6.149612215986334e-06, + "loss": 0.3067, + "step": 600 + }, + { + "epoch": 4.198347107438017, + "grad_norm": 0.08713529476536094, + "learning_rate": 6.045895459705042e-06, + "loss": 0.3106, + "step": 601 + }, + { + "epoch": 4.205306655067421, + "grad_norm": 0.09233253347150959, + "learning_rate": 5.94298923365352e-06, + "loss": 0.3075, + "step": 602 + }, + { + "epoch": 4.212266202696824, + "grad_norm": 0.08771080578385054, + "learning_rate": 5.840895994342068e-06, + "loss": 0.3115, + "step": 603 + }, + { + "epoch": 4.2192257503262285, + "grad_norm": 0.08701575520285076, + "learning_rate": 5.7396181788738735e-06, + "loss": 0.3115, + "step": 604 + }, + { + "epoch": 4.226185297955633, + "grad_norm": 0.08851880459624953, + "learning_rate": 5.639158204886861e-06, + "loss": 0.3135, + "step": 605 + }, + { + "epoch": 4.233144845585037, + "grad_norm": 0.09269071200856169, + "learning_rate": 5.539518470495991e-06, + "loss": 0.3122, + "step": 606 + }, + { + "epoch": 4.240104393214441, + "grad_norm": 0.08855364356761067, + "learning_rate": 5.440701354235995e-06, + "loss": 0.3064, + "step": 607 + }, + { + "epoch": 4.247063940843845, + "grad_norm": 0.08628014488530013, + "learning_rate": 5.3427092150045975e-06, + "loss": 0.3075, + "step": 608 + }, + { + "epoch": 4.2540234884732495, + "grad_norm": 0.09140056125384614, + "learning_rate": 5.24554439200621e-06, + "loss": 0.3094, + "step": 609 + }, + { + "epoch": 4.260983036102654, + "grad_norm": 0.09432054790072364, + "learning_rate": 5.149209204696073e-06, + "loss": 0.3129, + "step": 610 + }, + { + "epoch": 4.267942583732057, + "grad_norm": 0.08230546321550815, + "learning_rate": 5.05370595272495e-06, + "loss": 0.3129, + "step": 611 + }, + { + "epoch": 4.274902131361461, + "grad_norm": 0.08826471384141378, + "learning_rate": 4.959036915884134e-06, + "loss": 0.3176, + "step": 612 + }, + { + "epoch": 4.2818616789908654, + "grad_norm": 0.08473384022161361, + "learning_rate": 4.865204354051129e-06, + "loss": 0.3031, + "step": 613 + }, + { + "epoch": 4.28882122662027, + "grad_norm": 0.09157501033682776, + "learning_rate": 4.7722105071356065e-06, + "loss": 0.3083, + "step": 614 + }, + { + "epoch": 4.295780774249674, + "grad_norm": 0.08590767447738862, + "learning_rate": 4.68005759502602e-06, + "loss": 0.3089, + "step": 615 + }, + { + "epoch": 4.302740321879078, + "grad_norm": 0.0828024048843273, + "learning_rate": 4.588747817536563e-06, + "loss": 0.3157, + "step": 616 + }, + { + "epoch": 4.309699869508482, + "grad_norm": 0.08180623225652858, + "learning_rate": 4.498283354354654e-06, + "loss": 0.3049, + "step": 617 + }, + { + "epoch": 
4.3166594171378865, + "grad_norm": 0.08341811378979368, + "learning_rate": 4.408666364988938e-06, + "loss": 0.3146, + "step": 618 + }, + { + "epoch": 4.32361896476729, + "grad_norm": 0.08072676966602048, + "learning_rate": 4.31989898871771e-06, + "loss": 0.3121, + "step": 619 + }, + { + "epoch": 4.330578512396694, + "grad_norm": 0.07811857676852015, + "learning_rate": 4.231983344537875e-06, + "loss": 0.3056, + "step": 620 + }, + { + "epoch": 4.337538060026098, + "grad_norm": 0.07963712695352229, + "learning_rate": 4.144921531114317e-06, + "loss": 0.3092, + "step": 621 + }, + { + "epoch": 4.344497607655502, + "grad_norm": 0.08106713806356201, + "learning_rate": 4.058715626729837e-06, + "loss": 0.3087, + "step": 622 + }, + { + "epoch": 4.351457155284907, + "grad_norm": 0.0825669290953596, + "learning_rate": 3.973367689235548e-06, + "loss": 0.3124, + "step": 623 + }, + { + "epoch": 4.358416702914311, + "grad_norm": 0.08346710243966363, + "learning_rate": 3.888879756001726e-06, + "loss": 0.3097, + "step": 624 + }, + { + "epoch": 4.365376250543715, + "grad_norm": 0.07952645915127651, + "learning_rate": 3.805253843869179e-06, + "loss": 0.3082, + "step": 625 + }, + { + "epoch": 4.372335798173118, + "grad_norm": 0.07932372481956639, + "learning_rate": 3.72249194910113e-06, + "loss": 0.3172, + "step": 626 + }, + { + "epoch": 4.3792953458025226, + "grad_norm": 0.07754167774238424, + "learning_rate": 3.6405960473355183e-06, + "loss": 0.3082, + "step": 627 + }, + { + "epoch": 4.386254893431927, + "grad_norm": 0.08136667422744384, + "learning_rate": 3.5595680935378972e-06, + "loss": 0.3098, + "step": 628 + }, + { + "epoch": 4.393214441061331, + "grad_norm": 0.08485744461035595, + "learning_rate": 3.4794100219546967e-06, + "loss": 0.3132, + "step": 629 + }, + { + "epoch": 4.400173988690735, + "grad_norm": 0.07736893733783809, + "learning_rate": 3.400123746067099e-06, + "loss": 0.3057, + "step": 630 + }, + { + "epoch": 4.407133536320139, + "grad_norm": 0.07691011304383329, + "learning_rate": 3.321711158545351e-06, + "loss": 0.3092, + "step": 631 + }, + { + "epoch": 4.414093083949544, + "grad_norm": 0.07876827881942446, + "learning_rate": 3.2441741312036014e-06, + "loss": 0.309, + "step": 632 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.08089462204978613, + "learning_rate": 3.167514514955157e-06, + "loss": 0.3105, + "step": 633 + }, + { + "epoch": 4.428012179208351, + "grad_norm": 0.07721348081474763, + "learning_rate": 3.0917341397683633e-06, + "loss": 0.3071, + "step": 634 + }, + { + "epoch": 4.434971726837755, + "grad_norm": 0.07932348813099326, + "learning_rate": 3.0168348146228842e-06, + "loss": 0.3099, + "step": 635 + }, + { + "epoch": 4.4419312744671595, + "grad_norm": 0.08123974569538568, + "learning_rate": 2.942818327466559e-06, + "loss": 0.3102, + "step": 636 + }, + { + "epoch": 4.448890822096564, + "grad_norm": 0.0805284681969405, + "learning_rate": 2.8696864451726614e-06, + "loss": 0.3167, + "step": 637 + }, + { + "epoch": 4.455850369725968, + "grad_norm": 0.08047835245862313, + "learning_rate": 2.79744091349778e-06, + "loss": 0.3076, + "step": 638 + }, + { + "epoch": 4.462809917355372, + "grad_norm": 0.08129909351756673, + "learning_rate": 2.7260834570400986e-06, + "loss": 0.3124, + "step": 639 + }, + { + "epoch": 4.469769464984776, + "grad_norm": 0.07839675778245354, + "learning_rate": 2.6556157791982707e-06, + "loss": 0.3079, + "step": 640 + }, + { + "epoch": 4.4767290126141805, + "grad_norm": 0.08062781084505634, + "learning_rate": 2.586039562130722e-06, + "loss": 0.3047, + 
"step": 641 + }, + { + "epoch": 4.483688560243584, + "grad_norm": 0.0789139952040182, + "learning_rate": 2.5173564667155015e-06, + "loss": 0.3117, + "step": 642 + }, + { + "epoch": 4.490648107872988, + "grad_norm": 0.07782883382645976, + "learning_rate": 2.4495681325106535e-06, + "loss": 0.3086, + "step": 643 + }, + { + "epoch": 4.497607655502392, + "grad_norm": 0.07695405845578089, + "learning_rate": 2.3826761777150643e-06, + "loss": 0.3075, + "step": 644 + }, + { + "epoch": 4.5045672031317965, + "grad_norm": 0.0776536074585525, + "learning_rate": 2.3166821991298384e-06, + "loss": 0.3116, + "step": 645 + }, + { + "epoch": 4.511526750761201, + "grad_norm": 0.07992836130626074, + "learning_rate": 2.2515877721201697e-06, + "loss": 0.313, + "step": 646 + }, + { + "epoch": 4.518486298390605, + "grad_norm": 0.07576525240054546, + "learning_rate": 2.1873944505777447e-06, + "loss": 0.3097, + "step": 647 + }, + { + "epoch": 4.525445846020009, + "grad_norm": 0.08056446914044152, + "learning_rate": 2.124103766883661e-06, + "loss": 0.3093, + "step": 648 + }, + { + "epoch": 4.532405393649412, + "grad_norm": 0.07912041219537566, + "learning_rate": 2.0617172318718205e-06, + "loss": 0.3109, + "step": 649 + }, + { + "epoch": 4.539364941278817, + "grad_norm": 0.0748575568058156, + "learning_rate": 2.000236334792871e-06, + "loss": 0.306, + "step": 650 + }, + { + "epoch": 4.546324488908221, + "grad_norm": 0.07495506939333019, + "learning_rate": 1.9396625432786866e-06, + "loss": 0.308, + "step": 651 + }, + { + "epoch": 4.553284036537625, + "grad_norm": 0.07516098899508962, + "learning_rate": 1.879997303307297e-06, + "loss": 0.3132, + "step": 652 + }, + { + "epoch": 4.560243584167029, + "grad_norm": 0.0760900687853293, + "learning_rate": 1.8212420391683761e-06, + "loss": 0.312, + "step": 653 + }, + { + "epoch": 4.5672031317964334, + "grad_norm": 0.07405500093323297, + "learning_rate": 1.7633981534292565e-06, + "loss": 0.3101, + "step": 654 + }, + { + "epoch": 4.574162679425838, + "grad_norm": 0.07510600135021905, + "learning_rate": 1.7064670269014306e-06, + "loss": 0.3065, + "step": 655 + }, + { + "epoch": 4.581122227055241, + "grad_norm": 0.07279725597347841, + "learning_rate": 1.65045001860761e-06, + "loss": 0.3083, + "step": 656 + }, + { + "epoch": 4.588081774684645, + "grad_norm": 0.07713401251486195, + "learning_rate": 1.5953484657492734e-06, + "loss": 0.3129, + "step": 657 + }, + { + "epoch": 4.595041322314049, + "grad_norm": 0.07573166128441056, + "learning_rate": 1.5411636836747357e-06, + "loss": 0.3111, + "step": 658 + }, + { + "epoch": 4.602000869943454, + "grad_norm": 0.07751898750624993, + "learning_rate": 1.4878969658477505e-06, + "loss": 0.3151, + "step": 659 + }, + { + "epoch": 4.608960417572858, + "grad_norm": 0.07530355421456504, + "learning_rate": 1.435549583816669e-06, + "loss": 0.3081, + "step": 660 + }, + { + "epoch": 4.615919965202262, + "grad_norm": 0.07642926242619338, + "learning_rate": 1.3841227871840278e-06, + "loss": 0.3133, + "step": 661 + }, + { + "epoch": 4.622879512831666, + "grad_norm": 0.07705536264479143, + "learning_rate": 1.3336178035767612e-06, + "loss": 0.3094, + "step": 662 + }, + { + "epoch": 4.62983906046107, + "grad_norm": 0.07420116640392391, + "learning_rate": 1.2840358386168972e-06, + "loss": 0.3038, + "step": 663 + }, + { + "epoch": 4.636798608090475, + "grad_norm": 0.07614616846884953, + "learning_rate": 1.2353780758927347e-06, + "loss": 0.311, + "step": 664 + }, + { + "epoch": 4.643758155719878, + "grad_norm": 0.08312089631794453, + "learning_rate": 
1.1876456769306554e-06, + "loss": 0.3124, + "step": 665 + }, + { + "epoch": 4.650717703349282, + "grad_norm": 0.07487938485965528, + "learning_rate": 1.1408397811673376e-06, + "loss": 0.3105, + "step": 666 + }, + { + "epoch": 4.657677250978686, + "grad_norm": 0.07301485090244404, + "learning_rate": 1.0949615059225871e-06, + "loss": 0.3039, + "step": 667 + }, + { + "epoch": 4.6646367986080906, + "grad_norm": 0.07618631980688859, + "learning_rate": 1.0500119463726467e-06, + "loss": 0.3147, + "step": 668 + }, + { + "epoch": 4.671596346237495, + "grad_norm": 0.07505804558510307, + "learning_rate": 1.0059921755240797e-06, + "loss": 0.3114, + "step": 669 + }, + { + "epoch": 4.678555893866899, + "grad_norm": 0.07677226952718405, + "learning_rate": 9.62903244188147e-07, + "loss": 0.312, + "step": 670 + }, + { + "epoch": 4.685515441496303, + "grad_norm": 0.07748382882066837, + "learning_rate": 9.207461809556872e-07, + "loss": 0.3115, + "step": 671 + }, + { + "epoch": 4.6924749891257065, + "grad_norm": 0.07605829792945587, + "learning_rate": 8.795219921726139e-07, + "loss": 0.3122, + "step": 672 + }, + { + "epoch": 4.699434536755111, + "grad_norm": 0.07387581425092563, + "learning_rate": 8.392316619158669e-07, + "loss": 0.3074, + "step": 673 + }, + { + "epoch": 4.706394084384515, + "grad_norm": 0.07420555319544671, + "learning_rate": 7.998761519699205e-07, + "loss": 0.3107, + "step": 674 + }, + { + "epoch": 4.713353632013919, + "grad_norm": 0.07466571827662617, + "learning_rate": 7.61456401803824e-07, + "loss": 0.314, + "step": 675 + }, + { + "epoch": 4.720313179643323, + "grad_norm": 0.0731954788870046, + "learning_rate": 7.239733285487882e-07, + "loss": 0.3053, + "step": 676 + }, + { + "epoch": 4.7272727272727275, + "grad_norm": 0.07374103750191568, + "learning_rate": 6.874278269762924e-07, + "loss": 0.3098, + "step": 677 + }, + { + "epoch": 4.734232274902132, + "grad_norm": 0.07181151311493585, + "learning_rate": 6.518207694766965e-07, + "loss": 0.3111, + "step": 678 + }, + { + "epoch": 4.741191822531535, + "grad_norm": 0.0731098529031706, + "learning_rate": 6.171530060384445e-07, + "loss": 0.3057, + "step": 679 + }, + { + "epoch": 4.748151370160939, + "grad_norm": 0.07345303292289396, + "learning_rate": 5.834253642277655e-07, + "loss": 0.3085, + "step": 680 + }, + { + "epoch": 4.7551109177903434, + "grad_norm": 0.07333048475918498, + "learning_rate": 5.506386491689197e-07, + "loss": 0.307, + "step": 681 + }, + { + "epoch": 4.762070465419748, + "grad_norm": 0.07552740911471753, + "learning_rate": 5.187936435249796e-07, + "loss": 0.3086, + "step": 682 + }, + { + "epoch": 4.769030013049152, + "grad_norm": 0.07273500457357926, + "learning_rate": 4.878911074791371e-07, + "loss": 0.3121, + "step": 683 + }, + { + "epoch": 4.775989560678556, + "grad_norm": 0.0738654801444934, + "learning_rate": 4.57931778716576e-07, + "loss": 0.3062, + "step": 684 + }, + { + "epoch": 4.78294910830796, + "grad_norm": 0.07194970207986003, + "learning_rate": 4.2891637240684234e-07, + "loss": 0.3119, + "step": 685 + }, + { + "epoch": 4.789908655937364, + "grad_norm": 0.0747422813518781, + "learning_rate": 4.0084558118678173e-07, + "loss": 0.3081, + "step": 686 + }, + { + "epoch": 4.796868203566768, + "grad_norm": 0.07338451912151427, + "learning_rate": 3.7372007514401063e-07, + "loss": 0.3114, + "step": 687 + }, + { + "epoch": 4.803827751196172, + "grad_norm": 0.07345199197033869, + "learning_rate": 3.4754050180090704e-07, + "loss": 0.3016, + "step": 688 + }, + { + "epoch": 4.810787298825576, + "grad_norm": 
0.07227607987729133, + "learning_rate": 3.223074860991693e-07, + "loss": 0.3113, + "step": 689 + }, + { + "epoch": 4.81774684645498, + "grad_norm": 0.07278601000745541, + "learning_rate": 2.980216303848815e-07, + "loss": 0.3082, + "step": 690 + }, + { + "epoch": 4.824706394084385, + "grad_norm": 0.07611641795774764, + "learning_rate": 2.746835143941473e-07, + "loss": 0.3076, + "step": 691 + }, + { + "epoch": 4.831665941713789, + "grad_norm": 0.07190187291197621, + "learning_rate": 2.5229369523923853e-07, + "loss": 0.3079, + "step": 692 + }, + { + "epoch": 4.838625489343193, + "grad_norm": 0.0739457194295076, + "learning_rate": 2.3085270739531706e-07, + "loss": 0.3095, + "step": 693 + }, + { + "epoch": 4.845585036972597, + "grad_norm": 0.07353977111327974, + "learning_rate": 2.1036106268765398e-07, + "loss": 0.3087, + "step": 694 + }, + { + "epoch": 4.8525445846020006, + "grad_norm": 0.07274021557955242, + "learning_rate": 1.908192502794215e-07, + "loss": 0.3104, + "step": 695 + }, + { + "epoch": 4.859504132231405, + "grad_norm": 0.07265017310099392, + "learning_rate": 1.7222773666001336e-07, + "loss": 0.3113, + "step": 696 + }, + { + "epoch": 4.866463679860809, + "grad_norm": 0.0713230748017762, + "learning_rate": 1.545869656339072e-07, + "loss": 0.3086, + "step": 697 + }, + { + "epoch": 4.873423227490213, + "grad_norm": 0.07442042787119083, + "learning_rate": 1.3789735831009064e-07, + "loss": 0.3124, + "step": 698 + }, + { + "epoch": 4.880382775119617, + "grad_norm": 0.0737173934874928, + "learning_rate": 1.2215931309197626e-07, + "loss": 0.3092, + "step": 699 + }, + { + "epoch": 4.887342322749022, + "grad_norm": 0.07197265372516874, + "learning_rate": 1.0737320566790221e-07, + "loss": 0.311, + "step": 700 + }, + { + "epoch": 4.894301870378426, + "grad_norm": 0.07270890000366599, + "learning_rate": 9.35393890021885e-08, + "loss": 0.3135, + "step": 701 + }, + { + "epoch": 4.901261418007829, + "grad_norm": 0.07164493485234402, + "learning_rate": 8.065819332667702e-08, + "loss": 0.3063, + "step": 702 + }, + { + "epoch": 4.908220965637233, + "grad_norm": 0.07345553241814261, + "learning_rate": 6.872992613286223e-08, + "loss": 0.311, + "step": 703 + }, + { + "epoch": 4.9151805132666375, + "grad_norm": 0.0738265738394693, + "learning_rate": 5.775487216456377e-08, + "loss": 0.3123, + "step": 704 + }, + { + "epoch": 4.922140060896042, + "grad_norm": 0.07288034052473, + "learning_rate": 4.7733293411105216e-08, + "loss": 0.3155, + "step": 705 + }, + { + "epoch": 4.929099608525446, + "grad_norm": 0.0763072402074743, + "learning_rate": 3.8665429101070185e-08, + "loss": 0.3044, + "step": 706 + }, + { + "epoch": 4.93605915615485, + "grad_norm": 0.07435037595475792, + "learning_rate": 3.055149569660909e-08, + "loss": 0.3121, + "step": 707 + }, + { + "epoch": 4.943018703784254, + "grad_norm": 0.07118427621968816, + "learning_rate": 2.3391686888238894e-08, + "loss": 0.3123, + "step": 708 + }, + { + "epoch": 4.949978251413658, + "grad_norm": 0.07221686597134709, + "learning_rate": 1.7186173590251208e-08, + "loss": 0.3037, + "step": 709 + }, + { + "epoch": 4.956937799043062, + "grad_norm": 0.07211936196974679, + "learning_rate": 1.1935103936600023e-08, + "loss": 0.3109, + "step": 710 + }, + { + "epoch": 4.963897346672466, + "grad_norm": 0.07300879682236437, + "learning_rate": 7.63860327740229e-09, + "loss": 0.315, + "step": 711 + }, + { + "epoch": 4.97085689430187, + "grad_norm": 0.0746678008435265, + "learning_rate": 4.296774175918117e-09, + "loss": 0.3114, + "step": 712 + }, + { + "epoch": 
4.9778164419312745, + "grad_norm": 0.07379213535477022, + "learning_rate": 1.909696406103834e-09, + "loss": 0.3132, + "step": 713 + }, + { + "epoch": 4.984775989560679, + "grad_norm": 0.072449627565001, + "learning_rate": 4.77426950733495e-10, + "loss": 0.3053, + "step": 714 + }, + { + "epoch": 4.991735537190083, + "grad_norm": 0.07321298642770968, + "learning_rate": 0.0, + "loss": 0.3097, + "step": 715 + }, + { + "epoch": 4.991735537190083, + "step": 715, + "total_flos": 1.839907874248065e+19, + "train_loss": 0.3850101018285418, + "train_runtime": 71630.1527, + "train_samples_per_second": 5.133, + "train_steps_per_second": 0.01 + } + ], + "logging_steps": 1, + "max_steps": 715, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.839907874248065e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}
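Not part of the diff above, but as an illustrative sketch of how this state file can be consumed: the `log_history` array holds one record per logged step (with `loss`, `learning_rate`, `step`) plus a final summary record (with `train_loss`, `train_runtime`). The snippet below assumes the file is saved locally as `trainer_state.json`; the field names are taken directly from the file, everything else (filename, printing choices) is an assumption.

```python
# Sketch: load the trainer state dumped above and summarize the loss curve.
# Field names ("log_history", "loss", "step", "learning_rate", "train_loss",
# "train_runtime") come from the file itself; the path is an assumption.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step records carry "loss"; the trailing summary record carries
# "train_loss"/"train_runtime" instead, so filter on the "loss" key.
steps = [(e["step"], e["loss"], e["learning_rate"])
         for e in state["log_history"] if "loss" in e]

first_step, first_loss, _ = steps[0]
last_step, last_loss, _ = steps[-1]
print(f"logged steps: {len(steps)} (step {first_step} -> step {last_step})")
print(f"loss: {first_loss:.4f} -> {last_loss:.4f}")

summary = state["log_history"][-1]
print("averaged train_loss:", summary.get("train_loss"))
print("train_runtime (s):", summary.get("train_runtime"))
```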