|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.987654320987654, |
|
"eval_steps": 500, |
|
"global_step": 225, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02194787379972565, |
|
"grad_norm": 6.221677266588304, |
|
"learning_rate": 3.4782608695652175e-06, |
|
"loss": 0.8718, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0438957475994513, |
|
"grad_norm": 6.253264896473354, |
|
"learning_rate": 6.956521739130435e-06, |
|
"loss": 0.8766, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.06584362139917696, |
|
"grad_norm": 5.741674847301876, |
|
"learning_rate": 1.0434782608695653e-05, |
|
"loss": 0.8549, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0877914951989026, |
|
"grad_norm": 2.3877482149850215, |
|
"learning_rate": 1.391304347826087e-05, |
|
"loss": 0.7607, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.10973936899862825, |
|
"grad_norm": 4.563966782136464, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 0.7707, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.13168724279835392, |
|
"grad_norm": 5.736100045190483, |
|
"learning_rate": 2.0869565217391306e-05, |
|
"loss": 0.7229, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.15363511659807957, |
|
"grad_norm": 5.958244309479874, |
|
"learning_rate": 2.4347826086956526e-05, |
|
"loss": 0.7388, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1755829903978052, |
|
"grad_norm": 2.9204724652960246, |
|
"learning_rate": 2.782608695652174e-05, |
|
"loss": 0.6832, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 2.3986017638483466, |
|
"learning_rate": 3.130434782608696e-05, |
|
"loss": 0.649, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2194787379972565, |
|
"grad_norm": 1.6178731632508974, |
|
"learning_rate": 3.478260869565218e-05, |
|
"loss": 0.6185, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.24142661179698216, |
|
"grad_norm": 1.6322225805622788, |
|
"learning_rate": 3.8260869565217395e-05, |
|
"loss": 0.6045, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.26337448559670784, |
|
"grad_norm": 1.538396522627224, |
|
"learning_rate": 4.173913043478261e-05, |
|
"loss": 0.5934, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.28532235939643347, |
|
"grad_norm": 1.765556579505549, |
|
"learning_rate": 4.521739130434783e-05, |
|
"loss": 0.6012, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.30727023319615915, |
|
"grad_norm": 1.9039962720526176, |
|
"learning_rate": 4.869565217391305e-05, |
|
"loss": 0.5794, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.3292181069958848, |
|
"grad_norm": 0.9685232076545514, |
|
"learning_rate": 5.217391304347826e-05, |
|
"loss": 0.5783, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.3511659807956104, |
|
"grad_norm": 2.3819790183508056, |
|
"learning_rate": 5.565217391304348e-05, |
|
"loss": 0.586, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3731138545953361, |
|
"grad_norm": 1.4218859481860902, |
|
"learning_rate": 5.91304347826087e-05, |
|
"loss": 0.5584, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.3950617283950617, |
|
"grad_norm": 1.705949751999593, |
|
"learning_rate": 6.260869565217392e-05, |
|
"loss": 0.5785, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.4170096021947874, |
|
"grad_norm": 1.4612385852414467, |
|
"learning_rate": 6.608695652173914e-05, |
|
"loss": 0.562, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.438957475994513, |
|
"grad_norm": 0.9429650218903195, |
|
"learning_rate": 6.956521739130436e-05, |
|
"loss": 0.5586, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4609053497942387, |
|
"grad_norm": 1.8504062874353713, |
|
"learning_rate": 7.304347826086957e-05, |
|
"loss": 0.5648, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.4828532235939643, |
|
"grad_norm": 1.0359118447076778, |
|
"learning_rate": 7.652173913043479e-05, |
|
"loss": 0.5448, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.50480109739369, |
|
"grad_norm": 1.6864037499836835, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5592, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.5267489711934157, |
|
"grad_norm": 1.2434910951524751, |
|
"learning_rate": 7.999516253040599e-05, |
|
"loss": 0.5427, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.5486968449931413, |
|
"grad_norm": 1.3000636051596768, |
|
"learning_rate": 7.998065129167953e-05, |
|
"loss": 0.5373, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.5706447187928669, |
|
"grad_norm": 2.684317103873129, |
|
"learning_rate": 7.995646979370445e-05, |
|
"loss": 0.5399, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 1.4021363857201035, |
|
"learning_rate": 7.992262388534378e-05, |
|
"loss": 0.5434, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.6145404663923183, |
|
"grad_norm": 1.9180218278821586, |
|
"learning_rate": 7.987912175302518e-05, |
|
"loss": 0.5344, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.6364883401920439, |
|
"grad_norm": 1.152289451077819, |
|
"learning_rate": 7.982597391876076e-05, |
|
"loss": 0.531, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.6584362139917695, |
|
"grad_norm": 1.8585951370881602, |
|
"learning_rate": 7.976319323760211e-05, |
|
"loss": 0.5416, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6803840877914952, |
|
"grad_norm": 1.3000646333049437, |
|
"learning_rate": 7.969079489453107e-05, |
|
"loss": 0.5263, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.7023319615912208, |
|
"grad_norm": 1.6442425191809935, |
|
"learning_rate": 7.960879640078679e-05, |
|
"loss": 0.5282, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.7242798353909465, |
|
"grad_norm": 1.3282337661300934, |
|
"learning_rate": 7.951721758963028e-05, |
|
"loss": 0.5125, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.7462277091906722, |
|
"grad_norm": 1.1249842886051156, |
|
"learning_rate": 7.941608061154726e-05, |
|
"loss": 0.5066, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.7681755829903978, |
|
"grad_norm": 0.9153543962795861, |
|
"learning_rate": 7.930540992889056e-05, |
|
"loss": 0.5041, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.7901234567901234, |
|
"grad_norm": 0.8714948501569295, |
|
"learning_rate": 7.918523230996327e-05, |
|
"loss": 0.5078, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.8120713305898491, |
|
"grad_norm": 1.17614468677297, |
|
"learning_rate": 7.905557682254429e-05, |
|
"loss": 0.5132, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.8340192043895748, |
|
"grad_norm": 0.9070928796685083, |
|
"learning_rate": 7.891647482685751e-05, |
|
"loss": 0.4954, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.8559670781893004, |
|
"grad_norm": 0.6348754426964612, |
|
"learning_rate": 7.876795996798665e-05, |
|
"loss": 0.5002, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.877914951989026, |
|
"grad_norm": 0.7240683284862476, |
|
"learning_rate": 7.861006816773743e-05, |
|
"loss": 0.4974, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8998628257887518, |
|
"grad_norm": 0.6967632259977047, |
|
"learning_rate": 7.844283761594899e-05, |
|
"loss": 0.4916, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.9218106995884774, |
|
"grad_norm": 0.7523038917826653, |
|
"learning_rate": 7.82663087612568e-05, |
|
"loss": 0.4943, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.943758573388203, |
|
"grad_norm": 0.9009479018151323, |
|
"learning_rate": 7.80805243013092e-05, |
|
"loss": 0.4948, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.9657064471879286, |
|
"grad_norm": 1.1416188350869176, |
|
"learning_rate": 7.788552917244002e-05, |
|
"loss": 0.4963, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.9876543209876543, |
|
"grad_norm": 0.7724926247499572, |
|
"learning_rate": 7.768137053879957e-05, |
|
"loss": 0.4827, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.00960219478738, |
|
"grad_norm": 0.7378875298449712, |
|
"learning_rate": 7.7468097780947e-05, |
|
"loss": 0.4808, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.0315500685871055, |
|
"grad_norm": 0.8154669296425998, |
|
"learning_rate": 7.724576248390639e-05, |
|
"loss": 0.4702, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.0534979423868314, |
|
"grad_norm": 0.7479771108510165, |
|
"learning_rate": 7.701441842468968e-05, |
|
"loss": 0.457, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.075445816186557, |
|
"grad_norm": 0.8109214505078363, |
|
"learning_rate": 7.677412155928946e-05, |
|
"loss": 0.4581, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.0973936899862826, |
|
"grad_norm": 0.620878499140312, |
|
"learning_rate": 7.652493000914476e-05, |
|
"loss": 0.4643, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1193415637860082, |
|
"grad_norm": 0.41252079635226235, |
|
"learning_rate": 7.62669040470829e-05, |
|
"loss": 0.4643, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.1412894375857339, |
|
"grad_norm": 0.512997391865451, |
|
"learning_rate": 7.60001060827412e-05, |
|
"loss": 0.4566, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.1632373113854595, |
|
"grad_norm": 0.5520989749480719, |
|
"learning_rate": 7.572460064747167e-05, |
|
"loss": 0.4553, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 0.48682230835165313, |
|
"learning_rate": 7.544045437873259e-05, |
|
"loss": 0.4503, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.2071330589849107, |
|
"grad_norm": 0.40890214708757283, |
|
"learning_rate": 7.514773600397076e-05, |
|
"loss": 0.449, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.2290809327846364, |
|
"grad_norm": 0.4496051089290013, |
|
"learning_rate": 7.484651632399802e-05, |
|
"loss": 0.4486, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.2510288065843622, |
|
"grad_norm": 0.3385328491192872, |
|
"learning_rate": 7.453686819586655e-05, |
|
"loss": 0.4473, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.2729766803840878, |
|
"grad_norm": 0.5042801234415435, |
|
"learning_rate": 7.421886651524656e-05, |
|
"loss": 0.4419, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.2949245541838135, |
|
"grad_norm": 0.4013529469572051, |
|
"learning_rate": 7.38925881983111e-05, |
|
"loss": 0.4463, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.316872427983539, |
|
"grad_norm": 0.3526194553768413, |
|
"learning_rate": 7.355811216313206e-05, |
|
"loss": 0.4446, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3388203017832647, |
|
"grad_norm": 0.3798310275778352, |
|
"learning_rate": 7.321551931059191e-05, |
|
"loss": 0.4548, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 1.3607681755829903, |
|
"grad_norm": 0.4050313385315871, |
|
"learning_rate": 7.286489250481604e-05, |
|
"loss": 0.4473, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.382716049382716, |
|
"grad_norm": 0.5307566330866562, |
|
"learning_rate": 7.250631655313001e-05, |
|
"loss": 0.4428, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 1.4046639231824418, |
|
"grad_norm": 0.5886517904245452, |
|
"learning_rate": 7.213987818554704e-05, |
|
"loss": 0.4439, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.4266117969821672, |
|
"grad_norm": 0.7870712747634117, |
|
"learning_rate": 7.176566603379015e-05, |
|
"loss": 0.4553, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.448559670781893, |
|
"grad_norm": 1.0045619121977905, |
|
"learning_rate": 7.138377060985465e-05, |
|
"loss": 0.4467, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.4705075445816187, |
|
"grad_norm": 1.0300458801690289, |
|
"learning_rate": 7.09942842841156e-05, |
|
"loss": 0.4481, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 1.4924554183813443, |
|
"grad_norm": 0.6788257591381656, |
|
"learning_rate": 7.059730126298591e-05, |
|
"loss": 0.4446, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.51440329218107, |
|
"grad_norm": 0.5581173116767874, |
|
"learning_rate": 7.019291756613029e-05, |
|
"loss": 0.4397, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 1.5363511659807956, |
|
"grad_norm": 0.740704532124767, |
|
"learning_rate": 6.978123100324061e-05, |
|
"loss": 0.4402, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5582990397805214, |
|
"grad_norm": 0.7781231979915648, |
|
"learning_rate": 6.936234115037842e-05, |
|
"loss": 0.4429, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 1.5802469135802468, |
|
"grad_norm": 0.6101020638409748, |
|
"learning_rate": 6.893634932589e-05, |
|
"loss": 0.4433, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.6021947873799727, |
|
"grad_norm": 0.6338457181714169, |
|
"learning_rate": 6.85033585659003e-05, |
|
"loss": 0.4389, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 1.624142661179698, |
|
"grad_norm": 0.6492977834572081, |
|
"learning_rate": 6.806347359939107e-05, |
|
"loss": 0.4351, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.646090534979424, |
|
"grad_norm": 0.392861233479167, |
|
"learning_rate": 6.761680082286988e-05, |
|
"loss": 0.4399, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.6680384087791496, |
|
"grad_norm": 0.4720757546111325, |
|
"learning_rate": 6.716344827463545e-05, |
|
"loss": 0.4349, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.6899862825788752, |
|
"grad_norm": 0.518992673759433, |
|
"learning_rate": 6.670352560864615e-05, |
|
"loss": 0.4384, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 1.7119341563786008, |
|
"grad_norm": 0.3064648926060582, |
|
"learning_rate": 6.62371440679976e-05, |
|
"loss": 0.4298, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.7338820301783264, |
|
"grad_norm": 0.2952380631214821, |
|
"learning_rate": 6.576441645801592e-05, |
|
"loss": 0.4384, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 1.7558299039780523, |
|
"grad_norm": 0.3619917164563617, |
|
"learning_rate": 6.528545711897307e-05, |
|
"loss": 0.4367, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.3352348275889567, |
|
"learning_rate": 6.480038189843101e-05, |
|
"loss": 0.4411, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 1.7997256515775035, |
|
"grad_norm": 0.3510930618032657, |
|
"learning_rate": 6.430930812322127e-05, |
|
"loss": 0.4386, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.821673525377229, |
|
"grad_norm": 0.4010609683995807, |
|
"learning_rate": 6.381235457106664e-05, |
|
"loss": 0.4298, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.8436213991769548, |
|
"grad_norm": 0.36451457683299865, |
|
"learning_rate": 6.330964144185204e-05, |
|
"loss": 0.4371, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.8655692729766804, |
|
"grad_norm": 0.27838534336167414, |
|
"learning_rate": 6.280129032855132e-05, |
|
"loss": 0.4305, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.887517146776406, |
|
"grad_norm": 0.2966462979039974, |
|
"learning_rate": 6.228742418781714e-05, |
|
"loss": 0.4388, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.9094650205761317, |
|
"grad_norm": 0.26944878326607974, |
|
"learning_rate": 6.17681673102411e-05, |
|
"loss": 0.4345, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 1.9314128943758573, |
|
"grad_norm": 0.2712568832140498, |
|
"learning_rate": 6.1243645290291e-05, |
|
"loss": 0.4308, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.9533607681755831, |
|
"grad_norm": 0.2866166967589882, |
|
"learning_rate": 6.0713984995933016e-05, |
|
"loss": 0.4325, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 1.9753086419753085, |
|
"grad_norm": 0.23491982047748894, |
|
"learning_rate": 6.01793145379456e-05, |
|
"loss": 0.4321, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9972565157750344, |
|
"grad_norm": 0.205586198763508, |
|
"learning_rate": 5.9639763238932893e-05, |
|
"loss": 0.4345, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.01920438957476, |
|
"grad_norm": 0.27481178105933207, |
|
"learning_rate": 5.909546160204508e-05, |
|
"loss": 0.4013, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 2.0411522633744856, |
|
"grad_norm": 0.30806418646748673, |
|
"learning_rate": 5.8546541279413094e-05, |
|
"loss": 0.3976, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 2.063100137174211, |
|
"grad_norm": 0.3291669273065511, |
|
"learning_rate": 5.799313504030545e-05, |
|
"loss": 0.3947, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 2.085048010973937, |
|
"grad_norm": 0.37474732162243923, |
|
"learning_rate": 5.743537673901485e-05, |
|
"loss": 0.3971, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.1069958847736627, |
|
"grad_norm": 0.46251544205958794, |
|
"learning_rate": 5.68734012824825e-05, |
|
"loss": 0.396, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 2.128943758573388, |
|
"grad_norm": 0.54656137847099, |
|
"learning_rate": 5.6307344597667555e-05, |
|
"loss": 0.4027, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 2.150891632373114, |
|
"grad_norm": 0.6049235984601056, |
|
"learning_rate": 5.5737343598670104e-05, |
|
"loss": 0.4005, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 2.1728395061728394, |
|
"grad_norm": 0.5234566946423048, |
|
"learning_rate": 5.5163536153615185e-05, |
|
"loss": 0.3955, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 2.1947873799725652, |
|
"grad_norm": 0.4013877298974358, |
|
"learning_rate": 5.4586061051306204e-05, |
|
"loss": 0.4028, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.2167352537722906, |
|
"grad_norm": 0.3845465210881421, |
|
"learning_rate": 5.4005057967655634e-05, |
|
"loss": 0.3976, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 2.2386831275720165, |
|
"grad_norm": 0.408459585720323, |
|
"learning_rate": 5.3420667431901e-05, |
|
"loss": 0.3975, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.260631001371742, |
|
"grad_norm": 0.37864175367916847, |
|
"learning_rate": 5.283303079261471e-05, |
|
"loss": 0.3936, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 2.2825788751714677, |
|
"grad_norm": 0.3827028198880068, |
|
"learning_rate": 5.22422901835155e-05, |
|
"loss": 0.3936, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.3045267489711936, |
|
"grad_norm": 0.3484431146343949, |
|
"learning_rate": 5.164858848909009e-05, |
|
"loss": 0.388, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.326474622770919, |
|
"grad_norm": 0.3095716919680074, |
|
"learning_rate": 5.1052069310033216e-05, |
|
"loss": 0.3988, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.348422496570645, |
|
"grad_norm": 0.39008928135047216, |
|
"learning_rate": 5.0452876928514434e-05, |
|
"loss": 0.3966, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 2.3703703703703702, |
|
"grad_norm": 0.44827732087715033, |
|
"learning_rate": 4.9851156273280064e-05, |
|
"loss": 0.3953, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.392318244170096, |
|
"grad_norm": 0.2796243136669204, |
|
"learning_rate": 4.92470528845988e-05, |
|
"loss": 0.3964, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 2.4142661179698215, |
|
"grad_norm": 0.3186398495286058, |
|
"learning_rate": 4.8640712879059354e-05, |
|
"loss": 0.3923, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.4362139917695473, |
|
"grad_norm": 0.35322375158512703, |
|
"learning_rate": 4.8032282914228743e-05, |
|
"loss": 0.3886, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 2.4581618655692727, |
|
"grad_norm": 0.18461936703017578, |
|
"learning_rate": 4.742191015317974e-05, |
|
"loss": 0.3917, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.4801097393689986, |
|
"grad_norm": 0.33683641542461873, |
|
"learning_rate": 4.680974222889595e-05, |
|
"loss": 0.394, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 2.5020576131687244, |
|
"grad_norm": 0.3169424542788605, |
|
"learning_rate": 4.6195927208563394e-05, |
|
"loss": 0.3871, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.52400548696845, |
|
"grad_norm": 0.15430100844009037, |
|
"learning_rate": 4.558061355775693e-05, |
|
"loss": 0.3857, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.5459533607681757, |
|
"grad_norm": 0.2820303722197663, |
|
"learning_rate": 4.496395010453038e-05, |
|
"loss": 0.3886, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.567901234567901, |
|
"grad_norm": 0.24770492735610264, |
|
"learning_rate": 4.4346086003418985e-05, |
|
"loss": 0.3882, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 2.589849108367627, |
|
"grad_norm": 0.15993713137779939, |
|
"learning_rate": 4.372717069936287e-05, |
|
"loss": 0.3891, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.611796982167353, |
|
"grad_norm": 0.24474244306163354, |
|
"learning_rate": 4.310735389156026e-05, |
|
"loss": 0.392, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 2.633744855967078, |
|
"grad_norm": 0.1905642453010369, |
|
"learning_rate": 4.248678549725923e-05, |
|
"loss": 0.3917, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6556927297668036, |
|
"grad_norm": 0.15617551076050876, |
|
"learning_rate": 4.18656156154967e-05, |
|
"loss": 0.3945, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 2.6776406035665294, |
|
"grad_norm": 0.21098249998156268, |
|
"learning_rate": 4.124399449079348e-05, |
|
"loss": 0.3866, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.6995884773662553, |
|
"grad_norm": 0.17151651983910782, |
|
"learning_rate": 4.0622072476814045e-05, |
|
"loss": 0.3872, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 2.7215363511659807, |
|
"grad_norm": 0.14636727205415834, |
|
"learning_rate": 4e-05, |
|
"loss": 0.3919, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.7434842249657065, |
|
"grad_norm": 0.21883566171534793, |
|
"learning_rate": 3.937792752318597e-05, |
|
"loss": 0.3854, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.765432098765432, |
|
"grad_norm": 0.15504692956699287, |
|
"learning_rate": 3.8756005509206535e-05, |
|
"loss": 0.3883, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.787379972565158, |
|
"grad_norm": 0.21026818171396724, |
|
"learning_rate": 3.81343843845033e-05, |
|
"loss": 0.3883, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 2.8093278463648836, |
|
"grad_norm": 0.1846577189184653, |
|
"learning_rate": 3.751321450274078e-05, |
|
"loss": 0.3921, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.831275720164609, |
|
"grad_norm": 0.16874522256869426, |
|
"learning_rate": 3.689264610843975e-05, |
|
"loss": 0.3911, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 2.8532235939643344, |
|
"grad_norm": 0.16523585002563423, |
|
"learning_rate": 3.627282930063714e-05, |
|
"loss": 0.3886, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8751714677640603, |
|
"grad_norm": 0.14959343517939727, |
|
"learning_rate": 3.565391399658102e-05, |
|
"loss": 0.392, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 2.897119341563786, |
|
"grad_norm": 0.13667256750591955, |
|
"learning_rate": 3.503604989546963e-05, |
|
"loss": 0.3846, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.9190672153635115, |
|
"grad_norm": 0.14202188950732816, |
|
"learning_rate": 3.4419386442243084e-05, |
|
"loss": 0.3902, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 2.9410150891632374, |
|
"grad_norm": 0.14821759155695716, |
|
"learning_rate": 3.380407279143661e-05, |
|
"loss": 0.3867, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.962962962962963, |
|
"grad_norm": 0.1667332312440918, |
|
"learning_rate": 3.3190257771104055e-05, |
|
"loss": 0.3927, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.9849108367626886, |
|
"grad_norm": 0.14452591810498722, |
|
"learning_rate": 3.257808984682027e-05, |
|
"loss": 0.3855, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 3.006858710562414, |
|
"grad_norm": 0.18251829909357556, |
|
"learning_rate": 3.196771708577127e-05, |
|
"loss": 0.3837, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 3.02880658436214, |
|
"grad_norm": 0.17364339659347747, |
|
"learning_rate": 3.135928712094067e-05, |
|
"loss": 0.359, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 3.0507544581618657, |
|
"grad_norm": 0.17747923559690468, |
|
"learning_rate": 3.075294711540123e-05, |
|
"loss": 0.3557, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 3.072702331961591, |
|
"grad_norm": 0.1787283180499878, |
|
"learning_rate": 3.0148843726719953e-05, |
|
"loss": 0.3516, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.094650205761317, |
|
"grad_norm": 0.19311413443289224, |
|
"learning_rate": 2.9547123071485586e-05, |
|
"loss": 0.3532, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 3.1165980795610424, |
|
"grad_norm": 0.15814186428435004, |
|
"learning_rate": 2.8947930689966798e-05, |
|
"loss": 0.3598, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 3.1385459533607682, |
|
"grad_norm": 0.17222178104775263, |
|
"learning_rate": 2.8351411510909926e-05, |
|
"loss": 0.3567, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 3.1604938271604937, |
|
"grad_norm": 0.15304672551321433, |
|
"learning_rate": 2.7757709816484512e-05, |
|
"loss": 0.3634, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 3.1824417009602195, |
|
"grad_norm": 0.15457992995371023, |
|
"learning_rate": 2.71669692073853e-05, |
|
"loss": 0.3542, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 3.2043895747599453, |
|
"grad_norm": 0.1479490816574568, |
|
"learning_rate": 2.6579332568099006e-05, |
|
"loss": 0.3576, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 3.2263374485596708, |
|
"grad_norm": 0.15469338097636914, |
|
"learning_rate": 2.5994942032344376e-05, |
|
"loss": 0.357, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 3.2482853223593966, |
|
"grad_norm": 0.13891659760939676, |
|
"learning_rate": 2.54139389486938e-05, |
|
"loss": 0.3559, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 3.270233196159122, |
|
"grad_norm": 0.143615476750949, |
|
"learning_rate": 2.4836463846384832e-05, |
|
"loss": 0.3529, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 3.292181069958848, |
|
"grad_norm": 0.1420809451110692, |
|
"learning_rate": 2.4262656401329913e-05, |
|
"loss": 0.359, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.3141289437585733, |
|
"grad_norm": 0.14089610783501166, |
|
"learning_rate": 2.3692655402332455e-05, |
|
"loss": 0.3565, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 3.336076817558299, |
|
"grad_norm": 0.1402483457342098, |
|
"learning_rate": 2.3126598717517514e-05, |
|
"loss": 0.3508, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 3.3580246913580245, |
|
"grad_norm": 0.12317925977087589, |
|
"learning_rate": 2.256462326098516e-05, |
|
"loss": 0.3558, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 3.3799725651577504, |
|
"grad_norm": 0.1452523632260013, |
|
"learning_rate": 2.200686495969457e-05, |
|
"loss": 0.3546, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 3.401920438957476, |
|
"grad_norm": 0.11788796867275445, |
|
"learning_rate": 2.1453458720586902e-05, |
|
"loss": 0.3562, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 3.4238683127572016, |
|
"grad_norm": 0.14156518711907154, |
|
"learning_rate": 2.0904538397954913e-05, |
|
"loss": 0.3595, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 3.4458161865569275, |
|
"grad_norm": 0.1152521891751902, |
|
"learning_rate": 2.0360236761067117e-05, |
|
"loss": 0.3572, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 3.467764060356653, |
|
"grad_norm": 0.1292120630664868, |
|
"learning_rate": 1.9820685462054413e-05, |
|
"loss": 0.3608, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 3.4897119341563787, |
|
"grad_norm": 0.12307673484565831, |
|
"learning_rate": 1.9286015004066984e-05, |
|
"loss": 0.3571, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 3.511659807956104, |
|
"grad_norm": 0.11387530947074216, |
|
"learning_rate": 1.8756354709708998e-05, |
|
"loss": 0.3593, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.53360768175583, |
|
"grad_norm": 0.1276131097306211, |
|
"learning_rate": 1.8231832689758903e-05, |
|
"loss": 0.3528, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 3.5555555555555554, |
|
"grad_norm": 0.11562929678821103, |
|
"learning_rate": 1.771257581218287e-05, |
|
"loss": 0.3562, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 3.577503429355281, |
|
"grad_norm": 0.11195216588496365, |
|
"learning_rate": 1.7198709671448696e-05, |
|
"loss": 0.3568, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 3.599451303155007, |
|
"grad_norm": 0.10790427870554664, |
|
"learning_rate": 1.6690358558147967e-05, |
|
"loss": 0.3522, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 3.6213991769547325, |
|
"grad_norm": 0.11612616520204616, |
|
"learning_rate": 1.6187645428933372e-05, |
|
"loss": 0.3584, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.6433470507544583, |
|
"grad_norm": 0.11400211648602655, |
|
"learning_rate": 1.5690691876778746e-05, |
|
"loss": 0.3622, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 3.6652949245541837, |
|
"grad_norm": 0.1065206298660862, |
|
"learning_rate": 1.5199618101569003e-05, |
|
"loss": 0.3544, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 3.6872427983539096, |
|
"grad_norm": 0.10461159135503716, |
|
"learning_rate": 1.4714542881026947e-05, |
|
"loss": 0.3519, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 3.709190672153635, |
|
"grad_norm": 0.12285024075507715, |
|
"learning_rate": 1.4235583541984092e-05, |
|
"loss": 0.3621, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 3.731138545953361, |
|
"grad_norm": 0.10841173453177154, |
|
"learning_rate": 1.3762855932002404e-05, |
|
"loss": 0.3554, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.753086419753086, |
|
"grad_norm": 0.10185093830948505, |
|
"learning_rate": 1.3296474391353854e-05, |
|
"loss": 0.3602, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 3.775034293552812, |
|
"grad_norm": 0.10890679245723875, |
|
"learning_rate": 1.2836551725364572e-05, |
|
"loss": 0.3489, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 3.796982167352538, |
|
"grad_norm": 0.10191643511945396, |
|
"learning_rate": 1.2383199177130135e-05, |
|
"loss": 0.3536, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 3.8189300411522633, |
|
"grad_norm": 0.09752716587838449, |
|
"learning_rate": 1.1936526400608938e-05, |
|
"loss": 0.3517, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 3.840877914951989, |
|
"grad_norm": 0.09567252814321132, |
|
"learning_rate": 1.1496641434099725e-05, |
|
"loss": 0.3499, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.8628257887517146, |
|
"grad_norm": 0.10384979007917372, |
|
"learning_rate": 1.1063650674110011e-05, |
|
"loss": 0.3562, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 3.8847736625514404, |
|
"grad_norm": 0.09268596571493516, |
|
"learning_rate": 1.0637658849621593e-05, |
|
"loss": 0.3521, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 3.9067215363511663, |
|
"grad_norm": 0.08906274897668208, |
|
"learning_rate": 1.0218768996759399e-05, |
|
"loss": 0.3528, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 3.9286694101508917, |
|
"grad_norm": 0.09311271485149279, |
|
"learning_rate": 9.807082433869727e-06, |
|
"loss": 0.359, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 3.950617283950617, |
|
"grad_norm": 0.09966102704447338, |
|
"learning_rate": 9.402698737014098e-06, |
|
"loss": 0.3643, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.972565157750343, |
|
"grad_norm": 0.08428095680890077, |
|
"learning_rate": 9.005715715884409e-06, |
|
"loss": 0.3564, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 3.9945130315500688, |
|
"grad_norm": 0.098824142446022, |
|
"learning_rate": 8.616229390145361e-06, |
|
"loss": 0.3561, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 4.065843621399177, |
|
"grad_norm": 0.13982352922009822, |
|
"learning_rate": 8.23433396620986e-06, |
|
"loss": 0.341, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 4.0877914951989025, |
|
"grad_norm": 0.11357790917439833, |
|
"learning_rate": 7.86012181445297e-06, |
|
"loss": 0.3427, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 4.109739368998628, |
|
"grad_norm": 0.09508337442980856, |
|
"learning_rate": 7.4936834468699945e-06, |
|
"loss": 0.337, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 4.131687242798354, |
|
"grad_norm": 0.10345088268864072, |
|
"learning_rate": 7.135107495183975e-06, |
|
"loss": 0.3345, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 4.153635116598079, |
|
"grad_norm": 0.1048793029567903, |
|
"learning_rate": 6.784480689408099e-06, |
|
"loss": 0.3297, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 4.175582990397805, |
|
"grad_norm": 0.10481831928351991, |
|
"learning_rate": 6.441887836867962e-06, |
|
"loss": 0.3414, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 4.197530864197531, |
|
"grad_norm": 0.11340261438354692, |
|
"learning_rate": 6.107411801688905e-06, |
|
"loss": 0.3387, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 4.219478737997257, |
|
"grad_norm": 0.10558490986119158, |
|
"learning_rate": 5.781133484753451e-06, |
|
"loss": 0.3386, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.2414266117969825, |
|
"grad_norm": 0.09563394121242606, |
|
"learning_rate": 5.463131804133461e-06, |
|
"loss": 0.3392, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 4.2633744855967075, |
|
"grad_norm": 0.097884687531601, |
|
"learning_rate": 5.1534836760019824e-06, |
|
"loss": 0.3406, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 4.285322359396433, |
|
"grad_norm": 0.09586101042431322, |
|
"learning_rate": 4.852263996029259e-06, |
|
"loss": 0.3347, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 4.307270233196159, |
|
"grad_norm": 0.09449695957524798, |
|
"learning_rate": 4.559545621267414e-06, |
|
"loss": 0.3316, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 4.329218106995885, |
|
"grad_norm": 0.09218455432645911, |
|
"learning_rate": 4.275399352528342e-06, |
|
"loss": 0.3382, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 4.35116598079561, |
|
"grad_norm": 0.09122205514695, |
|
"learning_rate": 3.999893917258799e-06, |
|
"loss": 0.3421, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 4.373113854595336, |
|
"grad_norm": 0.08884394525292823, |
|
"learning_rate": 3.733095952917101e-06, |
|
"loss": 0.3359, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 4.395061728395062, |
|
"grad_norm": 0.08788073543014369, |
|
"learning_rate": 3.4750699908552464e-06, |
|
"loss": 0.3348, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 4.4170096021947876, |
|
"grad_norm": 0.0851504561450446, |
|
"learning_rate": 3.225878440710544e-06, |
|
"loss": 0.338, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 4.438957475994513, |
|
"grad_norm": 0.08806355910065146, |
|
"learning_rate": 2.9855815753103436e-06, |
|
"loss": 0.3408, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.460905349794238, |
|
"grad_norm": 0.08443831027497156, |
|
"learning_rate": 2.754237516093623e-06, |
|
"loss": 0.3336, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 4.482853223593964, |
|
"grad_norm": 0.0846796048507417, |
|
"learning_rate": 2.5319022190529997e-06, |
|
"loss": 0.3347, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 4.50480109739369, |
|
"grad_norm": 0.08433609787768302, |
|
"learning_rate": 2.3186294612004365e-06, |
|
"loss": 0.3359, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 4.526748971193416, |
|
"grad_norm": 0.08573881642215767, |
|
"learning_rate": 2.1144708275599955e-06, |
|
"loss": 0.3411, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 4.548696844993142, |
|
"grad_norm": 0.08265953354626032, |
|
"learning_rate": 1.9194756986908025e-06, |
|
"loss": 0.3351, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 4.570644718792867, |
|
"grad_norm": 0.07968266677719599, |
|
"learning_rate": 1.7336912387432115e-06, |
|
"loss": 0.3329, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 4.592592592592593, |
|
"grad_norm": 0.07672083627731244, |
|
"learning_rate": 1.5571623840510185e-06, |
|
"loss": 0.3344, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 4.614540466392318, |
|
"grad_norm": 0.07767236853772148, |
|
"learning_rate": 1.3899318322625744e-06, |
|
"loss": 0.3353, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 4.636488340192044, |
|
"grad_norm": 0.079617742448113, |
|
"learning_rate": 1.2320400320133551e-06, |
|
"loss": 0.3351, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 4.658436213991769, |
|
"grad_norm": 0.07817205047145995, |
|
"learning_rate": 1.0835251731425013e-06, |
|
"loss": 0.3389, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.680384087791495, |
|
"grad_norm": 0.079996413423676, |
|
"learning_rate": 9.444231774557199e-07, |
|
"loss": 0.3407, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 4.702331961591221, |
|
"grad_norm": 0.07582115660185369, |
|
"learning_rate": 8.147676900367308e-07, |
|
"loss": 0.3412, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 4.724279835390947, |
|
"grad_norm": 0.07655208386535386, |
|
"learning_rate": 6.945900711094534e-07, |
|
"loss": 0.3308, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 4.746227709190672, |
|
"grad_norm": 0.07575601990026438, |
|
"learning_rate": 5.839193884527472e-07, |
|
"loss": 0.3326, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 4.768175582990398, |
|
"grad_norm": 0.07445403870736927, |
|
"learning_rate": 4.827824103697332e-07, |
|
"loss": 0.3312, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 4.790123456790123, |
|
"grad_norm": 0.07565335743085141, |
|
"learning_rate": 3.912035992132257e-07, |
|
"loss": 0.3396, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 4.812071330589849, |
|
"grad_norm": 0.07504738171668904, |
|
"learning_rate": 3.0920510546894156e-07, |
|
"loss": 0.3279, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 4.834019204389575, |
|
"grad_norm": 0.07495711123910379, |
|
"learning_rate": 2.3680676239789647e-07, |
|
"loss": 0.3373, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 4.8559670781893, |
|
"grad_norm": 0.07581516860262182, |
|
"learning_rate": 1.740260812392558e-07, |
|
"loss": 0.3377, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 4.877914951989026, |
|
"grad_norm": 0.07440451827362232, |
|
"learning_rate": 1.208782469748293e-07, |
|
"loss": 0.3375, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.899862825788752, |
|
"grad_norm": 0.07405892919582897, |
|
"learning_rate": 7.737611465622686e-08, |
|
"loss": 0.3314, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 4.921810699588478, |
|
"grad_norm": 0.07208525046101905, |
|
"learning_rate": 4.353020629556781e-08, |
|
"loss": 0.334, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 4.9437585733882035, |
|
"grad_norm": 0.07250510923845269, |
|
"learning_rate": 1.934870832047686e-08, |
|
"loss": 0.3358, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 4.965706447187928, |
|
"grad_norm": 0.07367572989174814, |
|
"learning_rate": 4.837469594018984e-09, |
|
"loss": 0.3436, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 4.987654320987654, |
|
"grad_norm": 0.07408156451999265, |
|
"learning_rate": 0.0, |
|
"loss": 0.337, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 4.987654320987654, |
|
"step": 225, |
|
"total_flos": 5.796779967165497e+18, |
|
"train_loss": 0.06433244016435412, |
|
"train_runtime": 4471.4422, |
|
"train_samples_per_second": 26.061, |
|
"train_steps_per_second": 0.05 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 225, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.796779967165497e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|