|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.991735537190083, |
|
"eval_steps": 500, |
|
"global_step": 715, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006959547629404089, |
|
"grad_norm": 6.299945556626471, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.8825, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013919095258808177, |
|
"grad_norm": 6.257876699073014, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.8704, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.020878642888212267, |
|
"grad_norm": 6.160373581422307, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.8713, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.027838190517616355, |
|
"grad_norm": 5.78775826291232, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.8598, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.034797738147020446, |
|
"grad_norm": 4.434647119841161, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.8172, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.041757285776424534, |
|
"grad_norm": 2.3263779022698095, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.7532, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04871683340582862, |
|
"grad_norm": 4.051177439739189, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 0.7557, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05567638103523271, |
|
"grad_norm": 4.175202353129295, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.7655, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0626359286646368, |
|
"grad_norm": 3.8871866859374617, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7274, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06959547629404089, |
|
"grad_norm": 4.009248099328964, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 0.6947, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07655502392344497, |
|
"grad_norm": 3.2381642347145796, |
|
"learning_rate": 1.2222222222222224e-05, |
|
"loss": 0.6795, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.08351457155284907, |
|
"grad_norm": 2.0504476986085827, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.6602, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09047411918225315, |
|
"grad_norm": 2.384280645275452, |
|
"learning_rate": 1.4444444444444446e-05, |
|
"loss": 0.6394, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09743366681165724, |
|
"grad_norm": 2.419324966834746, |
|
"learning_rate": 1.555555555555556e-05, |
|
"loss": 0.6282, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.10439321444106132, |
|
"grad_norm": 1.4468314839239673, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.6092, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11135276207046542, |
|
"grad_norm": 1.1678600409520985, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 0.5894, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11831230969986951, |
|
"grad_norm": 1.264682985827519, |
|
"learning_rate": 1.888888888888889e-05, |
|
"loss": 0.5968, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1252718573292736, |
|
"grad_norm": 0.6754725868857698, |
|
"learning_rate": 2e-05, |
|
"loss": 0.576, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1322314049586777, |
|
"grad_norm": 0.9074270406226251, |
|
"learning_rate": 2.1111111111111114e-05, |
|
"loss": 0.5698, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.13919095258808178, |
|
"grad_norm": 0.7979533790293932, |
|
"learning_rate": 2.2222222222222227e-05, |
|
"loss": 0.5588, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14615050021748585, |
|
"grad_norm": 0.6581363594356142, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 0.5606, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.15311004784688995, |
|
"grad_norm": 0.6691967855765695, |
|
"learning_rate": 2.444444444444445e-05, |
|
"loss": 0.5481, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.16006959547629404, |
|
"grad_norm": 0.542444666355929, |
|
"learning_rate": 2.5555555555555554e-05, |
|
"loss": 0.5539, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.16702914310569814, |
|
"grad_norm": 0.6416430096126567, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.5501, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.17398869073510223, |
|
"grad_norm": 0.5329567057613817, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.5342, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1809482383645063, |
|
"grad_norm": 0.6011450434974139, |
|
"learning_rate": 2.888888888888889e-05, |
|
"loss": 0.5348, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1879077859939104, |
|
"grad_norm": 0.4976703306853586, |
|
"learning_rate": 3.0000000000000004e-05, |
|
"loss": 0.5322, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1948673336233145, |
|
"grad_norm": 0.5730627506660203, |
|
"learning_rate": 3.111111111111112e-05, |
|
"loss": 0.5213, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.20182688125271858, |
|
"grad_norm": 0.7301409032698557, |
|
"learning_rate": 3.222222222222223e-05, |
|
"loss": 0.5206, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.20878642888212265, |
|
"grad_norm": 1.4025503659857947, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.5322, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.21574597651152674, |
|
"grad_norm": 0.8305463760946818, |
|
"learning_rate": 3.444444444444445e-05, |
|
"loss": 0.5176, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.22270552414093084, |
|
"grad_norm": 0.8468215550610021, |
|
"learning_rate": 3.555555555555555e-05, |
|
"loss": 0.5187, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.22966507177033493, |
|
"grad_norm": 0.8897899781711042, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 0.5206, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.23662461939973903, |
|
"grad_norm": 1.1213311013190945, |
|
"learning_rate": 3.777777777777778e-05, |
|
"loss": 0.5035, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2435841670291431, |
|
"grad_norm": 1.1010420447489897, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.5072, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2505437146585472, |
|
"grad_norm": 0.773476718518657, |
|
"learning_rate": 4e-05, |
|
"loss": 0.5077, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2575032622879513, |
|
"grad_norm": 1.2400452716206256, |
|
"learning_rate": 4.111111111111111e-05, |
|
"loss": 0.5033, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2644628099173554, |
|
"grad_norm": 0.9153450607625541, |
|
"learning_rate": 4.222222222222223e-05, |
|
"loss": 0.5, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2714223575467595, |
|
"grad_norm": 0.6514251195810624, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 0.4947, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.27838190517616357, |
|
"grad_norm": 1.0042919223967974, |
|
"learning_rate": 4.444444444444445e-05, |
|
"loss": 0.5042, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.28534145280556766, |
|
"grad_norm": 1.0797096325295303, |
|
"learning_rate": 4.555555555555556e-05, |
|
"loss": 0.4916, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2923010004349717, |
|
"grad_norm": 0.9905878602993525, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 0.496, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2992605480643758, |
|
"grad_norm": 0.8676969083941743, |
|
"learning_rate": 4.777777777777778e-05, |
|
"loss": 0.4885, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3062200956937799, |
|
"grad_norm": 0.7260235825278305, |
|
"learning_rate": 4.88888888888889e-05, |
|
"loss": 0.5005, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.313179643323184, |
|
"grad_norm": 0.9223906328687149, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4892, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3201391909525881, |
|
"grad_norm": 1.3832086370219072, |
|
"learning_rate": 5.111111111111111e-05, |
|
"loss": 0.4997, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3270987385819922, |
|
"grad_norm": 0.8419161235221501, |
|
"learning_rate": 5.222222222222223e-05, |
|
"loss": 0.4938, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.33405828621139627, |
|
"grad_norm": 1.6195340381200483, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 0.4968, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.34101783384080037, |
|
"grad_norm": 0.9275367275665349, |
|
"learning_rate": 5.444444444444445e-05, |
|
"loss": 0.5006, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.34797738147020446, |
|
"grad_norm": 1.7543919560909402, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 0.5007, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3549369290996085, |
|
"grad_norm": 1.26313567421099, |
|
"learning_rate": 5.666666666666668e-05, |
|
"loss": 0.499, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3618964767290126, |
|
"grad_norm": 1.8899821239654098, |
|
"learning_rate": 5.777777777777778e-05, |
|
"loss": 0.4868, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3688560243584167, |
|
"grad_norm": 1.4836204192299145, |
|
"learning_rate": 5.8888888888888896e-05, |
|
"loss": 0.4924, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3758155719878208, |
|
"grad_norm": 1.5371932375940351, |
|
"learning_rate": 6.000000000000001e-05, |
|
"loss": 0.4853, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3827751196172249, |
|
"grad_norm": 1.066878304885815, |
|
"learning_rate": 6.111111111111111e-05, |
|
"loss": 0.4823, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.389734667246629, |
|
"grad_norm": 1.234430905173555, |
|
"learning_rate": 6.222222222222223e-05, |
|
"loss": 0.4848, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.39669421487603307, |
|
"grad_norm": 1.0923409666404706, |
|
"learning_rate": 6.333333333333333e-05, |
|
"loss": 0.494, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.40365376250543716, |
|
"grad_norm": 0.9800617899091041, |
|
"learning_rate": 6.444444444444446e-05, |
|
"loss": 0.4825, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.41061331013484126, |
|
"grad_norm": 0.9212766482645198, |
|
"learning_rate": 6.555555555555556e-05, |
|
"loss": 0.4691, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.4175728577642453, |
|
"grad_norm": 1.167155227826628, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.4815, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4245324053936494, |
|
"grad_norm": 1.5791917226157102, |
|
"learning_rate": 6.777777777777778e-05, |
|
"loss": 0.4943, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4314919530230535, |
|
"grad_norm": 1.0997180103791135, |
|
"learning_rate": 6.88888888888889e-05, |
|
"loss": 0.4871, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4384515006524576, |
|
"grad_norm": 1.2130023175807059, |
|
"learning_rate": 7.000000000000001e-05, |
|
"loss": 0.4855, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4454110482818617, |
|
"grad_norm": 1.6270954136094906, |
|
"learning_rate": 7.11111111111111e-05, |
|
"loss": 0.4877, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.45237059591126577, |
|
"grad_norm": 1.1304632174516827, |
|
"learning_rate": 7.222222222222223e-05, |
|
"loss": 0.4795, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.45933014354066987, |
|
"grad_norm": 1.32786525260077, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 0.4815, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.46628969117007396, |
|
"grad_norm": 0.6938586247846547, |
|
"learning_rate": 7.444444444444446e-05, |
|
"loss": 0.4711, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.47324923879947806, |
|
"grad_norm": 1.3238232457797845, |
|
"learning_rate": 7.555555555555556e-05, |
|
"loss": 0.4823, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4802087864288821, |
|
"grad_norm": 0.7731792383221893, |
|
"learning_rate": 7.666666666666668e-05, |
|
"loss": 0.4769, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4871683340582862, |
|
"grad_norm": 0.714435674612326, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 0.47, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4941278816876903, |
|
"grad_norm": 0.727161798048739, |
|
"learning_rate": 7.88888888888889e-05, |
|
"loss": 0.4748, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5010874293170944, |
|
"grad_norm": 0.7822239107856425, |
|
"learning_rate": 8e-05, |
|
"loss": 0.4735, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5080469769464985, |
|
"grad_norm": 0.9159063781364695, |
|
"learning_rate": 7.999952257304926e-05, |
|
"loss": 0.4585, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5150065245759026, |
|
"grad_norm": 1.4014617788300159, |
|
"learning_rate": 7.99980903035939e-05, |
|
"loss": 0.4817, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5219660722053067, |
|
"grad_norm": 0.9697910698942601, |
|
"learning_rate": 7.999570322582408e-05, |
|
"loss": 0.4719, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5289256198347108, |
|
"grad_norm": 1.2780959714818068, |
|
"learning_rate": 7.99923613967226e-05, |
|
"loss": 0.4744, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5358851674641149, |
|
"grad_norm": 0.9675381526583897, |
|
"learning_rate": 7.99880648960634e-05, |
|
"loss": 0.4704, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.542844715093519, |
|
"grad_norm": 1.047833737067459, |
|
"learning_rate": 7.998281382640975e-05, |
|
"loss": 0.4654, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.549804262722923, |
|
"grad_norm": 1.2845937442452149, |
|
"learning_rate": 7.997660831311176e-05, |
|
"loss": 0.475, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5567638103523271, |
|
"grad_norm": 0.8772171829670746, |
|
"learning_rate": 7.996944850430339e-05, |
|
"loss": 0.4656, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5637233579817312, |
|
"grad_norm": 0.741967780268622, |
|
"learning_rate": 7.996133457089894e-05, |
|
"loss": 0.4575, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5706829056111353, |
|
"grad_norm": 0.8708734610216243, |
|
"learning_rate": 7.99522667065889e-05, |
|
"loss": 0.4673, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5776424532405393, |
|
"grad_norm": 0.9611160209126256, |
|
"learning_rate": 7.994224512783544e-05, |
|
"loss": 0.4644, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5846020008699434, |
|
"grad_norm": 1.2059285045807202, |
|
"learning_rate": 7.993127007386715e-05, |
|
"loss": 0.4782, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5915615484993475, |
|
"grad_norm": 1.0796995800628297, |
|
"learning_rate": 7.991934180667333e-05, |
|
"loss": 0.4642, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5985210961287516, |
|
"grad_norm": 1.0316521924490913, |
|
"learning_rate": 7.990646061099782e-05, |
|
"loss": 0.4646, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6054806437581557, |
|
"grad_norm": 0.8832150277973638, |
|
"learning_rate": 7.989262679433211e-05, |
|
"loss": 0.4626, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6124401913875598, |
|
"grad_norm": 0.7634910217249218, |
|
"learning_rate": 7.987784068690804e-05, |
|
"loss": 0.4626, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.6193997390169639, |
|
"grad_norm": 1.1086418661133017, |
|
"learning_rate": 7.986210264168991e-05, |
|
"loss": 0.4521, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.626359286646368, |
|
"grad_norm": 0.6778528235443292, |
|
"learning_rate": 7.98454130343661e-05, |
|
"loss": 0.4606, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6333188342757721, |
|
"grad_norm": 0.7098255147206154, |
|
"learning_rate": 7.982777226334e-05, |
|
"loss": 0.4546, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6402783819051762, |
|
"grad_norm": 0.7512375219693761, |
|
"learning_rate": 7.980918074972059e-05, |
|
"loss": 0.4526, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6472379295345803, |
|
"grad_norm": 0.4955536043933238, |
|
"learning_rate": 7.978963893731235e-05, |
|
"loss": 0.4514, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6541974771639844, |
|
"grad_norm": 0.6854584128464718, |
|
"learning_rate": 7.976914729260468e-05, |
|
"loss": 0.4656, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6611570247933884, |
|
"grad_norm": 0.6020857806767794, |
|
"learning_rate": 7.974770630476077e-05, |
|
"loss": 0.4539, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6681165724227925, |
|
"grad_norm": 0.5198959190719997, |
|
"learning_rate": 7.972531648560587e-05, |
|
"loss": 0.4522, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6750761200521966, |
|
"grad_norm": 0.8318026218834386, |
|
"learning_rate": 7.970197836961513e-05, |
|
"loss": 0.4623, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6820356676816007, |
|
"grad_norm": 0.9109802442285713, |
|
"learning_rate": 7.967769251390083e-05, |
|
"loss": 0.4559, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6889952153110048, |
|
"grad_norm": 1.25243965937425, |
|
"learning_rate": 7.96524594981991e-05, |
|
"loss": 0.4626, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6959547629404089, |
|
"grad_norm": 0.868516955234305, |
|
"learning_rate": 7.9626279924856e-05, |
|
"loss": 0.4569, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.702914310569813, |
|
"grad_norm": 0.48520097270425405, |
|
"learning_rate": 7.959915441881322e-05, |
|
"loss": 0.4515, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.709873858199217, |
|
"grad_norm": 0.5480517510335293, |
|
"learning_rate": 7.957108362759316e-05, |
|
"loss": 0.4544, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7168334058286211, |
|
"grad_norm": 0.8911184240263139, |
|
"learning_rate": 7.954206822128343e-05, |
|
"loss": 0.4635, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7237929534580252, |
|
"grad_norm": 0.8227526938281489, |
|
"learning_rate": 7.951210889252088e-05, |
|
"loss": 0.465, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7307525010874293, |
|
"grad_norm": 0.5558210684070918, |
|
"learning_rate": 7.948120635647503e-05, |
|
"loss": 0.4487, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7377120487168334, |
|
"grad_norm": 0.6355162909760532, |
|
"learning_rate": 7.944936135083108e-05, |
|
"loss": 0.4523, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7446715963462375, |
|
"grad_norm": 0.6105345680130448, |
|
"learning_rate": 7.941657463577225e-05, |
|
"loss": 0.4575, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.7516311439756416, |
|
"grad_norm": 0.5678069745935661, |
|
"learning_rate": 7.938284699396157e-05, |
|
"loss": 0.4498, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7585906916050457, |
|
"grad_norm": 0.5483024912339128, |
|
"learning_rate": 7.934817923052331e-05, |
|
"loss": 0.4549, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7655502392344498, |
|
"grad_norm": 0.3929806004224007, |
|
"learning_rate": 7.931257217302371e-05, |
|
"loss": 0.4504, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7725097868638539, |
|
"grad_norm": 0.5681787692060095, |
|
"learning_rate": 7.927602667145121e-05, |
|
"loss": 0.4477, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.779469334493258, |
|
"grad_norm": 0.556711524840673, |
|
"learning_rate": 7.923854359819619e-05, |
|
"loss": 0.4484, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.786428882122662, |
|
"grad_norm": 0.4138699309021785, |
|
"learning_rate": 7.92001238480301e-05, |
|
"loss": 0.447, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7933884297520661, |
|
"grad_norm": 0.6357342110964699, |
|
"learning_rate": 7.916076833808414e-05, |
|
"loss": 0.4513, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8003479773814702, |
|
"grad_norm": 0.8584704922958183, |
|
"learning_rate": 7.91204780078274e-05, |
|
"loss": 0.4427, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8073075250108743, |
|
"grad_norm": 0.9871565259991888, |
|
"learning_rate": 7.907925381904432e-05, |
|
"loss": 0.4554, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8142670726402784, |
|
"grad_norm": 1.0217097625637481, |
|
"learning_rate": 7.903709675581185e-05, |
|
"loss": 0.453, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8212266202696825, |
|
"grad_norm": 0.7895770598500398, |
|
"learning_rate": 7.899400782447591e-05, |
|
"loss": 0.4541, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8281861678990866, |
|
"grad_norm": 0.5874040536712771, |
|
"learning_rate": 7.894998805362737e-05, |
|
"loss": 0.4423, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.8351457155284906, |
|
"grad_norm": 0.6690541560849889, |
|
"learning_rate": 7.890503849407742e-05, |
|
"loss": 0.4519, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.6865319922768905, |
|
"learning_rate": 7.885916021883268e-05, |
|
"loss": 0.4455, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.8490648107872988, |
|
"grad_norm": 0.5200700598023363, |
|
"learning_rate": 7.881235432306936e-05, |
|
"loss": 0.4407, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.8560243584167029, |
|
"grad_norm": 0.4344010301700919, |
|
"learning_rate": 7.876462192410727e-05, |
|
"loss": 0.4465, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.862983906046107, |
|
"grad_norm": 0.6677648559236202, |
|
"learning_rate": 7.871596416138312e-05, |
|
"loss": 0.4497, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8699434536755111, |
|
"grad_norm": 0.4959133192072294, |
|
"learning_rate": 7.866638219642324e-05, |
|
"loss": 0.4412, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8769030013049152, |
|
"grad_norm": 0.3302086455708142, |
|
"learning_rate": 7.861587721281598e-05, |
|
"loss": 0.4341, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8838625489343193, |
|
"grad_norm": 0.4127575574993749, |
|
"learning_rate": 7.856445041618333e-05, |
|
"loss": 0.4403, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8908220965637234, |
|
"grad_norm": 0.4079306482235732, |
|
"learning_rate": 7.851210303415225e-05, |
|
"loss": 0.45, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8977816441931274, |
|
"grad_norm": 0.3629235803836879, |
|
"learning_rate": 7.845883631632527e-05, |
|
"loss": 0.4371, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.9047411918225315, |
|
"grad_norm": 0.3281071553183999, |
|
"learning_rate": 7.840465153425074e-05, |
|
"loss": 0.4342, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9117007394519356, |
|
"grad_norm": 0.3939672640729078, |
|
"learning_rate": 7.83495499813924e-05, |
|
"loss": 0.4393, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9186602870813397, |
|
"grad_norm": 0.4669777553758287, |
|
"learning_rate": 7.829353297309857e-05, |
|
"loss": 0.4378, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.9256198347107438, |
|
"grad_norm": 0.5191309992385434, |
|
"learning_rate": 7.823660184657075e-05, |
|
"loss": 0.4419, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.9325793823401479, |
|
"grad_norm": 0.4371861252468821, |
|
"learning_rate": 7.817875796083164e-05, |
|
"loss": 0.4442, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.939538929969552, |
|
"grad_norm": 0.6968595662983466, |
|
"learning_rate": 7.812000269669271e-05, |
|
"loss": 0.4448, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9464984775989561, |
|
"grad_norm": 0.9915151230783144, |
|
"learning_rate": 7.806033745672132e-05, |
|
"loss": 0.4459, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.9534580252283602, |
|
"grad_norm": 1.1614516148599872, |
|
"learning_rate": 7.799976366520714e-05, |
|
"loss": 0.4458, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.9604175728577642, |
|
"grad_norm": 0.7103915477953243, |
|
"learning_rate": 7.793828276812819e-05, |
|
"loss": 0.4413, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.9673771204871683, |
|
"grad_norm": 0.6607811748107033, |
|
"learning_rate": 7.787589623311635e-05, |
|
"loss": 0.4374, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9743366681165724, |
|
"grad_norm": 0.669847981333871, |
|
"learning_rate": 7.781260554942226e-05, |
|
"loss": 0.4452, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9812962157459765, |
|
"grad_norm": 0.47755885581992924, |
|
"learning_rate": 7.774841222787983e-05, |
|
"loss": 0.4439, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9882557633753806, |
|
"grad_norm": 0.4723668370103968, |
|
"learning_rate": 7.768331780087017e-05, |
|
"loss": 0.4462, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9952153110047847, |
|
"grad_norm": 0.5073519260421897, |
|
"learning_rate": 7.761732382228494e-05, |
|
"loss": 0.4406, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.0060896041757286, |
|
"grad_norm": 0.4347621708894711, |
|
"learning_rate": 7.755043186748936e-05, |
|
"loss": 0.4218, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0130491518051328, |
|
"grad_norm": 0.4797475573338107, |
|
"learning_rate": 7.748264353328451e-05, |
|
"loss": 0.4078, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0200086994345368, |
|
"grad_norm": 0.5279297958588075, |
|
"learning_rate": 7.741396043786929e-05, |
|
"loss": 0.4191, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.0269682470639407, |
|
"grad_norm": 0.5909819052892469, |
|
"learning_rate": 7.734438422080174e-05, |
|
"loss": 0.4168, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.033927794693345, |
|
"grad_norm": 0.6495659004492463, |
|
"learning_rate": 7.727391654295991e-05, |
|
"loss": 0.4194, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.040887342322749, |
|
"grad_norm": 0.587326465562309, |
|
"learning_rate": 7.720255908650222e-05, |
|
"loss": 0.4212, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.0478468899521531, |
|
"grad_norm": 0.45607673329416626, |
|
"learning_rate": 7.713031355482734e-05, |
|
"loss": 0.4074, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0548064375815571, |
|
"grad_norm": 0.3531037620398402, |
|
"learning_rate": 7.705718167253345e-05, |
|
"loss": 0.4136, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.0617659852109613, |
|
"grad_norm": 0.4297476754808921, |
|
"learning_rate": 7.698316518537713e-05, |
|
"loss": 0.417, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.0687255328403653, |
|
"grad_norm": 0.6072156953171759, |
|
"learning_rate": 7.690826586023165e-05, |
|
"loss": 0.4163, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.0756850804697695, |
|
"grad_norm": 0.7760172331652697, |
|
"learning_rate": 7.683248548504486e-05, |
|
"loss": 0.4159, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0826446280991735, |
|
"grad_norm": 0.876708332241165, |
|
"learning_rate": 7.675582586879641e-05, |
|
"loss": 0.4192, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0896041757285777, |
|
"grad_norm": 0.9107175938318952, |
|
"learning_rate": 7.667828884145465e-05, |
|
"loss": 0.4145, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.0965637233579817, |
|
"grad_norm": 0.8653660027489845, |
|
"learning_rate": 7.65998762539329e-05, |
|
"loss": 0.4218, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.103523270987386, |
|
"grad_norm": 0.7850041910149821, |
|
"learning_rate": 7.652058997804532e-05, |
|
"loss": 0.4192, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.1104828186167899, |
|
"grad_norm": 0.5313734478672961, |
|
"learning_rate": 7.644043190646211e-05, |
|
"loss": 0.4118, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.117442366246194, |
|
"grad_norm": 0.5056894621317538, |
|
"learning_rate": 7.63594039526645e-05, |
|
"loss": 0.4143, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.124401913875598, |
|
"grad_norm": 0.766156288599041, |
|
"learning_rate": 7.627750805089888e-05, |
|
"loss": 0.4202, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.1313614615050023, |
|
"grad_norm": 0.7173262253497903, |
|
"learning_rate": 7.619474615613083e-05, |
|
"loss": 0.4085, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.1383210091344063, |
|
"grad_norm": 0.3801147393208917, |
|
"learning_rate": 7.611112024399829e-05, |
|
"loss": 0.4098, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.1452805567638102, |
|
"grad_norm": 0.459213719396466, |
|
"learning_rate": 7.602663231076445e-05, |
|
"loss": 0.4215, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.1522401043932144, |
|
"grad_norm": 0.5419991575798397, |
|
"learning_rate": 7.594128437327017e-05, |
|
"loss": 0.4154, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.1591996520226187, |
|
"grad_norm": 0.4710423907727317, |
|
"learning_rate": 7.58550784688857e-05, |
|
"loss": 0.4102, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.1661591996520226, |
|
"grad_norm": 0.3269293538096053, |
|
"learning_rate": 7.576801665546214e-05, |
|
"loss": 0.4183, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.1731187472814266, |
|
"grad_norm": 0.354719558769914, |
|
"learning_rate": 7.568010101128229e-05, |
|
"loss": 0.4083, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.1800782949108308, |
|
"grad_norm": 0.41454295254538065, |
|
"learning_rate": 7.559133363501107e-05, |
|
"loss": 0.4073, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.1870378425402348, |
|
"grad_norm": 0.41584140556124005, |
|
"learning_rate": 7.550171664564537e-05, |
|
"loss": 0.4184, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.193997390169639, |
|
"grad_norm": 0.42301367871950313, |
|
"learning_rate": 7.541125218246346e-05, |
|
"loss": 0.4129, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.200956937799043, |
|
"grad_norm": 0.28966866299551014, |
|
"learning_rate": 7.531994240497399e-05, |
|
"loss": 0.4078, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.2079164854284472, |
|
"grad_norm": 0.27574788938129696, |
|
"learning_rate": 7.52277894928644e-05, |
|
"loss": 0.4122, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.2148760330578512, |
|
"grad_norm": 0.3134277334731802, |
|
"learning_rate": 7.513479564594888e-05, |
|
"loss": 0.4105, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.2218355806872554, |
|
"grad_norm": 0.31141216371808733, |
|
"learning_rate": 7.504096308411587e-05, |
|
"loss": 0.4101, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2287951283166594, |
|
"grad_norm": 0.309174676739755, |
|
"learning_rate": 7.494629404727506e-05, |
|
"loss": 0.4099, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.2357546759460636, |
|
"grad_norm": 0.3804013842457314, |
|
"learning_rate": 7.485079079530393e-05, |
|
"loss": 0.4065, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.2427142235754676, |
|
"grad_norm": 0.4490200434000277, |
|
"learning_rate": 7.47544556079938e-05, |
|
"loss": 0.4178, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.2496737712048718, |
|
"grad_norm": 0.5056109077740427, |
|
"learning_rate": 7.465729078499541e-05, |
|
"loss": 0.4175, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.2566333188342758, |
|
"grad_norm": 0.5728814822805163, |
|
"learning_rate": 7.455929864576402e-05, |
|
"loss": 0.4003, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.26359286646368, |
|
"grad_norm": 0.5364301922656747, |
|
"learning_rate": 7.4460481529504e-05, |
|
"loss": 0.4126, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.270552414093084, |
|
"grad_norm": 0.48400372508161793, |
|
"learning_rate": 7.436084179511315e-05, |
|
"loss": 0.4111, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.277511961722488, |
|
"grad_norm": 0.48691704971428823, |
|
"learning_rate": 7.426038182112613e-05, |
|
"loss": 0.4192, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.2844715093518921, |
|
"grad_norm": 0.4326376870735781, |
|
"learning_rate": 7.415910400565795e-05, |
|
"loss": 0.4071, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.2914310569812963, |
|
"grad_norm": 0.3854589354965239, |
|
"learning_rate": 7.405701076634649e-05, |
|
"loss": 0.4132, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2983906046107003, |
|
"grad_norm": 0.3589751690112129, |
|
"learning_rate": 7.395410454029498e-05, |
|
"loss": 0.4141, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.3053501522401043, |
|
"grad_norm": 0.3798549380531171, |
|
"learning_rate": 7.385038778401367e-05, |
|
"loss": 0.4109, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.3123096998695085, |
|
"grad_norm": 0.5063041641106548, |
|
"learning_rate": 7.374586297336134e-05, |
|
"loss": 0.4121, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.3192692474989125, |
|
"grad_norm": 0.5499801181881762, |
|
"learning_rate": 7.364053260348603e-05, |
|
"loss": 0.4131, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.3262287951283167, |
|
"grad_norm": 0.538473535319422, |
|
"learning_rate": 7.353439918876565e-05, |
|
"loss": 0.4146, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.3331883427577207, |
|
"grad_norm": 0.5133206548316543, |
|
"learning_rate": 7.342746526274779e-05, |
|
"loss": 0.41, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.340147890387125, |
|
"grad_norm": 0.4474460367465246, |
|
"learning_rate": 7.331973337808937e-05, |
|
"loss": 0.4122, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.3471074380165289, |
|
"grad_norm": 0.44569022437909495, |
|
"learning_rate": 7.321120610649567e-05, |
|
"loss": 0.408, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.354066985645933, |
|
"grad_norm": 0.4721240876904583, |
|
"learning_rate": 7.310188603865888e-05, |
|
"loss": 0.4091, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.361026533275337, |
|
"grad_norm": 0.4670666827700711, |
|
"learning_rate": 7.299177578419634e-05, |
|
"loss": 0.4092, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.3679860809047413, |
|
"grad_norm": 0.3702037932288681, |
|
"learning_rate": 7.288087797158822e-05, |
|
"loss": 0.4097, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.3749456285341453, |
|
"grad_norm": 0.330387714721557, |
|
"learning_rate": 7.276919524811472e-05, |
|
"loss": 0.4104, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.3819051761635492, |
|
"grad_norm": 0.3875230383248599, |
|
"learning_rate": 7.265673027979295e-05, |
|
"loss": 0.4129, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.3888647237929534, |
|
"grad_norm": 0.3612694249655593, |
|
"learning_rate": 7.254348575131328e-05, |
|
"loss": 0.4106, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.3958242714223577, |
|
"grad_norm": 0.2763726751232723, |
|
"learning_rate": 7.242946436597518e-05, |
|
"loss": 0.4116, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4027838190517616, |
|
"grad_norm": 0.37611767937513824, |
|
"learning_rate": 7.231466884562275e-05, |
|
"loss": 0.4086, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.4097433666811656, |
|
"grad_norm": 0.5071472574190108, |
|
"learning_rate": 7.21991019305798e-05, |
|
"loss": 0.411, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.4167029143105698, |
|
"grad_norm": 0.4340052816154503, |
|
"learning_rate": 7.20827663795843e-05, |
|
"loss": 0.4079, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.423662461939974, |
|
"grad_norm": 0.3712380468999662, |
|
"learning_rate": 7.19656649697226e-05, |
|
"loss": 0.4056, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.430622009569378, |
|
"grad_norm": 0.3225866140544311, |
|
"learning_rate": 7.184780049636318e-05, |
|
"loss": 0.4062, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.437581557198782, |
|
"grad_norm": 0.3106099934478539, |
|
"learning_rate": 7.172917577308984e-05, |
|
"loss": 0.4062, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.4445411048281862, |
|
"grad_norm": 0.4355158685775427, |
|
"learning_rate": 7.160979363163456e-05, |
|
"loss": 0.4142, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.4515006524575902, |
|
"grad_norm": 0.4793783049280065, |
|
"learning_rate": 7.148965692180994e-05, |
|
"loss": 0.399, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.4584602000869944, |
|
"grad_norm": 0.3726750358936686, |
|
"learning_rate": 7.136876851144113e-05, |
|
"loss": 0.4132, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.4654197477163984, |
|
"grad_norm": 0.31513705424244864, |
|
"learning_rate": 7.124713128629739e-05, |
|
"loss": 0.4058, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4723792953458026, |
|
"grad_norm": 0.3159413572790412, |
|
"learning_rate": 7.11247481500232e-05, |
|
"loss": 0.4041, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.4793388429752066, |
|
"grad_norm": 0.33778666639216565, |
|
"learning_rate": 7.100162202406891e-05, |
|
"loss": 0.4147, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.4862983906046108, |
|
"grad_norm": 0.3268790661878625, |
|
"learning_rate": 7.08777558476211e-05, |
|
"loss": 0.4086, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.4932579382340148, |
|
"grad_norm": 0.32242776651704286, |
|
"learning_rate": 7.075315257753229e-05, |
|
"loss": 0.4148, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.500217485863419, |
|
"grad_norm": 0.31567118569033087, |
|
"learning_rate": 7.062781518825047e-05, |
|
"loss": 0.4137, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.507177033492823, |
|
"grad_norm": 0.3612184397000591, |
|
"learning_rate": 7.050174667174799e-05, |
|
"loss": 0.4097, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.514136581122227, |
|
"grad_norm": 0.4218662687852862, |
|
"learning_rate": 7.037495003745024e-05, |
|
"loss": 0.4084, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.5210961287516311, |
|
"grad_norm": 0.45144908695802316, |
|
"learning_rate": 7.024742831216374e-05, |
|
"loss": 0.4123, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.5280556763810353, |
|
"grad_norm": 0.4338699229225814, |
|
"learning_rate": 7.011918454000391e-05, |
|
"loss": 0.41, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.5350152240104393, |
|
"grad_norm": 0.4218475334427419, |
|
"learning_rate": 6.99902217823224e-05, |
|
"loss": 0.4099, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.5419747716398433, |
|
"grad_norm": 0.5384078834636578, |
|
"learning_rate": 6.986054311763402e-05, |
|
"loss": 0.4115, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.5489343192692475, |
|
"grad_norm": 0.5695524389564858, |
|
"learning_rate": 6.973015164154326e-05, |
|
"loss": 0.4057, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.5558938668986517, |
|
"grad_norm": 0.5296732996163314, |
|
"learning_rate": 6.959905046667035e-05, |
|
"loss": 0.4163, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.5628534145280557, |
|
"grad_norm": 0.5381554961428682, |
|
"learning_rate": 6.946724272257699e-05, |
|
"loss": 0.4125, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.5698129621574597, |
|
"grad_norm": 0.5541446269357019, |
|
"learning_rate": 6.933473155569165e-05, |
|
"loss": 0.4166, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.576772509786864, |
|
"grad_norm": 0.5249105733906559, |
|
"learning_rate": 6.920152012923446e-05, |
|
"loss": 0.4159, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.583732057416268, |
|
"grad_norm": 0.4702525223746907, |
|
"learning_rate": 6.906761162314165e-05, |
|
"loss": 0.4081, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.590691605045672, |
|
"grad_norm": 0.4018848611353379, |
|
"learning_rate": 6.893300923398974e-05, |
|
"loss": 0.4095, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.597651152675076, |
|
"grad_norm": 0.4387137463620228, |
|
"learning_rate": 6.879771617491912e-05, |
|
"loss": 0.4038, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.6046107003044803, |
|
"grad_norm": 0.5018137502153527, |
|
"learning_rate": 6.866173567555743e-05, |
|
"loss": 0.4007, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.6115702479338843, |
|
"grad_norm": 0.4243706106137811, |
|
"learning_rate": 6.852507098194242e-05, |
|
"loss": 0.4087, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.6185297955632882, |
|
"grad_norm": 0.3061005646641186, |
|
"learning_rate": 6.838772535644451e-05, |
|
"loss": 0.4062, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.6254893431926924, |
|
"grad_norm": 0.3094567455663344, |
|
"learning_rate": 6.824970207768882e-05, |
|
"loss": 0.4056, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.6324488908220967, |
|
"grad_norm": 0.27807602474077575, |
|
"learning_rate": 6.811100444047704e-05, |
|
"loss": 0.4026, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.6394084384515006, |
|
"grad_norm": 0.23900746560867425, |
|
"learning_rate": 6.797163575570866e-05, |
|
"loss": 0.4087, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.6463679860809046, |
|
"grad_norm": 0.29199115797045505, |
|
"learning_rate": 6.783159935030197e-05, |
|
"loss": 0.4027, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.6533275337103088, |
|
"grad_norm": 0.3184667675259614, |
|
"learning_rate": 6.76908985671147e-05, |
|
"loss": 0.4041, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.660287081339713, |
|
"grad_norm": 0.29291241249140837, |
|
"learning_rate": 6.754953676486415e-05, |
|
"loss": 0.4079, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.667246628969117, |
|
"grad_norm": 0.2840221245598121, |
|
"learning_rate": 6.740751731804699e-05, |
|
"loss": 0.4011, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.674206176598521, |
|
"grad_norm": 0.27939828055160454, |
|
"learning_rate": 6.726484361685882e-05, |
|
"loss": 0.4019, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.6811657242279252, |
|
"grad_norm": 0.256609914729704, |
|
"learning_rate": 6.712151906711314e-05, |
|
"loss": 0.4048, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.6881252718573294, |
|
"grad_norm": 0.23745107974586638, |
|
"learning_rate": 6.697754709016009e-05, |
|
"loss": 0.4058, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.6950848194867334, |
|
"grad_norm": 0.2482380115074257, |
|
"learning_rate": 6.683293112280475e-05, |
|
"loss": 0.3967, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.7020443671161374, |
|
"grad_norm": 0.28622361734552443, |
|
"learning_rate": 6.668767461722518e-05, |
|
"loss": 0.4061, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.7090039147455416, |
|
"grad_norm": 0.32770363568621996, |
|
"learning_rate": 6.654178104088987e-05, |
|
"loss": 0.4033, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.7159634623749458, |
|
"grad_norm": 0.3279702359191176, |
|
"learning_rate": 6.639525387647508e-05, |
|
"loss": 0.4059, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.7229230100043496, |
|
"grad_norm": 0.3773747339194689, |
|
"learning_rate": 6.62480966217817e-05, |
|
"loss": 0.407, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.7298825576337538, |
|
"grad_norm": 0.2998244869603879, |
|
"learning_rate": 6.610031278965168e-05, |
|
"loss": 0.4064, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.736842105263158, |
|
"grad_norm": 0.28510557083266236, |
|
"learning_rate": 6.595190590788424e-05, |
|
"loss": 0.4005, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.743801652892562, |
|
"grad_norm": 0.2932273577434916, |
|
"learning_rate": 6.580287951915163e-05, |
|
"loss": 0.4021, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.750761200521966, |
|
"grad_norm": 0.36298879202671214, |
|
"learning_rate": 6.565323718091459e-05, |
|
"loss": 0.4004, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.7577207481513701, |
|
"grad_norm": 0.3557701177490779, |
|
"learning_rate": 6.550298246533735e-05, |
|
"loss": 0.4071, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.7646802957807743, |
|
"grad_norm": 0.28338303699075845, |
|
"learning_rate": 6.535211895920247e-05, |
|
"loss": 0.4066, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.7716398434101783, |
|
"grad_norm": 0.24886657367644255, |
|
"learning_rate": 6.520065026382511e-05, |
|
"loss": 0.3955, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.7785993910395823, |
|
"grad_norm": 0.24062443339702383, |
|
"learning_rate": 6.504857999496718e-05, |
|
"loss": 0.406, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.7855589386689865, |
|
"grad_norm": 0.26443097823193684, |
|
"learning_rate": 6.489591178275087e-05, |
|
"loss": 0.4044, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.7925184862983907, |
|
"grad_norm": 0.3060220331852048, |
|
"learning_rate": 6.474264927157216e-05, |
|
"loss": 0.404, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.7994780339277947, |
|
"grad_norm": 0.32526179480394124, |
|
"learning_rate": 6.45887961200137e-05, |
|
"loss": 0.4009, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.8064375815571987, |
|
"grad_norm": 0.321679972746841, |
|
"learning_rate": 6.443435600075757e-05, |
|
"loss": 0.4056, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.813397129186603, |
|
"grad_norm": 0.3488328000864295, |
|
"learning_rate": 6.42793326004975e-05, |
|
"loss": 0.4049, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.820356676816007, |
|
"grad_norm": 0.34796885320723775, |
|
"learning_rate": 6.412372961985097e-05, |
|
"loss": 0.4048, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.827316224445411, |
|
"grad_norm": 0.36888125998311183, |
|
"learning_rate": 6.396755077327081e-05, |
|
"loss": 0.4132, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.834275772074815, |
|
"grad_norm": 0.42541614404670686, |
|
"learning_rate": 6.381079978895654e-05, |
|
"loss": 0.4026, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.8412353197042193, |
|
"grad_norm": 0.47952057555190714, |
|
"learning_rate": 6.365348040876538e-05, |
|
"loss": 0.4081, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.8481948673336233, |
|
"grad_norm": 0.456755628226278, |
|
"learning_rate": 6.349559638812292e-05, |
|
"loss": 0.4002, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.8551544149630272, |
|
"grad_norm": 0.3949978072972249, |
|
"learning_rate": 6.333715149593351e-05, |
|
"loss": 0.4048, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.8621139625924314, |
|
"grad_norm": 0.3684718285439735, |
|
"learning_rate": 6.317814951449022e-05, |
|
"loss": 0.4044, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.8690735102218357, |
|
"grad_norm": 0.28515408422430655, |
|
"learning_rate": 6.301859423938463e-05, |
|
"loss": 0.4021, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.8760330578512396, |
|
"grad_norm": 0.31596820921849417, |
|
"learning_rate": 6.285848947941612e-05, |
|
"loss": 0.3998, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.8829926054806436, |
|
"grad_norm": 0.3692222807296291, |
|
"learning_rate": 6.26978390565011e-05, |
|
"loss": 0.4061, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.8899521531100478, |
|
"grad_norm": 0.36126506035909883, |
|
"learning_rate": 6.253664680558164e-05, |
|
"loss": 0.4081, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.896911700739452, |
|
"grad_norm": 0.2273722979571688, |
|
"learning_rate": 6.237491657453396e-05, |
|
"loss": 0.4013, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.903871248368856, |
|
"grad_norm": 0.2249012460180903, |
|
"learning_rate": 6.221265222407663e-05, |
|
"loss": 0.4073, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.91083079599826, |
|
"grad_norm": 0.26278543615349254, |
|
"learning_rate": 6.204985762767835e-05, |
|
"loss": 0.3981, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.9177903436276642, |
|
"grad_norm": 0.23928206927128529, |
|
"learning_rate": 6.188653667146551e-05, |
|
"loss": 0.4005, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.9247498912570684, |
|
"grad_norm": 0.28051087087380006, |
|
"learning_rate": 6.172269325412941e-05, |
|
"loss": 0.4047, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.9317094388864724, |
|
"grad_norm": 0.2991219685083212, |
|
"learning_rate": 6.15583312868332e-05, |
|
"loss": 0.4093, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.9386689865158764, |
|
"grad_norm": 0.3242054335075169, |
|
"learning_rate": 6.139345469311855e-05, |
|
"loss": 0.4114, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.9456285341452806, |
|
"grad_norm": 0.3988144729091857, |
|
"learning_rate": 6.122806740881191e-05, |
|
"loss": 0.4081, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.9525880817746848, |
|
"grad_norm": 0.5183725945183102, |
|
"learning_rate": 6.10621733819306e-05, |
|
"loss": 0.4048, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.9595476294040888, |
|
"grad_norm": 0.6027727793849302, |
|
"learning_rate": 6.089577657258863e-05, |
|
"loss": 0.3972, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.9665071770334928, |
|
"grad_norm": 0.6004923341243484, |
|
"learning_rate": 6.0728880952902056e-05, |
|
"loss": 0.3993, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.973466724662897, |
|
"grad_norm": 0.5380310616547116, |
|
"learning_rate": 6.056149050689419e-05, |
|
"loss": 0.3982, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.980426272292301, |
|
"grad_norm": 0.3634273851516844, |
|
"learning_rate": 6.039360923040059e-05, |
|
"loss": 0.4051, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.987385819921705, |
|
"grad_norm": 0.24841398249046384, |
|
"learning_rate": 6.0225241130973506e-05, |
|
"loss": 0.4044, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.9943453675511091, |
|
"grad_norm": 0.31467866229451, |
|
"learning_rate": 6.0056390227786366e-05, |
|
"loss": 0.4052, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.005219660722053, |
|
"grad_norm": 0.3996733942417249, |
|
"learning_rate": 5.9887060551537774e-05, |
|
"loss": 0.3765, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.012179208351457, |
|
"grad_norm": 0.3755337827846155, |
|
"learning_rate": 5.971725614435529e-05, |
|
"loss": 0.367, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.0191387559808613, |
|
"grad_norm": 0.37714284296853545, |
|
"learning_rate": 5.95469810596989e-05, |
|
"loss": 0.372, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.0260983036102655, |
|
"grad_norm": 0.34335888494135713, |
|
"learning_rate": 5.937623936226435e-05, |
|
"loss": 0.3655, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.0330578512396693, |
|
"grad_norm": 0.3214731030718956, |
|
"learning_rate": 5.9205035127886026e-05, |
|
"loss": 0.3596, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.0400173988690735, |
|
"grad_norm": 0.3037344199720621, |
|
"learning_rate": 5.903337244343972e-05, |
|
"loss": 0.365, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.0469769464984777, |
|
"grad_norm": 0.29558276490238256, |
|
"learning_rate": 5.8861255406745e-05, |
|
"loss": 0.3655, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.0539364941278815, |
|
"grad_norm": 0.3239986635139212, |
|
"learning_rate": 5.8688688126467514e-05, |
|
"loss": 0.3737, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.0608960417572857, |
|
"grad_norm": 0.5323428158934608, |
|
"learning_rate": 5.8515674722020745e-05, |
|
"loss": 0.3691, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.06785558938669, |
|
"grad_norm": 0.6310404829026035, |
|
"learning_rate": 5.834221932346781e-05, |
|
"loss": 0.3742, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.074815137016094, |
|
"grad_norm": 0.4806664035734928, |
|
"learning_rate": 5.8168326071422815e-05, |
|
"loss": 0.3655, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.081774684645498, |
|
"grad_norm": 0.3081849802706308, |
|
"learning_rate": 5.799399911695201e-05, |
|
"loss": 0.3759, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.088734232274902, |
|
"grad_norm": 0.384198968174172, |
|
"learning_rate": 5.781924262147471e-05, |
|
"loss": 0.3618, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.0956937799043063, |
|
"grad_norm": 0.3402446023172264, |
|
"learning_rate": 5.7644060756663954e-05, |
|
"loss": 0.3706, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.1026533275337105, |
|
"grad_norm": 0.2804847723920022, |
|
"learning_rate": 5.746845770434692e-05, |
|
"loss": 0.3645, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.1096128751631142, |
|
"grad_norm": 0.33735637577255784, |
|
"learning_rate": 5.7292437656405094e-05, |
|
"loss": 0.3694, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.1165724227925184, |
|
"grad_norm": 0.29830282676341935, |
|
"learning_rate": 5.711600481467422e-05, |
|
"loss": 0.3661, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.1235319704219227, |
|
"grad_norm": 0.26892805785374885, |
|
"learning_rate": 5.693916339084397e-05, |
|
"loss": 0.365, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.130491518051327, |
|
"grad_norm": 0.2924956628530246, |
|
"learning_rate": 5.676191760635744e-05, |
|
"loss": 0.3682, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.1374510656807306, |
|
"grad_norm": 0.3322840841302584, |
|
"learning_rate": 5.6584271692310345e-05, |
|
"loss": 0.3591, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.144410613310135, |
|
"grad_norm": 0.31939252409902796, |
|
"learning_rate": 5.640622988935006e-05, |
|
"loss": 0.366, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.151370160939539, |
|
"grad_norm": 0.3675500408960479, |
|
"learning_rate": 5.6227796447574296e-05, |
|
"loss": 0.3721, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.1583297085689432, |
|
"grad_norm": 0.4269918712695122, |
|
"learning_rate": 5.604897562642979e-05, |
|
"loss": 0.3691, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.165289256198347, |
|
"grad_norm": 0.32932381192451604, |
|
"learning_rate": 5.58697716946105e-05, |
|
"loss": 0.3695, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.172248803827751, |
|
"grad_norm": 0.23672816365722077, |
|
"learning_rate": 5.5690188929955756e-05, |
|
"loss": 0.3718, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.1792083514571554, |
|
"grad_norm": 0.2685885863825375, |
|
"learning_rate": 5.5510231619348154e-05, |
|
"loss": 0.3626, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.186167899086559, |
|
"grad_norm": 0.31894739343841294, |
|
"learning_rate": 5.5329904058611195e-05, |
|
"loss": 0.3696, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.1931274467159634, |
|
"grad_norm": 0.2690402159463182, |
|
"learning_rate": 5.514921055240674e-05, |
|
"loss": 0.3664, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.2000869943453676, |
|
"grad_norm": 0.36597319341247037, |
|
"learning_rate": 5.4968155414132294e-05, |
|
"loss": 0.3661, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.207046541974772, |
|
"grad_norm": 0.2509160139038948, |
|
"learning_rate": 5.4786742965817964e-05, |
|
"loss": 0.3737, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.2140060896041756, |
|
"grad_norm": 0.22135896086783516, |
|
"learning_rate": 5.4604977538023375e-05, |
|
"loss": 0.3651, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.2209656372335798, |
|
"grad_norm": 0.21117176336652171, |
|
"learning_rate": 5.442286346973419e-05, |
|
"loss": 0.3694, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.227925184862984, |
|
"grad_norm": 0.21238366241650145, |
|
"learning_rate": 5.424040510825867e-05, |
|
"loss": 0.3724, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.234884732492388, |
|
"grad_norm": 0.18338158702931512, |
|
"learning_rate": 5.405760680912374e-05, |
|
"loss": 0.3706, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.241844280121792, |
|
"grad_norm": 0.21207069234590878, |
|
"learning_rate": 5.387447293597113e-05, |
|
"loss": 0.3612, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.248803827751196, |
|
"grad_norm": 0.2196618454401931, |
|
"learning_rate": 5.3691007860453185e-05, |
|
"loss": 0.3706, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.2557633753806003, |
|
"grad_norm": 0.28738964583045895, |
|
"learning_rate": 5.3507215962128485e-05, |
|
"loss": 0.3665, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.2627229230100045, |
|
"grad_norm": 0.2285542763691958, |
|
"learning_rate": 5.332310162835729e-05, |
|
"loss": 0.371, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.2696824706394083, |
|
"grad_norm": 0.25316534475803026, |
|
"learning_rate": 5.313866925419685e-05, |
|
"loss": 0.368, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.2766420182688125, |
|
"grad_norm": 0.30304672804412763, |
|
"learning_rate": 5.295392324229648e-05, |
|
"loss": 0.3681, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.2836015658982167, |
|
"grad_norm": 0.31451674332345964, |
|
"learning_rate": 5.276886800279243e-05, |
|
"loss": 0.367, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.2905611135276205, |
|
"grad_norm": 0.3565352812109271, |
|
"learning_rate": 5.2583507953202654e-05, |
|
"loss": 0.3689, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.2975206611570247, |
|
"grad_norm": 0.2608640444128353, |
|
"learning_rate": 5.239784751832128e-05, |
|
"loss": 0.3708, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.304480208786429, |
|
"grad_norm": 0.17931678523987182, |
|
"learning_rate": 5.221189113011309e-05, |
|
"loss": 0.3681, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.311439756415833, |
|
"grad_norm": 0.24872188995030328, |
|
"learning_rate": 5.2025643227607656e-05, |
|
"loss": 0.366, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.3183993040452373, |
|
"grad_norm": 0.2145763484568399, |
|
"learning_rate": 5.18391082567934e-05, |
|
"loss": 0.3608, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.325358851674641, |
|
"grad_norm": 0.2164912354682701, |
|
"learning_rate": 5.1652290670511396e-05, |
|
"loss": 0.3715, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.3323183993040453, |
|
"grad_norm": 0.2248326733820592, |
|
"learning_rate": 5.1465194928349215e-05, |
|
"loss": 0.3723, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.3392779469334495, |
|
"grad_norm": 0.23611123081541333, |
|
"learning_rate": 5.127782549653431e-05, |
|
"loss": 0.368, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.3462374945628532, |
|
"grad_norm": 0.27552200066945187, |
|
"learning_rate": 5.1090186847827535e-05, |
|
"loss": 0.3681, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.3531970421922574, |
|
"grad_norm": 0.2473752934953281, |
|
"learning_rate": 5.090228346141626e-05, |
|
"loss": 0.3705, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.3601565898216617, |
|
"grad_norm": 0.1964697404165815, |
|
"learning_rate": 5.071411982280754e-05, |
|
"loss": 0.3694, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.367116137451066, |
|
"grad_norm": 0.18565136863582019, |
|
"learning_rate": 5.0525700423720964e-05, |
|
"loss": 0.3676, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.3740756850804696, |
|
"grad_norm": 0.2127557318833584, |
|
"learning_rate": 5.033702976198154e-05, |
|
"loss": 0.3652, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.381035232709874, |
|
"grad_norm": 0.22768416565210678, |
|
"learning_rate": 5.0148112341412155e-05, |
|
"loss": 0.3627, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.387994780339278, |
|
"grad_norm": 0.20405791268703083, |
|
"learning_rate": 4.9958952671726214e-05, |
|
"loss": 0.3645, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.394954327968682, |
|
"grad_norm": 0.18709352719221842, |
|
"learning_rate": 4.976955526841995e-05, |
|
"loss": 0.3744, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.401913875598086, |
|
"grad_norm": 0.2257696533468009, |
|
"learning_rate": 4.9579924652664624e-05, |
|
"loss": 0.3659, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.40887342322749, |
|
"grad_norm": 0.22579970351795095, |
|
"learning_rate": 4.939006535119851e-05, |
|
"loss": 0.3721, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.4158329708568944, |
|
"grad_norm": 0.1695044273561281, |
|
"learning_rate": 4.919998189621902e-05, |
|
"loss": 0.3717, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.4227925184862986, |
|
"grad_norm": 0.1868306496768378, |
|
"learning_rate": 4.9009678825274344e-05, |
|
"loss": 0.37, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.4297520661157024, |
|
"grad_norm": 0.203208513247505, |
|
"learning_rate": 4.8819160681155245e-05, |
|
"loss": 0.3687, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.4367116137451066, |
|
"grad_norm": 0.18225921597742265, |
|
"learning_rate": 4.8628432011786536e-05, |
|
"loss": 0.3722, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.443671161374511, |
|
"grad_norm": 0.19704873995226552, |
|
"learning_rate": 4.843749737011858e-05, |
|
"loss": 0.3767, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.4506307090039146, |
|
"grad_norm": 0.17968261581935796, |
|
"learning_rate": 4.8246361314018566e-05, |
|
"loss": 0.3674, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.4575902566333188, |
|
"grad_norm": 0.16849328618057585, |
|
"learning_rate": 4.805502840616171e-05, |
|
"loss": 0.3676, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.464549804262723, |
|
"grad_norm": 0.17392306484481443, |
|
"learning_rate": 4.786350321392237e-05, |
|
"loss": 0.3598, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.471509351892127, |
|
"grad_norm": 0.15047667342669488, |
|
"learning_rate": 4.767179030926492e-05, |
|
"loss": 0.3626, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.478468899521531, |
|
"grad_norm": 0.18466737228109523, |
|
"learning_rate": 4.7479894268634794e-05, |
|
"loss": 0.3644, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.485428447150935, |
|
"grad_norm": 0.19259024117568577, |
|
"learning_rate": 4.728781967284904e-05, |
|
"loss": 0.3666, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.4923879947803393, |
|
"grad_norm": 0.1658133386266871, |
|
"learning_rate": 4.7095571106987096e-05, |
|
"loss": 0.3706, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.4993475424097435, |
|
"grad_norm": 0.1857293176933773, |
|
"learning_rate": 4.6903153160281266e-05, |
|
"loss": 0.3658, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.5063070900391473, |
|
"grad_norm": 0.1780806265365307, |
|
"learning_rate": 4.671057042600728e-05, |
|
"loss": 0.37, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.5132666376685515, |
|
"grad_norm": 0.18451738808389523, |
|
"learning_rate": 4.6517827501374466e-05, |
|
"loss": 0.367, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.5202261852979557, |
|
"grad_norm": 0.19964181415078697, |
|
"learning_rate": 4.632492898741619e-05, |
|
"loss": 0.3679, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.52718573292736, |
|
"grad_norm": 0.20121807599019395, |
|
"learning_rate": 4.61318794888799e-05, |
|
"loss": 0.3664, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.5341452805567637, |
|
"grad_norm": 0.1711607561791567, |
|
"learning_rate": 4.593868361411729e-05, |
|
"loss": 0.3719, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.541104828186168, |
|
"grad_norm": 0.20920063909952305, |
|
"learning_rate": 4.57453459749742e-05, |
|
"loss": 0.3677, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.548064375815572, |
|
"grad_norm": 0.20148754034737523, |
|
"learning_rate": 4.555187118668064e-05, |
|
"loss": 0.3715, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.555023923444976, |
|
"grad_norm": 0.16950935825604083, |
|
"learning_rate": 4.53582638677405e-05, |
|
"loss": 0.3668, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.56198347107438, |
|
"grad_norm": 0.18835333573041302, |
|
"learning_rate": 4.516452863982138e-05, |
|
"loss": 0.3642, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.5689430187037843, |
|
"grad_norm": 0.19215610553252904, |
|
"learning_rate": 4.497067012764423e-05, |
|
"loss": 0.3691, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.5759025663331885, |
|
"grad_norm": 0.14122968503331657, |
|
"learning_rate": 4.477669295887299e-05, |
|
"loss": 0.3682, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.5828621139625927, |
|
"grad_norm": 0.21501799625798765, |
|
"learning_rate": 4.458260176400404e-05, |
|
"loss": 0.3778, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.5898216615919964, |
|
"grad_norm": 0.19997905382170258, |
|
"learning_rate": 4.4388401176255765e-05, |
|
"loss": 0.367, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.5967812092214007, |
|
"grad_norm": 0.16385556465949214, |
|
"learning_rate": 4.419409583145787e-05, |
|
"loss": 0.3671, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.603740756850805, |
|
"grad_norm": 0.21677818126974885, |
|
"learning_rate": 4.3999690367940796e-05, |
|
"loss": 0.3685, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.6107003044802086, |
|
"grad_norm": 0.20763772842399864, |
|
"learning_rate": 4.3805189426424895e-05, |
|
"loss": 0.3637, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.617659852109613, |
|
"grad_norm": 0.15620005918797172, |
|
"learning_rate": 4.361059764990977e-05, |
|
"loss": 0.3612, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.624619399739017, |
|
"grad_norm": 0.21129929527164582, |
|
"learning_rate": 4.341591968356332e-05, |
|
"loss": 0.36, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 0.2263848905453181, |
|
"learning_rate": 4.322116017461094e-05, |
|
"loss": 0.367, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.638538494997825, |
|
"grad_norm": 0.19202782447376043, |
|
"learning_rate": 4.3026323772224564e-05, |
|
"loss": 0.3741, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.645498042627229, |
|
"grad_norm": 0.21118865625881889, |
|
"learning_rate": 4.283141512741168e-05, |
|
"loss": 0.3613, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.6524575902566334, |
|
"grad_norm": 0.22052270512412744, |
|
"learning_rate": 4.263643889290425e-05, |
|
"loss": 0.3772, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.659417137886037, |
|
"grad_norm": 0.22167433442606896, |
|
"learning_rate": 4.244139972304775e-05, |
|
"loss": 0.3679, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.6663766855154414, |
|
"grad_norm": 0.1649098717811062, |
|
"learning_rate": 4.224630227368998e-05, |
|
"loss": 0.37, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.6733362331448456, |
|
"grad_norm": 0.16002894116004632, |
|
"learning_rate": 4.2051151202069976e-05, |
|
"loss": 0.3687, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.68029578077425, |
|
"grad_norm": 0.20116234602993452, |
|
"learning_rate": 4.1855951166706783e-05, |
|
"loss": 0.3658, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.687255328403654, |
|
"grad_norm": 0.1867737945734333, |
|
"learning_rate": 4.166070682728826e-05, |
|
"loss": 0.3636, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.6942148760330578, |
|
"grad_norm": 0.20649674616661776, |
|
"learning_rate": 4.1465422844559914e-05, |
|
"loss": 0.369, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.701174423662462, |
|
"grad_norm": 0.2019593756587488, |
|
"learning_rate": 4.127010388021355e-05, |
|
"loss": 0.3707, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.708133971291866, |
|
"grad_norm": 0.18670876706954212, |
|
"learning_rate": 4.1074754596776076e-05, |
|
"loss": 0.3723, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.71509351892127, |
|
"grad_norm": 0.1760226934500404, |
|
"learning_rate": 4.087937965749808e-05, |
|
"loss": 0.3652, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.722053066550674, |
|
"grad_norm": 0.19645887620326913, |
|
"learning_rate": 4.068398372624268e-05, |
|
"loss": 0.3599, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.7290126141800783, |
|
"grad_norm": 0.19578274938214618, |
|
"learning_rate": 4.0488571467374035e-05, |
|
"loss": 0.3581, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.7359721618094826, |
|
"grad_norm": 0.18492474565139713, |
|
"learning_rate": 4.02931475456461e-05, |
|
"loss": 0.3685, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.7429317094388863, |
|
"grad_norm": 0.19837302215210767, |
|
"learning_rate": 4.009771662609122e-05, |
|
"loss": 0.3672, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.7498912570682905, |
|
"grad_norm": 0.19983845339321854, |
|
"learning_rate": 3.990228337390879e-05, |
|
"loss": 0.37, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.7568508046976947, |
|
"grad_norm": 0.21784022283680937, |
|
"learning_rate": 3.970685245435391e-05, |
|
"loss": 0.3654, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.7638103523270985, |
|
"grad_norm": 0.1710051588696074, |
|
"learning_rate": 3.951142853262598e-05, |
|
"loss": 0.3672, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.7707698999565027, |
|
"grad_norm": 0.1838813552978722, |
|
"learning_rate": 3.931601627375733e-05, |
|
"loss": 0.3657, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.777729447585907, |
|
"grad_norm": 0.16862615564852854, |
|
"learning_rate": 3.9120620342501934e-05, |
|
"loss": 0.3638, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.784688995215311, |
|
"grad_norm": 0.1799106397970807, |
|
"learning_rate": 3.8925245403223944e-05, |
|
"loss": 0.3643, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.7916485428447153, |
|
"grad_norm": 0.15331361673144303, |
|
"learning_rate": 3.872989611978644e-05, |
|
"loss": 0.3629, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.798608090474119, |
|
"grad_norm": 0.15271887998704453, |
|
"learning_rate": 3.85345771554401e-05, |
|
"loss": 0.3652, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.8055676381035233, |
|
"grad_norm": 0.15620132253760882, |
|
"learning_rate": 3.833929317271175e-05, |
|
"loss": 0.3602, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.8125271857329275, |
|
"grad_norm": 0.15541759415132128, |
|
"learning_rate": 3.814404883329324e-05, |
|
"loss": 0.3696, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.8194867333623312, |
|
"grad_norm": 0.1614291271877618, |
|
"learning_rate": 3.794884879793004e-05, |
|
"loss": 0.3657, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.8264462809917354, |
|
"grad_norm": 0.13456071859787921, |
|
"learning_rate": 3.7753697726310026e-05, |
|
"loss": 0.3646, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.8334058286211397, |
|
"grad_norm": 0.1451353124821423, |
|
"learning_rate": 3.755860027695225e-05, |
|
"loss": 0.3706, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.840365376250544, |
|
"grad_norm": 0.14928014331184716, |
|
"learning_rate": 3.7363561107095765e-05, |
|
"loss": 0.3677, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.847324923879948, |
|
"grad_norm": 0.13937655853284261, |
|
"learning_rate": 3.7168584872588336e-05, |
|
"loss": 0.3642, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.854284471509352, |
|
"grad_norm": 0.16336855650022625, |
|
"learning_rate": 3.697367622777545e-05, |
|
"loss": 0.3632, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.861244019138756, |
|
"grad_norm": 0.1545871776685994, |
|
"learning_rate": 3.677883982538907e-05, |
|
"loss": 0.3703, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.86820356676816, |
|
"grad_norm": 0.14861237645334818, |
|
"learning_rate": 3.6584080316436696e-05, |
|
"loss": 0.3632, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.875163114397564, |
|
"grad_norm": 0.1386742246058214, |
|
"learning_rate": 3.638940235009025e-05, |
|
"loss": 0.3691, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.882122662026968, |
|
"grad_norm": 0.1496262020436169, |
|
"learning_rate": 3.619481057357511e-05, |
|
"loss": 0.3649, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.8890822096563724, |
|
"grad_norm": 0.1274326314422379, |
|
"learning_rate": 3.600030963205922e-05, |
|
"loss": 0.3702, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.8960417572857766, |
|
"grad_norm": 0.14829018850055273, |
|
"learning_rate": 3.580590416854214e-05, |
|
"loss": 0.3641, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.9030013049151804, |
|
"grad_norm": 0.15397049410838218, |
|
"learning_rate": 3.561159882374425e-05, |
|
"loss": 0.3655, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.9099608525445846, |
|
"grad_norm": 0.149564207070461, |
|
"learning_rate": 3.541739823599598e-05, |
|
"loss": 0.3638, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.916920400173989, |
|
"grad_norm": 0.14607395090587116, |
|
"learning_rate": 3.5223307041127025e-05, |
|
"loss": 0.3675, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.9238799478033926, |
|
"grad_norm": 0.17923632050969646, |
|
"learning_rate": 3.502932987235577e-05, |
|
"loss": 0.369, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.9308394954327968, |
|
"grad_norm": 0.1538306766022074, |
|
"learning_rate": 3.4835471360178626e-05, |
|
"loss": 0.369, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.937799043062201, |
|
"grad_norm": 0.17364066496444236, |
|
"learning_rate": 3.464173613225951e-05, |
|
"loss": 0.3678, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.944758590691605, |
|
"grad_norm": 0.14792158024656513, |
|
"learning_rate": 3.4448128813319365e-05, |
|
"loss": 0.3706, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.9517181383210094, |
|
"grad_norm": 0.16703857041888837, |
|
"learning_rate": 3.425465402502581e-05, |
|
"loss": 0.3668, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.958677685950413, |
|
"grad_norm": 0.16247636991051975, |
|
"learning_rate": 3.406131638588273e-05, |
|
"loss": 0.3613, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.9656372335798173, |
|
"grad_norm": 0.14211396104389656, |
|
"learning_rate": 3.386812051112011e-05, |
|
"loss": 0.3678, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.9725967812092216, |
|
"grad_norm": 0.1664600329507516, |
|
"learning_rate": 3.367507101258382e-05, |
|
"loss": 0.359, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.9795563288386253, |
|
"grad_norm": 0.15867128125681806, |
|
"learning_rate": 3.348217249862555e-05, |
|
"loss": 0.3749, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.9865158764680295, |
|
"grad_norm": 0.12942815057080284, |
|
"learning_rate": 3.328942957399274e-05, |
|
"loss": 0.3692, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.9934754240974337, |
|
"grad_norm": 0.14130545610018397, |
|
"learning_rate": 3.309684683971874e-05, |
|
"loss": 0.3673, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.0043497172683775, |
|
"grad_norm": 0.16060728870914467, |
|
"learning_rate": 3.2904428893012924e-05, |
|
"loss": 0.3474, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.0113092648977817, |
|
"grad_norm": 0.1671503265033675, |
|
"learning_rate": 3.2712180327150965e-05, |
|
"loss": 0.3352, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.018268812527186, |
|
"grad_norm": 0.17602527650846453, |
|
"learning_rate": 3.252010573136521e-05, |
|
"loss": 0.3334, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.0252283601565897, |
|
"grad_norm": 0.19388540072595992, |
|
"learning_rate": 3.2328209690735085e-05, |
|
"loss": 0.3368, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.032187907785994, |
|
"grad_norm": 0.18871272314861076, |
|
"learning_rate": 3.213649678607765e-05, |
|
"loss": 0.3276, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.039147455415398, |
|
"grad_norm": 0.1825751881244417, |
|
"learning_rate": 3.19449715938383e-05, |
|
"loss": 0.3264, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.046107003044802, |
|
"grad_norm": 0.20536550174110904, |
|
"learning_rate": 3.175363868598145e-05, |
|
"loss": 0.3336, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.053066550674206, |
|
"grad_norm": 0.19028693715972822, |
|
"learning_rate": 3.1562502629881435e-05, |
|
"loss": 0.3361, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.0600260983036103, |
|
"grad_norm": 0.19144645634840593, |
|
"learning_rate": 3.137156798821347e-05, |
|
"loss": 0.3295, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.0669856459330145, |
|
"grad_norm": 0.15458500965875127, |
|
"learning_rate": 3.118083931884477e-05, |
|
"loss": 0.3325, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.0739451935624182, |
|
"grad_norm": 0.16190332070850957, |
|
"learning_rate": 3.099032117472567e-05, |
|
"loss": 0.324, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.0809047411918224, |
|
"grad_norm": 0.1931582699954326, |
|
"learning_rate": 3.0800018103780997e-05, |
|
"loss": 0.3319, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.0878642888212267, |
|
"grad_norm": 0.15544825033730889, |
|
"learning_rate": 3.060993464880151e-05, |
|
"loss": 0.3312, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.094823836450631, |
|
"grad_norm": 0.19712552027556793, |
|
"learning_rate": 3.0420075347335403e-05, |
|
"loss": 0.3358, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.1017833840800346, |
|
"grad_norm": 0.16525888661776322, |
|
"learning_rate": 3.023044473158004e-05, |
|
"loss": 0.3286, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.108742931709439, |
|
"grad_norm": 0.16251875749806674, |
|
"learning_rate": 3.0041047328273786e-05, |
|
"loss": 0.3371, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.115702479338843, |
|
"grad_norm": 0.14251069293212443, |
|
"learning_rate": 2.9851887658587865e-05, |
|
"loss": 0.3323, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.1226620269682472, |
|
"grad_norm": 0.14225759957337078, |
|
"learning_rate": 2.9662970238018472e-05, |
|
"loss": 0.3323, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.129621574597651, |
|
"grad_norm": 0.12972258276234128, |
|
"learning_rate": 2.947429957627904e-05, |
|
"loss": 0.3289, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.136581122227055, |
|
"grad_norm": 0.13682940985467507, |
|
"learning_rate": 2.9285880177192475e-05, |
|
"loss": 0.3265, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.1435406698564594, |
|
"grad_norm": 0.12277970656857606, |
|
"learning_rate": 2.9097716538583746e-05, |
|
"loss": 0.3282, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.1505002174858636, |
|
"grad_norm": 0.1543558782856671, |
|
"learning_rate": 2.8909813152172472e-05, |
|
"loss": 0.3335, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.1574597651152674, |
|
"grad_norm": 0.14314974220458332, |
|
"learning_rate": 2.8722174503465697e-05, |
|
"loss": 0.3367, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.1644193127446716, |
|
"grad_norm": 0.133141477333654, |
|
"learning_rate": 2.8534805071650802e-05, |
|
"loss": 0.3306, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.171378860374076, |
|
"grad_norm": 0.15733593589203457, |
|
"learning_rate": 2.834770932948862e-05, |
|
"loss": 0.334, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.17833840800348, |
|
"grad_norm": 0.14584273626489633, |
|
"learning_rate": 2.816089174320663e-05, |
|
"loss": 0.3325, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.1852979556328838, |
|
"grad_norm": 0.14481726585440186, |
|
"learning_rate": 2.7974356772392347e-05, |
|
"loss": 0.3381, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.192257503262288, |
|
"grad_norm": 0.16331363108391558, |
|
"learning_rate": 2.7788108869886917e-05, |
|
"loss": 0.334, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.199217050891692, |
|
"grad_norm": 0.11664377872886357, |
|
"learning_rate": 2.7602152481678726e-05, |
|
"loss": 0.3308, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.206176598521096, |
|
"grad_norm": 0.14495158600624636, |
|
"learning_rate": 2.741649204679736e-05, |
|
"loss": 0.3336, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.2131361461505, |
|
"grad_norm": 0.1229178497126909, |
|
"learning_rate": 2.723113199720757e-05, |
|
"loss": 0.3386, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.2200956937799043, |
|
"grad_norm": 0.12926743116724196, |
|
"learning_rate": 2.7046076757703524e-05, |
|
"loss": 0.3358, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.2270552414093086, |
|
"grad_norm": 0.1313312996235397, |
|
"learning_rate": 2.6861330745803167e-05, |
|
"loss": 0.3397, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.2340147890387123, |
|
"grad_norm": 0.1309779972337031, |
|
"learning_rate": 2.6676898371642726e-05, |
|
"loss": 0.3338, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.2409743366681165, |
|
"grad_norm": 0.11811653221059722, |
|
"learning_rate": 2.6492784037871532e-05, |
|
"loss": 0.3316, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.2479338842975207, |
|
"grad_norm": 0.1434333839230034, |
|
"learning_rate": 2.6308992139546825e-05, |
|
"loss": 0.3348, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.254893431926925, |
|
"grad_norm": 0.11122333012963419, |
|
"learning_rate": 2.6125527064028874e-05, |
|
"loss": 0.3351, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.2618529795563287, |
|
"grad_norm": 0.1403831175014317, |
|
"learning_rate": 2.5942393190876268e-05, |
|
"loss": 0.3301, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.268812527185733, |
|
"grad_norm": 0.11875350967381776, |
|
"learning_rate": 2.5759594891741345e-05, |
|
"loss": 0.3361, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.275772074815137, |
|
"grad_norm": 0.1352653946139823, |
|
"learning_rate": 2.55771365302658e-05, |
|
"loss": 0.3293, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.2827316224445413, |
|
"grad_norm": 0.12259046772867968, |
|
"learning_rate": 2.539502246197663e-05, |
|
"loss": 0.3317, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.289691170073945, |
|
"grad_norm": 0.1341784005406857, |
|
"learning_rate": 2.5213257034182042e-05, |
|
"loss": 0.3336, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.2966507177033493, |
|
"grad_norm": 0.13864288710375855, |
|
"learning_rate": 2.503184458586772e-05, |
|
"loss": 0.3368, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.3036102653327535, |
|
"grad_norm": 0.13118987129584506, |
|
"learning_rate": 2.4850789447593276e-05, |
|
"loss": 0.3367, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.3105698129621572, |
|
"grad_norm": 0.1377873050994517, |
|
"learning_rate": 2.4670095941388822e-05, |
|
"loss": 0.3388, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.3175293605915614, |
|
"grad_norm": 0.12322175327795074, |
|
"learning_rate": 2.4489768380651856e-05, |
|
"loss": 0.3333, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.3244889082209657, |
|
"grad_norm": 0.13741800125839718, |
|
"learning_rate": 2.4309811070044247e-05, |
|
"loss": 0.3327, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.33144845585037, |
|
"grad_norm": 0.11467772477556636, |
|
"learning_rate": 2.4130228305389514e-05, |
|
"loss": 0.329, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.3384080034797736, |
|
"grad_norm": 0.13098647128626187, |
|
"learning_rate": 2.3951024373570214e-05, |
|
"loss": 0.3373, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.345367551109178, |
|
"grad_norm": 0.10605964176743833, |
|
"learning_rate": 2.3772203552425717e-05, |
|
"loss": 0.3276, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.352327098738582, |
|
"grad_norm": 0.1369975822039289, |
|
"learning_rate": 2.3593770110649966e-05, |
|
"loss": 0.3287, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.3592866463679862, |
|
"grad_norm": 0.10746583370109318, |
|
"learning_rate": 2.341572830768965e-05, |
|
"loss": 0.3247, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.36624619399739, |
|
"grad_norm": 0.12619643852285148, |
|
"learning_rate": 2.323808239364256e-05, |
|
"loss": 0.3334, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.373205741626794, |
|
"grad_norm": 0.11299951719427818, |
|
"learning_rate": 2.306083660915604e-05, |
|
"loss": 0.3314, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.3801652892561984, |
|
"grad_norm": 0.12698843914556834, |
|
"learning_rate": 2.2883995185325797e-05, |
|
"loss": 0.3269, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.3871248368856026, |
|
"grad_norm": 0.11590705576436211, |
|
"learning_rate": 2.2707562343594916e-05, |
|
"loss": 0.3378, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.3940843845150064, |
|
"grad_norm": 0.12503790075980037, |
|
"learning_rate": 2.2531542295653094e-05, |
|
"loss": 0.336, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.4010439321444106, |
|
"grad_norm": 0.11611542174394521, |
|
"learning_rate": 2.235593924333607e-05, |
|
"loss": 0.3347, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.408003479773815, |
|
"grad_norm": 0.12221901254166186, |
|
"learning_rate": 2.21807573785253e-05, |
|
"loss": 0.3333, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.4149630274032186, |
|
"grad_norm": 0.12496755906276005, |
|
"learning_rate": 2.2006000883048008e-05, |
|
"loss": 0.331, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.4219225750326228, |
|
"grad_norm": 0.1160790840797104, |
|
"learning_rate": 2.183167392857719e-05, |
|
"loss": 0.3347, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.428882122662027, |
|
"grad_norm": 0.12271399113397469, |
|
"learning_rate": 2.1657780676532205e-05, |
|
"loss": 0.3371, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.435841670291431, |
|
"grad_norm": 0.11513590243752209, |
|
"learning_rate": 2.1484325277979278e-05, |
|
"loss": 0.3336, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.4428012179208354, |
|
"grad_norm": 0.13194006381727105, |
|
"learning_rate": 2.1311311873532502e-05, |
|
"loss": 0.3346, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.449760765550239, |
|
"grad_norm": 0.10982920455618568, |
|
"learning_rate": 2.1138744593254997e-05, |
|
"loss": 0.3304, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.4567203131796433, |
|
"grad_norm": 0.1485401459853815, |
|
"learning_rate": 2.09666275565603e-05, |
|
"loss": 0.3296, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.4636798608090476, |
|
"grad_norm": 0.12077637074742696, |
|
"learning_rate": 2.0794964872113987e-05, |
|
"loss": 0.3354, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.4706394084384513, |
|
"grad_norm": 0.12161585081850207, |
|
"learning_rate": 2.062376063773567e-05, |
|
"loss": 0.3273, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.4775989560678555, |
|
"grad_norm": 0.11631137052693763, |
|
"learning_rate": 2.045301894030111e-05, |
|
"loss": 0.3358, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.4845585036972597, |
|
"grad_norm": 0.12349941669465997, |
|
"learning_rate": 2.0282743855644727e-05, |
|
"loss": 0.3297, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.491518051326664, |
|
"grad_norm": 0.10080665328455962, |
|
"learning_rate": 2.011293944846222e-05, |
|
"loss": 0.3322, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.4984775989560677, |
|
"grad_norm": 0.12452596754190881, |
|
"learning_rate": 1.994360977221364e-05, |
|
"loss": 0.3378, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.505437146585472, |
|
"grad_norm": 0.10178078398456272, |
|
"learning_rate": 1.97747588690265e-05, |
|
"loss": 0.3254, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.512396694214876, |
|
"grad_norm": 0.11961132752342465, |
|
"learning_rate": 1.9606390769599426e-05, |
|
"loss": 0.3325, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.51935624184428, |
|
"grad_norm": 0.10380768215954524, |
|
"learning_rate": 1.9438509493105816e-05, |
|
"loss": 0.3301, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.526315789473684, |
|
"grad_norm": 0.10192138810793511, |
|
"learning_rate": 1.9271119047097967e-05, |
|
"loss": 0.3343, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.5332753371030883, |
|
"grad_norm": 0.11375055774791937, |
|
"learning_rate": 1.910422342741136e-05, |
|
"loss": 0.3354, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.5402348847324925, |
|
"grad_norm": 0.09713201709606016, |
|
"learning_rate": 1.8937826618069396e-05, |
|
"loss": 0.3326, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.5471944323618967, |
|
"grad_norm": 0.11482174079205225, |
|
"learning_rate": 1.8771932591188106e-05, |
|
"loss": 0.3358, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.5541539799913004, |
|
"grad_norm": 0.1085601568655917, |
|
"learning_rate": 1.860654530688147e-05, |
|
"loss": 0.3316, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.5611135276207047, |
|
"grad_norm": 0.11763399705418416, |
|
"learning_rate": 1.84416687131668e-05, |
|
"loss": 0.3296, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.568073075250109, |
|
"grad_norm": 0.11136798232627028, |
|
"learning_rate": 1.8277306745870605e-05, |
|
"loss": 0.3328, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.5750326228795126, |
|
"grad_norm": 0.12672402635855642, |
|
"learning_rate": 1.811346332853451e-05, |
|
"loss": 0.332, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.581992170508917, |
|
"grad_norm": 0.11623196365691729, |
|
"learning_rate": 1.7950142372321658e-05, |
|
"loss": 0.332, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.588951718138321, |
|
"grad_norm": 0.107315476806856, |
|
"learning_rate": 1.778734777592337e-05, |
|
"loss": 0.3317, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.5959112657677252, |
|
"grad_norm": 0.11801835716954666, |
|
"learning_rate": 1.7625083425466044e-05, |
|
"loss": 0.3339, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.6028708133971294, |
|
"grad_norm": 0.0966733318424344, |
|
"learning_rate": 1.746335319441838e-05, |
|
"loss": 0.3254, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.609830361026533, |
|
"grad_norm": 0.11771321695825486, |
|
"learning_rate": 1.7302160943498916e-05, |
|
"loss": 0.3354, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.6167899086559374, |
|
"grad_norm": 0.10460865568553186, |
|
"learning_rate": 1.7141510520583887e-05, |
|
"loss": 0.3305, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.6237494562853416, |
|
"grad_norm": 0.11996000392740622, |
|
"learning_rate": 1.698140576061538e-05, |
|
"loss": 0.339, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.6307090039147454, |
|
"grad_norm": 0.09741048167997303, |
|
"learning_rate": 1.6821850485509784e-05, |
|
"loss": 0.3366, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.6376685515441496, |
|
"grad_norm": 0.1142703570198178, |
|
"learning_rate": 1.6662848504066502e-05, |
|
"loss": 0.3337, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.644628099173554, |
|
"grad_norm": 0.09709378850935774, |
|
"learning_rate": 1.6504403611877098e-05, |
|
"loss": 0.3322, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.651587646802958, |
|
"grad_norm": 0.10615913057072311, |
|
"learning_rate": 1.6346519591234637e-05, |
|
"loss": 0.3325, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.6585471944323618, |
|
"grad_norm": 0.1156860445622765, |
|
"learning_rate": 1.6189200211043484e-05, |
|
"loss": 0.3347, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.665506742061766, |
|
"grad_norm": 0.10972351654483692, |
|
"learning_rate": 1.6032449226729195e-05, |
|
"loss": 0.3354, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.67246628969117, |
|
"grad_norm": 0.12167997828635133, |
|
"learning_rate": 1.5876270380149038e-05, |
|
"loss": 0.3371, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.679425837320574, |
|
"grad_norm": 0.1227635833803301, |
|
"learning_rate": 1.57206673995025e-05, |
|
"loss": 0.3303, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.686385384949978, |
|
"grad_norm": 0.10671146343573032, |
|
"learning_rate": 1.556564399924244e-05, |
|
"loss": 0.3301, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.6933449325793823, |
|
"grad_norm": 0.11109924997469728, |
|
"learning_rate": 1.541120387998631e-05, |
|
"loss": 0.3295, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.7003044802087866, |
|
"grad_norm": 0.10312763393789623, |
|
"learning_rate": 1.5257350728427862e-05, |
|
"loss": 0.3361, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.7072640278381908, |
|
"grad_norm": 0.10629651651459636, |
|
"learning_rate": 1.5104088217249132e-05, |
|
"loss": 0.3321, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.7142235754675945, |
|
"grad_norm": 0.09725532964037208, |
|
"learning_rate": 1.4951420005032828e-05, |
|
"loss": 0.3379, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.7211831230969987, |
|
"grad_norm": 0.11511675877236498, |
|
"learning_rate": 1.4799349736174891e-05, |
|
"loss": 0.3307, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.728142670726403, |
|
"grad_norm": 0.09824052462354282, |
|
"learning_rate": 1.4647881040797547e-05, |
|
"loss": 0.3273, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.7351022183558067, |
|
"grad_norm": 0.10324613912974012, |
|
"learning_rate": 1.4497017534662651e-05, |
|
"loss": 0.3344, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.742061765985211, |
|
"grad_norm": 0.10581522915369096, |
|
"learning_rate": 1.4346762819085424e-05, |
|
"loss": 0.3342, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.749021313614615, |
|
"grad_norm": 0.10101793752872175, |
|
"learning_rate": 1.4197120480848381e-05, |
|
"loss": 0.3348, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.7559808612440193, |
|
"grad_norm": 0.10390774039864187, |
|
"learning_rate": 1.4048094092115774e-05, |
|
"loss": 0.3301, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.762940408873423, |
|
"grad_norm": 0.10773321694134558, |
|
"learning_rate": 1.389968721034833e-05, |
|
"loss": 0.3353, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.7698999565028273, |
|
"grad_norm": 0.09221835790298968, |
|
"learning_rate": 1.3751903378218315e-05, |
|
"loss": 0.3329, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.7768595041322315, |
|
"grad_norm": 0.10820822350883358, |
|
"learning_rate": 1.3604746123524932e-05, |
|
"loss": 0.3278, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.7838190517616352, |
|
"grad_norm": 0.10470082245347596, |
|
"learning_rate": 1.3458218959110152e-05, |
|
"loss": 0.3371, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.7907785993910394, |
|
"grad_norm": 0.10207051909220886, |
|
"learning_rate": 1.3312325382774827e-05, |
|
"loss": 0.3371, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.7977381470204437, |
|
"grad_norm": 0.10101453592738402, |
|
"learning_rate": 1.3167068877195237e-05, |
|
"loss": 0.3265, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.804697694649848, |
|
"grad_norm": 0.09285174719990112, |
|
"learning_rate": 1.3022452909839918e-05, |
|
"loss": 0.3277, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.811657242279252, |
|
"grad_norm": 0.1059908590081114, |
|
"learning_rate": 1.2878480932886874e-05, |
|
"loss": 0.334, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.818616789908656, |
|
"grad_norm": 0.10635953000158242, |
|
"learning_rate": 1.2735156383141187e-05, |
|
"loss": 0.3325, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.82557633753806, |
|
"grad_norm": 0.0985299908106167, |
|
"learning_rate": 1.2592482681953025e-05, |
|
"loss": 0.3317, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.8325358851674642, |
|
"grad_norm": 0.10588236607635328, |
|
"learning_rate": 1.2450463235135874e-05, |
|
"loss": 0.34, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.839495432796868, |
|
"grad_norm": 0.10316866804543555, |
|
"learning_rate": 1.2309101432885302e-05, |
|
"loss": 0.3347, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.846454980426272, |
|
"grad_norm": 0.09961037740811082, |
|
"learning_rate": 1.2168400649698039e-05, |
|
"loss": 0.3351, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.8534145280556764, |
|
"grad_norm": 0.09817204131281733, |
|
"learning_rate": 1.202836424429135e-05, |
|
"loss": 0.3365, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.8603740756850806, |
|
"grad_norm": 0.09481914220388948, |
|
"learning_rate": 1.1888995559522974e-05, |
|
"loss": 0.3292, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.867333623314485, |
|
"grad_norm": 0.10082172771379438, |
|
"learning_rate": 1.1750297922311193e-05, |
|
"loss": 0.3335, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.8742931709438886, |
|
"grad_norm": 0.08986399832050056, |
|
"learning_rate": 1.1612274643555504e-05, |
|
"loss": 0.3284, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.881252718573293, |
|
"grad_norm": 0.08986088415045988, |
|
"learning_rate": 1.1474929018057574e-05, |
|
"loss": 0.3345, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.8882122662026966, |
|
"grad_norm": 0.09403102777418895, |
|
"learning_rate": 1.1338264324442573e-05, |
|
"loss": 0.3315, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.8951718138321008, |
|
"grad_norm": 0.09262927753397106, |
|
"learning_rate": 1.1202283825080884e-05, |
|
"loss": 0.3282, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.902131361461505, |
|
"grad_norm": 0.0934685977047065, |
|
"learning_rate": 1.1066990766010274e-05, |
|
"loss": 0.3337, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.909090909090909, |
|
"grad_norm": 0.09165216810312578, |
|
"learning_rate": 1.093238837685835e-05, |
|
"loss": 0.3318, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.9160504567203134, |
|
"grad_norm": 0.09756546339699468, |
|
"learning_rate": 1.0798479870765558e-05, |
|
"loss": 0.3282, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.923010004349717, |
|
"grad_norm": 0.09062420344108967, |
|
"learning_rate": 1.0665268444308366e-05, |
|
"loss": 0.3305, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.9299695519791213, |
|
"grad_norm": 0.09673675762215994, |
|
"learning_rate": 1.0532757277423019e-05, |
|
"loss": 0.3291, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.9369290996085256, |
|
"grad_norm": 0.0933424618907197, |
|
"learning_rate": 1.0400949533329653e-05, |
|
"loss": 0.3414, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.9438886472379293, |
|
"grad_norm": 0.08961372593224715, |
|
"learning_rate": 1.0269848358456743e-05, |
|
"loss": 0.3262, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.9508481948673335, |
|
"grad_norm": 0.1180626958636412, |
|
"learning_rate": 1.0139456882365981e-05, |
|
"loss": 0.3379, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.9578077424967377, |
|
"grad_norm": 0.08706924109298092, |
|
"learning_rate": 1.0009778217677617e-05, |
|
"loss": 0.3356, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.964767290126142, |
|
"grad_norm": 0.09192179644041273, |
|
"learning_rate": 9.880815459996102e-06, |
|
"loss": 0.3353, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.971726837755546, |
|
"grad_norm": 0.08744584106099203, |
|
"learning_rate": 9.752571687836267e-06, |
|
"loss": 0.3275, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.97868638538495, |
|
"grad_norm": 0.09171104289414916, |
|
"learning_rate": 9.625049962549768e-06, |
|
"loss": 0.3334, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.985645933014354, |
|
"grad_norm": 0.09202503387494353, |
|
"learning_rate": 9.498253328252023e-06, |
|
"loss": 0.3311, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.9926054806437583, |
|
"grad_norm": 0.09047931089059162, |
|
"learning_rate": 9.372184811749544e-06, |
|
"loss": 0.3316, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 4.003479773814702, |
|
"grad_norm": 0.11221100584378, |
|
"learning_rate": 9.246847422467718e-06, |
|
"loss": 0.3252, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 4.010439321444106, |
|
"grad_norm": 0.14511168880726213, |
|
"learning_rate": 9.122244152378919e-06, |
|
"loss": 0.3121, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 4.01739886907351, |
|
"grad_norm": 0.11527962568493744, |
|
"learning_rate": 8.998377975931096e-06, |
|
"loss": 0.3038, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.024358416702914, |
|
"grad_norm": 0.10476073099305397, |
|
"learning_rate": 8.875251849976823e-06, |
|
"loss": 0.3086, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 4.0313179643323185, |
|
"grad_norm": 0.12889927756242409, |
|
"learning_rate": 8.752868713702617e-06, |
|
"loss": 0.3109, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 4.038277511961723, |
|
"grad_norm": 0.1290059820672822, |
|
"learning_rate": 8.63123148855888e-06, |
|
"loss": 0.3054, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 4.045237059591127, |
|
"grad_norm": 0.12434412735412093, |
|
"learning_rate": 8.510343078190075e-06, |
|
"loss": 0.3147, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 4.052196607220531, |
|
"grad_norm": 0.10748206301685143, |
|
"learning_rate": 8.39020636836545e-06, |
|
"loss": 0.3075, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.059156154849934, |
|
"grad_norm": 0.10972499765580897, |
|
"learning_rate": 8.270824226910163e-06, |
|
"loss": 0.3078, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 4.066115702479339, |
|
"grad_norm": 0.11672965349240796, |
|
"learning_rate": 8.152199503636819e-06, |
|
"loss": 0.3108, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 4.073075250108743, |
|
"grad_norm": 0.11513790902472179, |
|
"learning_rate": 8.034335030277406e-06, |
|
"loss": 0.3034, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 4.080034797738147, |
|
"grad_norm": 0.10714140673258672, |
|
"learning_rate": 7.917233620415716e-06, |
|
"loss": 0.3101, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 4.086994345367551, |
|
"grad_norm": 0.10443849628188653, |
|
"learning_rate": 7.800898069420203e-06, |
|
"loss": 0.3119, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 4.0939538929969554, |
|
"grad_norm": 0.10270537031965189, |
|
"learning_rate": 7.685331154377254e-06, |
|
"loss": 0.3108, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 4.10091344062636, |
|
"grad_norm": 0.10002934787588738, |
|
"learning_rate": 7.570535634024847e-06, |
|
"loss": 0.3116, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 4.107872988255763, |
|
"grad_norm": 0.1020168199335099, |
|
"learning_rate": 7.456514248686737e-06, |
|
"loss": 0.313, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 4.114832535885167, |
|
"grad_norm": 0.09475662900974856, |
|
"learning_rate": 7.343269720207051e-06, |
|
"loss": 0.3187, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 4.121792083514571, |
|
"grad_norm": 0.09367174029086878, |
|
"learning_rate": 7.2308047518852895e-06, |
|
"loss": 0.3054, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.128751631143976, |
|
"grad_norm": 0.09991070813606238, |
|
"learning_rate": 7.119122028411798e-06, |
|
"loss": 0.3094, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 4.13571117877338, |
|
"grad_norm": 0.0971805873557556, |
|
"learning_rate": 7.008224215803672e-06, |
|
"loss": 0.3149, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 4.142670726402784, |
|
"grad_norm": 0.0921029731957292, |
|
"learning_rate": 6.898113961341128e-06, |
|
"loss": 0.3101, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 4.149630274032188, |
|
"grad_norm": 0.09523088486551998, |
|
"learning_rate": 6.788793893504335e-06, |
|
"loss": 0.3052, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 4.156589821661592, |
|
"grad_norm": 0.09591717117697106, |
|
"learning_rate": 6.680266621910632e-06, |
|
"loss": 0.3096, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 4.163549369290996, |
|
"grad_norm": 0.09084622178996993, |
|
"learning_rate": 6.5725347372522204e-06, |
|
"loss": 0.3137, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 4.1705089169204, |
|
"grad_norm": 0.08687253646904729, |
|
"learning_rate": 6.465600811234356e-06, |
|
"loss": 0.3108, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 4.177468464549804, |
|
"grad_norm": 0.09181114682155883, |
|
"learning_rate": 6.3594673965139675e-06, |
|
"loss": 0.3079, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 4.184428012179208, |
|
"grad_norm": 0.09294790321001072, |
|
"learning_rate": 6.254137026638676e-06, |
|
"loss": 0.3063, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 4.1913875598086126, |
|
"grad_norm": 0.08710789978827478, |
|
"learning_rate": 6.149612215986334e-06, |
|
"loss": 0.3067, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.198347107438017, |
|
"grad_norm": 0.08713529476536094, |
|
"learning_rate": 6.045895459705042e-06, |
|
"loss": 0.3106, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 4.205306655067421, |
|
"grad_norm": 0.09233253347150959, |
|
"learning_rate": 5.94298923365352e-06, |
|
"loss": 0.3075, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 4.212266202696824, |
|
"grad_norm": 0.08771080578385054, |
|
"learning_rate": 5.840895994342068e-06, |
|
"loss": 0.3115, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 4.2192257503262285, |
|
"grad_norm": 0.08701575520285076, |
|
"learning_rate": 5.7396181788738735e-06, |
|
"loss": 0.3115, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 4.226185297955633, |
|
"grad_norm": 0.08851880459624953, |
|
"learning_rate": 5.639158204886861e-06, |
|
"loss": 0.3135, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.233144845585037, |
|
"grad_norm": 0.09269071200856169, |
|
"learning_rate": 5.539518470495991e-06, |
|
"loss": 0.3122, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 4.240104393214441, |
|
"grad_norm": 0.08855364356761067, |
|
"learning_rate": 5.440701354235995e-06, |
|
"loss": 0.3064, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 4.247063940843845, |
|
"grad_norm": 0.08628014488530013, |
|
"learning_rate": 5.3427092150045975e-06, |
|
"loss": 0.3075, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 4.2540234884732495, |
|
"grad_norm": 0.09140056125384614, |
|
"learning_rate": 5.24554439200621e-06, |
|
"loss": 0.3094, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 4.260983036102654, |
|
"grad_norm": 0.09432054790072364, |
|
"learning_rate": 5.149209204696073e-06, |
|
"loss": 0.3129, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.267942583732057, |
|
"grad_norm": 0.08230546321550815, |
|
"learning_rate": 5.05370595272495e-06, |
|
"loss": 0.3129, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 4.274902131361461, |
|
"grad_norm": 0.08826471384141378, |
|
"learning_rate": 4.959036915884134e-06, |
|
"loss": 0.3176, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 4.2818616789908654, |
|
"grad_norm": 0.08473384022161361, |
|
"learning_rate": 4.865204354051129e-06, |
|
"loss": 0.3031, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 4.28882122662027, |
|
"grad_norm": 0.09157501033682776, |
|
"learning_rate": 4.7722105071356065e-06, |
|
"loss": 0.3083, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 4.295780774249674, |
|
"grad_norm": 0.08590767447738862, |
|
"learning_rate": 4.68005759502602e-06, |
|
"loss": 0.3089, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.302740321879078, |
|
"grad_norm": 0.0828024048843273, |
|
"learning_rate": 4.588747817536563e-06, |
|
"loss": 0.3157, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 4.309699869508482, |
|
"grad_norm": 0.08180623225652858, |
|
"learning_rate": 4.498283354354654e-06, |
|
"loss": 0.3049, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 4.3166594171378865, |
|
"grad_norm": 0.08341811378979368, |
|
"learning_rate": 4.408666364988938e-06, |
|
"loss": 0.3146, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 4.32361896476729, |
|
"grad_norm": 0.08072676966602048, |
|
"learning_rate": 4.31989898871771e-06, |
|
"loss": 0.3121, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 4.330578512396694, |
|
"grad_norm": 0.07811857676852015, |
|
"learning_rate": 4.231983344537875e-06, |
|
"loss": 0.3056, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.337538060026098, |
|
"grad_norm": 0.07963712695352229, |
|
"learning_rate": 4.144921531114317e-06, |
|
"loss": 0.3092, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 4.344497607655502, |
|
"grad_norm": 0.08106713806356201, |
|
"learning_rate": 4.058715626729837e-06, |
|
"loss": 0.3087, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 4.351457155284907, |
|
"grad_norm": 0.0825669290953596, |
|
"learning_rate": 3.973367689235548e-06, |
|
"loss": 0.3124, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 4.358416702914311, |
|
"grad_norm": 0.08346710243966363, |
|
"learning_rate": 3.888879756001726e-06, |
|
"loss": 0.3097, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 4.365376250543715, |
|
"grad_norm": 0.07952645915127651, |
|
"learning_rate": 3.805253843869179e-06, |
|
"loss": 0.3082, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.372335798173118, |
|
"grad_norm": 0.07932372481956639, |
|
"learning_rate": 3.72249194910113e-06, |
|
"loss": 0.3172, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 4.3792953458025226, |
|
"grad_norm": 0.07754167774238424, |
|
"learning_rate": 3.6405960473355183e-06, |
|
"loss": 0.3082, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 4.386254893431927, |
|
"grad_norm": 0.08136667422744384, |
|
"learning_rate": 3.5595680935378972e-06, |
|
"loss": 0.3098, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 4.393214441061331, |
|
"grad_norm": 0.08485744461035595, |
|
"learning_rate": 3.4794100219546967e-06, |
|
"loss": 0.3132, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 4.400173988690735, |
|
"grad_norm": 0.07736893733783809, |
|
"learning_rate": 3.400123746067099e-06, |
|
"loss": 0.3057, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.407133536320139, |
|
"grad_norm": 0.07691011304383329, |
|
"learning_rate": 3.321711158545351e-06, |
|
"loss": 0.3092, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 4.414093083949544, |
|
"grad_norm": 0.07876827881942446, |
|
"learning_rate": 3.2441741312036014e-06, |
|
"loss": 0.309, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 4.421052631578947, |
|
"grad_norm": 0.08089462204978613, |
|
"learning_rate": 3.167514514955157e-06, |
|
"loss": 0.3105, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 4.428012179208351, |
|
"grad_norm": 0.07721348081474763, |
|
"learning_rate": 3.0917341397683633e-06, |
|
"loss": 0.3071, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 4.434971726837755, |
|
"grad_norm": 0.07932348813099326, |
|
"learning_rate": 3.0168348146228842e-06, |
|
"loss": 0.3099, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.4419312744671595, |
|
"grad_norm": 0.08123974569538568, |
|
"learning_rate": 2.942818327466559e-06, |
|
"loss": 0.3102, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 4.448890822096564, |
|
"grad_norm": 0.0805284681969405, |
|
"learning_rate": 2.8696864451726614e-06, |
|
"loss": 0.3167, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 4.455850369725968, |
|
"grad_norm": 0.08047835245862313, |
|
"learning_rate": 2.79744091349778e-06, |
|
"loss": 0.3076, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 4.462809917355372, |
|
"grad_norm": 0.08129909351756673, |
|
"learning_rate": 2.7260834570400986e-06, |
|
"loss": 0.3124, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 4.469769464984776, |
|
"grad_norm": 0.07839675778245354, |
|
"learning_rate": 2.6556157791982707e-06, |
|
"loss": 0.3079, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.4767290126141805, |
|
"grad_norm": 0.08062781084505634, |
|
"learning_rate": 2.586039562130722e-06, |
|
"loss": 0.3047, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 4.483688560243584, |
|
"grad_norm": 0.0789139952040182, |
|
"learning_rate": 2.5173564667155015e-06, |
|
"loss": 0.3117, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 4.490648107872988, |
|
"grad_norm": 0.07782883382645976, |
|
"learning_rate": 2.4495681325106535e-06, |
|
"loss": 0.3086, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 4.497607655502392, |
|
"grad_norm": 0.07695405845578089, |
|
"learning_rate": 2.3826761777150643e-06, |
|
"loss": 0.3075, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 4.5045672031317965, |
|
"grad_norm": 0.0776536074585525, |
|
"learning_rate": 2.3166821991298384e-06, |
|
"loss": 0.3116, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.511526750761201, |
|
"grad_norm": 0.07992836130626074, |
|
"learning_rate": 2.2515877721201697e-06, |
|
"loss": 0.313, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 4.518486298390605, |
|
"grad_norm": 0.07576525240054546, |
|
"learning_rate": 2.1873944505777447e-06, |
|
"loss": 0.3097, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 4.525445846020009, |
|
"grad_norm": 0.08056446914044152, |
|
"learning_rate": 2.124103766883661e-06, |
|
"loss": 0.3093, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 4.532405393649412, |
|
"grad_norm": 0.07912041219537566, |
|
"learning_rate": 2.0617172318718205e-06, |
|
"loss": 0.3109, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 4.539364941278817, |
|
"grad_norm": 0.0748575568058156, |
|
"learning_rate": 2.000236334792871e-06, |
|
"loss": 0.306, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.546324488908221, |
|
"grad_norm": 0.07495506939333019, |
|
"learning_rate": 1.9396625432786866e-06, |
|
"loss": 0.308, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 4.553284036537625, |
|
"grad_norm": 0.07516098899508962, |
|
"learning_rate": 1.879997303307297e-06, |
|
"loss": 0.3132, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 4.560243584167029, |
|
"grad_norm": 0.0760900687853293, |
|
"learning_rate": 1.8212420391683761e-06, |
|
"loss": 0.312, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 4.5672031317964334, |
|
"grad_norm": 0.07405500093323297, |
|
"learning_rate": 1.7633981534292565e-06, |
|
"loss": 0.3101, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 4.574162679425838, |
|
"grad_norm": 0.07510600135021905, |
|
"learning_rate": 1.7064670269014306e-06, |
|
"loss": 0.3065, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.581122227055241, |
|
"grad_norm": 0.07279725597347841, |
|
"learning_rate": 1.65045001860761e-06, |
|
"loss": 0.3083, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 4.588081774684645, |
|
"grad_norm": 0.07713401251486195, |
|
"learning_rate": 1.5953484657492734e-06, |
|
"loss": 0.3129, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 4.595041322314049, |
|
"grad_norm": 0.07573166128441056, |
|
"learning_rate": 1.5411636836747357e-06, |
|
"loss": 0.3111, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 4.602000869943454, |
|
"grad_norm": 0.07751898750624993, |
|
"learning_rate": 1.4878969658477505e-06, |
|
"loss": 0.3151, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 4.608960417572858, |
|
"grad_norm": 0.07530355421456504, |
|
"learning_rate": 1.435549583816669e-06, |
|
"loss": 0.3081, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.615919965202262, |
|
"grad_norm": 0.07642926242619338, |
|
"learning_rate": 1.3841227871840278e-06, |
|
"loss": 0.3133, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 4.622879512831666, |
|
"grad_norm": 0.07705536264479143, |
|
"learning_rate": 1.3336178035767612e-06, |
|
"loss": 0.3094, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 4.62983906046107, |
|
"grad_norm": 0.07420116640392391, |
|
"learning_rate": 1.2840358386168972e-06, |
|
"loss": 0.3038, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.636798608090475, |
|
"grad_norm": 0.07614616846884953, |
|
"learning_rate": 1.2353780758927347e-06, |
|
"loss": 0.311, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 4.643758155719878, |
|
"grad_norm": 0.08312089631794453, |
|
"learning_rate": 1.1876456769306554e-06, |
|
"loss": 0.3124, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.650717703349282, |
|
"grad_norm": 0.07487938485965528, |
|
"learning_rate": 1.1408397811673376e-06, |
|
"loss": 0.3105, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 4.657677250978686, |
|
"grad_norm": 0.07301485090244404, |
|
"learning_rate": 1.0949615059225871e-06, |
|
"loss": 0.3039, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 4.6646367986080906, |
|
"grad_norm": 0.07618631980688859, |
|
"learning_rate": 1.0500119463726467e-06, |
|
"loss": 0.3147, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 4.671596346237495, |
|
"grad_norm": 0.07505804558510307, |
|
"learning_rate": 1.0059921755240797e-06, |
|
"loss": 0.3114, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 4.678555893866899, |
|
"grad_norm": 0.07677226952718405, |
|
"learning_rate": 9.62903244188147e-07, |
|
"loss": 0.312, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.685515441496303, |
|
"grad_norm": 0.07748382882066837, |
|
"learning_rate": 9.207461809556872e-07, |
|
"loss": 0.3115, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 4.6924749891257065, |
|
"grad_norm": 0.07605829792945587, |
|
"learning_rate": 8.795219921726139e-07, |
|
"loss": 0.3122, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 4.699434536755111, |
|
"grad_norm": 0.07387581425092563, |
|
"learning_rate": 8.392316619158669e-07, |
|
"loss": 0.3074, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 4.706394084384515, |
|
"grad_norm": 0.07420555319544671, |
|
"learning_rate": 7.998761519699205e-07, |
|
"loss": 0.3107, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 4.713353632013919, |
|
"grad_norm": 0.07466571827662617, |
|
"learning_rate": 7.61456401803824e-07, |
|
"loss": 0.314, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.720313179643323, |
|
"grad_norm": 0.0731954788870046, |
|
"learning_rate": 7.239733285487882e-07, |
|
"loss": 0.3053, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 0.07374103750191568, |
|
"learning_rate": 6.874278269762924e-07, |
|
"loss": 0.3098, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 4.734232274902132, |
|
"grad_norm": 0.07181151311493585, |
|
"learning_rate": 6.518207694766965e-07, |
|
"loss": 0.3111, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 4.741191822531535, |
|
"grad_norm": 0.0731098529031706, |
|
"learning_rate": 6.171530060384445e-07, |
|
"loss": 0.3057, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 4.748151370160939, |
|
"grad_norm": 0.07345303292289396, |
|
"learning_rate": 5.834253642277655e-07, |
|
"loss": 0.3085, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.7551109177903434, |
|
"grad_norm": 0.07333048475918498, |
|
"learning_rate": 5.506386491689197e-07, |
|
"loss": 0.307, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 4.762070465419748, |
|
"grad_norm": 0.07552740911471753, |
|
"learning_rate": 5.187936435249796e-07, |
|
"loss": 0.3086, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 4.769030013049152, |
|
"grad_norm": 0.07273500457357926, |
|
"learning_rate": 4.878911074791371e-07, |
|
"loss": 0.3121, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 4.775989560678556, |
|
"grad_norm": 0.0738654801444934, |
|
"learning_rate": 4.57931778716576e-07, |
|
"loss": 0.3062, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 4.78294910830796, |
|
"grad_norm": 0.07194970207986003, |
|
"learning_rate": 4.2891637240684234e-07, |
|
"loss": 0.3119, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.789908655937364, |
|
"grad_norm": 0.0747422813518781, |
|
"learning_rate": 4.0084558118678173e-07, |
|
"loss": 0.3081, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 4.796868203566768, |
|
"grad_norm": 0.07338451912151427, |
|
"learning_rate": 3.7372007514401063e-07, |
|
"loss": 0.3114, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 4.803827751196172, |
|
"grad_norm": 0.07345199197033869, |
|
"learning_rate": 3.4754050180090704e-07, |
|
"loss": 0.3016, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 4.810787298825576, |
|
"grad_norm": 0.07227607987729133, |
|
"learning_rate": 3.223074860991693e-07, |
|
"loss": 0.3113, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 4.81774684645498, |
|
"grad_norm": 0.07278601000745541, |
|
"learning_rate": 2.980216303848815e-07, |
|
"loss": 0.3082, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.824706394084385, |
|
"grad_norm": 0.07611641795774764, |
|
"learning_rate": 2.746835143941473e-07, |
|
"loss": 0.3076, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 4.831665941713789, |
|
"grad_norm": 0.07190187291197621, |
|
"learning_rate": 2.5229369523923853e-07, |
|
"loss": 0.3079, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 4.838625489343193, |
|
"grad_norm": 0.0739457194295076, |
|
"learning_rate": 2.3085270739531706e-07, |
|
"loss": 0.3095, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 4.845585036972597, |
|
"grad_norm": 0.07353977111327974, |
|
"learning_rate": 2.1036106268765398e-07, |
|
"loss": 0.3087, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 4.8525445846020006, |
|
"grad_norm": 0.07274021557955242, |
|
"learning_rate": 1.908192502794215e-07, |
|
"loss": 0.3104, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.859504132231405, |
|
"grad_norm": 0.07265017310099392, |
|
"learning_rate": 1.7222773666001336e-07, |
|
"loss": 0.3113, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 4.866463679860809, |
|
"grad_norm": 0.0713230748017762, |
|
"learning_rate": 1.545869656339072e-07, |
|
"loss": 0.3086, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 4.873423227490213, |
|
"grad_norm": 0.07442042787119083, |
|
"learning_rate": 1.3789735831009064e-07, |
|
"loss": 0.3124, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 4.880382775119617, |
|
"grad_norm": 0.0737173934874928, |
|
"learning_rate": 1.2215931309197626e-07, |
|
"loss": 0.3092, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 4.887342322749022, |
|
"grad_norm": 0.07197265372516874, |
|
"learning_rate": 1.0737320566790221e-07, |
|
"loss": 0.311, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.894301870378426, |
|
"grad_norm": 0.07270890000366599, |
|
"learning_rate": 9.35393890021885e-08, |
|
"loss": 0.3135, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 4.901261418007829, |
|
"grad_norm": 0.07164493485234402, |
|
"learning_rate": 8.065819332667702e-08, |
|
"loss": 0.3063, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 4.908220965637233, |
|
"grad_norm": 0.07345553241814261, |
|
"learning_rate": 6.872992613286223e-08, |
|
"loss": 0.311, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 4.9151805132666375, |
|
"grad_norm": 0.0738265738394693, |
|
"learning_rate": 5.775487216456377e-08, |
|
"loss": 0.3123, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 4.922140060896042, |
|
"grad_norm": 0.07288034052473, |
|
"learning_rate": 4.7733293411105216e-08, |
|
"loss": 0.3155, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.929099608525446, |
|
"grad_norm": 0.0763072402074743, |
|
"learning_rate": 3.8665429101070185e-08, |
|
"loss": 0.3044, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 4.93605915615485, |
|
"grad_norm": 0.07435037595475792, |
|
"learning_rate": 3.055149569660909e-08, |
|
"loss": 0.3121, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 4.943018703784254, |
|
"grad_norm": 0.07118427621968816, |
|
"learning_rate": 2.3391686888238894e-08, |
|
"loss": 0.3123, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 4.949978251413658, |
|
"grad_norm": 0.07221686597134709, |
|
"learning_rate": 1.7186173590251208e-08, |
|
"loss": 0.3037, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 4.956937799043062, |
|
"grad_norm": 0.07211936196974679, |
|
"learning_rate": 1.1935103936600023e-08, |
|
"loss": 0.3109, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.963897346672466, |
|
"grad_norm": 0.07300879682236437, |
|
"learning_rate": 7.63860327740229e-09, |
|
"loss": 0.315, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 4.97085689430187, |
|
"grad_norm": 0.0746678008435265, |
|
"learning_rate": 4.296774175918117e-09, |
|
"loss": 0.3114, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 4.9778164419312745, |
|
"grad_norm": 0.07379213535477022, |
|
"learning_rate": 1.909696406103834e-09, |
|
"loss": 0.3132, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 4.984775989560679, |
|
"grad_norm": 0.072449627565001, |
|
"learning_rate": 4.77426950733495e-10, |
|
"loss": 0.3053, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 4.991735537190083, |
|
"grad_norm": 0.07321298642770968, |
|
"learning_rate": 0.0, |
|
"loss": 0.3097, |
|
"step": 715 |
|
}, |
|
    {
      "epoch": 4.991735537190083,
      "step": 715,
      "total_flos": 1.839907874248065e+19,
      "train_loss": 0.3850101018285418,
      "train_runtime": 71630.1527,
      "train_samples_per_second": 5.133,
      "train_steps_per_second": 0.01
    }
  ],
|
"logging_steps": 1, |
|
"max_steps": 715, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.839907874248065e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|