|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.966974900924702, |
|
"eval_steps": 500, |
|
"global_step": 470, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010568031704095112, |
|
"grad_norm": 7.083772151723263, |
|
"learning_rate": 1.7021276595744682e-06, |
|
"loss": 1.1953, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.021136063408190225, |
|
"grad_norm": 7.131321562784017, |
|
"learning_rate": 3.4042553191489363e-06, |
|
"loss": 1.1964, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.031704095112285335, |
|
"grad_norm": 7.0823963229107525, |
|
"learning_rate": 5.106382978723404e-06, |
|
"loss": 1.2092, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04227212681638045, |
|
"grad_norm": 5.141917283385601, |
|
"learning_rate": 6.808510638297873e-06, |
|
"loss": 1.1295, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.052840158520475564, |
|
"grad_norm": 2.56300415286007, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 1.077, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06340819022457067, |
|
"grad_norm": 4.615211256351488, |
|
"learning_rate": 1.0212765957446808e-05, |
|
"loss": 1.0452, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07397622192866579, |
|
"grad_norm": 4.783764272721008, |
|
"learning_rate": 1.1914893617021277e-05, |
|
"loss": 1.0356, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0845442536327609, |
|
"grad_norm": 3.8154182115223816, |
|
"learning_rate": 1.3617021276595745e-05, |
|
"loss": 0.9744, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.095112285336856, |
|
"grad_norm": 3.8637000758137723, |
|
"learning_rate": 1.5319148936170214e-05, |
|
"loss": 0.9714, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.10568031704095113, |
|
"grad_norm": 2.366476473063925, |
|
"learning_rate": 1.7021276595744682e-05, |
|
"loss": 0.9574, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11624834874504623, |
|
"grad_norm": 2.994957368837667, |
|
"learning_rate": 1.872340425531915e-05, |
|
"loss": 0.9216, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.12681638044914134, |
|
"grad_norm": 2.6457297468520014, |
|
"learning_rate": 2.0425531914893616e-05, |
|
"loss": 0.9039, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.13738441215323646, |
|
"grad_norm": 1.6507084509279148, |
|
"learning_rate": 2.2127659574468088e-05, |
|
"loss": 0.8965, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.14795244385733158, |
|
"grad_norm": 1.726225549814637, |
|
"learning_rate": 2.3829787234042553e-05, |
|
"loss": 0.8849, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.15852047556142668, |
|
"grad_norm": 1.6588209807553262, |
|
"learning_rate": 2.5531914893617025e-05, |
|
"loss": 0.871, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1690885072655218, |
|
"grad_norm": 1.2143860211983681, |
|
"learning_rate": 2.723404255319149e-05, |
|
"loss": 0.8617, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.17965653896961692, |
|
"grad_norm": 1.2868680178333174, |
|
"learning_rate": 2.8936170212765963e-05, |
|
"loss": 0.8567, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.190224570673712, |
|
"grad_norm": 1.2037257451837027, |
|
"learning_rate": 3.063829787234043e-05, |
|
"loss": 0.8434, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.20079260237780713, |
|
"grad_norm": 1.041214478985964, |
|
"learning_rate": 3.234042553191489e-05, |
|
"loss": 0.8371, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.21136063408190225, |
|
"grad_norm": 0.9037174605431649, |
|
"learning_rate": 3.4042553191489365e-05, |
|
"loss": 0.825, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.22192866578599735, |
|
"grad_norm": 1.3140952321772827, |
|
"learning_rate": 3.574468085106383e-05, |
|
"loss": 0.8314, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.23249669749009247, |
|
"grad_norm": 1.3446525845289001, |
|
"learning_rate": 3.74468085106383e-05, |
|
"loss": 0.8274, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2430647291941876, |
|
"grad_norm": 1.202878622232685, |
|
"learning_rate": 3.914893617021277e-05, |
|
"loss": 0.8126, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.2536327608982827, |
|
"grad_norm": 0.9767090732567484, |
|
"learning_rate": 4.085106382978723e-05, |
|
"loss": 0.806, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.26420079260237783, |
|
"grad_norm": 1.7139214140351846, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 0.8059, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2747688243064729, |
|
"grad_norm": 1.0610822836711675, |
|
"learning_rate": 4.4255319148936176e-05, |
|
"loss": 0.7991, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.285336856010568, |
|
"grad_norm": 0.9499105971028553, |
|
"learning_rate": 4.595744680851065e-05, |
|
"loss": 0.7972, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.29590488771466317, |
|
"grad_norm": 1.7217798288469996, |
|
"learning_rate": 4.765957446808511e-05, |
|
"loss": 0.7937, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.30647291941875826, |
|
"grad_norm": 1.0574506769778116, |
|
"learning_rate": 4.936170212765958e-05, |
|
"loss": 0.7897, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.31704095112285335, |
|
"grad_norm": 1.9202057568924744, |
|
"learning_rate": 5.106382978723405e-05, |
|
"loss": 0.7991, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3276089828269485, |
|
"grad_norm": 1.345605492975133, |
|
"learning_rate": 5.276595744680851e-05, |
|
"loss": 0.7846, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.3381770145310436, |
|
"grad_norm": 1.6956356026369743, |
|
"learning_rate": 5.446808510638298e-05, |
|
"loss": 0.791, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3487450462351387, |
|
"grad_norm": 1.5678728230382326, |
|
"learning_rate": 5.617021276595745e-05, |
|
"loss": 0.7865, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.35931307793923384, |
|
"grad_norm": 1.1508014915876, |
|
"learning_rate": 5.7872340425531925e-05, |
|
"loss": 0.7796, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.36988110964332893, |
|
"grad_norm": 1.5510984064838533, |
|
"learning_rate": 5.9574468085106384e-05, |
|
"loss": 0.7771, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.380449141347424, |
|
"grad_norm": 1.400660302660845, |
|
"learning_rate": 6.127659574468086e-05, |
|
"loss": 0.7693, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3910171730515192, |
|
"grad_norm": 1.390600461038752, |
|
"learning_rate": 6.297872340425533e-05, |
|
"loss": 0.7647, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.40158520475561427, |
|
"grad_norm": 1.3619693240026056, |
|
"learning_rate": 6.468085106382979e-05, |
|
"loss": 0.7704, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.41215323645970936, |
|
"grad_norm": 1.4593264335764384, |
|
"learning_rate": 6.638297872340426e-05, |
|
"loss": 0.7534, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4227212681638045, |
|
"grad_norm": 1.3716746153165715, |
|
"learning_rate": 6.808510638297873e-05, |
|
"loss": 0.7614, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4332892998678996, |
|
"grad_norm": 1.6638664908483813, |
|
"learning_rate": 6.97872340425532e-05, |
|
"loss": 0.7562, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.4438573315719947, |
|
"grad_norm": 1.1519315113645128, |
|
"learning_rate": 7.148936170212766e-05, |
|
"loss": 0.7493, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.45442536327608984, |
|
"grad_norm": 1.2801946128196509, |
|
"learning_rate": 7.319148936170213e-05, |
|
"loss": 0.7488, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.46499339498018494, |
|
"grad_norm": 63.9505110490317, |
|
"learning_rate": 7.48936170212766e-05, |
|
"loss": 0.7697, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.47556142668428003, |
|
"grad_norm": 2.589351765723454, |
|
"learning_rate": 7.659574468085108e-05, |
|
"loss": 0.7888, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4861294583883752, |
|
"grad_norm": 1.302706513183853, |
|
"learning_rate": 7.829787234042553e-05, |
|
"loss": 0.7518, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4966974900924703, |
|
"grad_norm": 1.968548255582277, |
|
"learning_rate": 8e-05, |
|
"loss": 0.771, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.5072655217965654, |
|
"grad_norm": 1.4317196073460219, |
|
"learning_rate": 7.999889681839899e-05, |
|
"loss": 0.7447, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5178335535006605, |
|
"grad_norm": 1.4264803539125535, |
|
"learning_rate": 7.999558733444641e-05, |
|
"loss": 0.7495, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.5284015852047557, |
|
"grad_norm": 1.8250063568248835, |
|
"learning_rate": 7.999007173069037e-05, |
|
"loss": 0.7507, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5389696169088507, |
|
"grad_norm": 32.42595517441195, |
|
"learning_rate": 7.998235031136648e-05, |
|
"loss": 0.8633, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5495376486129459, |
|
"grad_norm": 2.457580825395341, |
|
"learning_rate": 7.997242350238117e-05, |
|
"loss": 0.7888, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.560105680317041, |
|
"grad_norm": 1.073949064797192, |
|
"learning_rate": 7.996029185128804e-05, |
|
"loss": 0.7334, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.570673712021136, |
|
"grad_norm": 1.7771625101062152, |
|
"learning_rate": 7.994595602725781e-05, |
|
"loss": 0.7529, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5812417437252312, |
|
"grad_norm": 1.461447259519702, |
|
"learning_rate": 7.992941682104139e-05, |
|
"loss": 0.7563, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5918097754293263, |
|
"grad_norm": 1.3789190576230315, |
|
"learning_rate": 7.991067514492614e-05, |
|
"loss": 0.7376, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.6023778071334214, |
|
"grad_norm": 1.0632278085259756, |
|
"learning_rate": 7.988973203268567e-05, |
|
"loss": 0.7418, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6129458388375165, |
|
"grad_norm": 1.3753612784385743, |
|
"learning_rate": 7.986658863952281e-05, |
|
"loss": 0.7302, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6235138705416117, |
|
"grad_norm": 1.3066179113984184, |
|
"learning_rate": 7.984124624200583e-05, |
|
"loss": 0.7397, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.6340819022457067, |
|
"grad_norm": 1.1484727063789566, |
|
"learning_rate": 7.981370623799803e-05, |
|
"loss": 0.7255, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6446499339498019, |
|
"grad_norm": 0.992784588525566, |
|
"learning_rate": 7.978397014658075e-05, |
|
"loss": 0.7272, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.655217965653897, |
|
"grad_norm": 0.9709816709205683, |
|
"learning_rate": 7.97520396079694e-05, |
|
"loss": 0.7244, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.665785997357992, |
|
"grad_norm": 1.410223766064041, |
|
"learning_rate": 7.971791638342313e-05, |
|
"loss": 0.7277, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6763540290620872, |
|
"grad_norm": 0.9859003082910416, |
|
"learning_rate": 7.96816023551476e-05, |
|
"loss": 0.7149, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6869220607661823, |
|
"grad_norm": 1.0627873899695728, |
|
"learning_rate": 7.964309952619121e-05, |
|
"loss": 0.7076, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6974900924702774, |
|
"grad_norm": 1.2987953031941966, |
|
"learning_rate": 7.96024100203346e-05, |
|
"loss": 0.7148, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7080581241743725, |
|
"grad_norm": 0.9847502480003636, |
|
"learning_rate": 7.955953608197345e-05, |
|
"loss": 0.7192, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.7186261558784677, |
|
"grad_norm": 1.2534186109561034, |
|
"learning_rate": 7.951448007599478e-05, |
|
"loss": 0.7187, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7291941875825627, |
|
"grad_norm": 1.1180727067911638, |
|
"learning_rate": 7.946724448764644e-05, |
|
"loss": 0.7093, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7397622192866579, |
|
"grad_norm": 1.3232754557732016, |
|
"learning_rate": 7.94178319224e-05, |
|
"loss": 0.7124, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.750330250990753, |
|
"grad_norm": 0.9127404976454525, |
|
"learning_rate": 7.936624510580712e-05, |
|
"loss": 0.7075, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.760898282694848, |
|
"grad_norm": 0.769180480882775, |
|
"learning_rate": 7.931248688334915e-05, |
|
"loss": 0.6912, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7714663143989432, |
|
"grad_norm": 0.8883585292926546, |
|
"learning_rate": 7.925656022028017e-05, |
|
"loss": 0.6987, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.7820343461030383, |
|
"grad_norm": 0.963883950564473, |
|
"learning_rate": 7.919846820146348e-05, |
|
"loss": 0.704, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7926023778071334, |
|
"grad_norm": 0.8863497475275227, |
|
"learning_rate": 7.913821403120139e-05, |
|
"loss": 0.6945, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8031704095112285, |
|
"grad_norm": 1.0426799292571676, |
|
"learning_rate": 7.90758010330585e-05, |
|
"loss": 0.6849, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8137384412153237, |
|
"grad_norm": 0.8243890076961332, |
|
"learning_rate": 7.901123264967836e-05, |
|
"loss": 0.6979, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.8243064729194187, |
|
"grad_norm": 1.1652167740266006, |
|
"learning_rate": 7.894451244259363e-05, |
|
"loss": 0.6905, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8348745046235139, |
|
"grad_norm": 1.2465660396622233, |
|
"learning_rate": 7.887564409202953e-05, |
|
"loss": 0.6872, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.845442536327609, |
|
"grad_norm": 0.5355690010674495, |
|
"learning_rate": 7.880463139670091e-05, |
|
"loss": 0.6848, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8560105680317041, |
|
"grad_norm": 1.419104255807733, |
|
"learning_rate": 7.873147827360273e-05, |
|
"loss": 0.6878, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8665785997357992, |
|
"grad_norm": 0.6749893210811997, |
|
"learning_rate": 7.865618875779398e-05, |
|
"loss": 0.678, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8771466314398944, |
|
"grad_norm": 0.7732270985786504, |
|
"learning_rate": 7.857876700217508e-05, |
|
"loss": 0.6848, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.8877146631439894, |
|
"grad_norm": 0.6755156642812311, |
|
"learning_rate": 7.849921727725882e-05, |
|
"loss": 0.679, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8982826948480845, |
|
"grad_norm": 0.6421189276197394, |
|
"learning_rate": 7.841754397093487e-05, |
|
"loss": 0.6728, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.9088507265521797, |
|
"grad_norm": 0.49153205280258144, |
|
"learning_rate": 7.833375158822766e-05, |
|
"loss": 0.6827, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.9194187582562747, |
|
"grad_norm": 0.7083774145182355, |
|
"learning_rate": 7.824784475104795e-05, |
|
"loss": 0.6813, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9299867899603699, |
|
"grad_norm": 1.1312836778877742, |
|
"learning_rate": 7.815982819793784e-05, |
|
"loss": 0.6776, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.940554821664465, |
|
"grad_norm": 1.4566456596110349, |
|
"learning_rate": 7.806970678380943e-05, |
|
"loss": 0.6828, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9511228533685601, |
|
"grad_norm": 0.641648090796948, |
|
"learning_rate": 7.797748547967701e-05, |
|
"loss": 0.6689, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9616908850726552, |
|
"grad_norm": 1.2689027097316996, |
|
"learning_rate": 7.788316937238287e-05, |
|
"loss": 0.6887, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.9722589167767504, |
|
"grad_norm": 1.0288877116771369, |
|
"learning_rate": 7.778676366431676e-05, |
|
"loss": 0.6731, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9828269484808454, |
|
"grad_norm": 1.111134713608855, |
|
"learning_rate": 7.768827367312882e-05, |
|
"loss": 0.6772, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9933949801849405, |
|
"grad_norm": 0.9415602963246132, |
|
"learning_rate": 7.758770483143634e-05, |
|
"loss": 0.6764, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.0039630118890357, |
|
"grad_norm": 0.766369432032264, |
|
"learning_rate": 7.748506268652415e-05, |
|
"loss": 0.6581, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.0145310435931307, |
|
"grad_norm": 0.6235125904372976, |
|
"learning_rate": 7.73803529000385e-05, |
|
"loss": 0.639, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.025099075297226, |
|
"grad_norm": 0.5687108049822664, |
|
"learning_rate": 7.727358124767491e-05, |
|
"loss": 0.6307, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 1.035667107001321, |
|
"grad_norm": 0.7768949803155286, |
|
"learning_rate": 7.71647536188595e-05, |
|
"loss": 0.6287, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.046235138705416, |
|
"grad_norm": 1.0822134265138843, |
|
"learning_rate": 7.705387601642416e-05, |
|
"loss": 0.6316, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0568031704095113, |
|
"grad_norm": 0.6600814825591903, |
|
"learning_rate": 7.694095455627542e-05, |
|
"loss": 0.6209, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0673712021136064, |
|
"grad_norm": 0.5092321091554813, |
|
"learning_rate": 7.682599546705716e-05, |
|
"loss": 0.6268, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 1.0779392338177014, |
|
"grad_norm": 0.5538152767155732, |
|
"learning_rate": 7.670900508980697e-05, |
|
"loss": 0.629, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.0885072655217967, |
|
"grad_norm": 0.7759780022913368, |
|
"learning_rate": 7.658998987760645e-05, |
|
"loss": 0.6266, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 1.0990752972258917, |
|
"grad_norm": 1.0071580985859072, |
|
"learning_rate": 7.646895639522518e-05, |
|
"loss": 0.634, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.1096433289299867, |
|
"grad_norm": 0.9913392623454966, |
|
"learning_rate": 7.634591131875875e-05, |
|
"loss": 0.6334, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.120211360634082, |
|
"grad_norm": 0.9887888884996264, |
|
"learning_rate": 7.622086143526036e-05, |
|
"loss": 0.6271, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.130779392338177, |
|
"grad_norm": 1.0219278471051096, |
|
"learning_rate": 7.609381364236655e-05, |
|
"loss": 0.6224, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.141347424042272, |
|
"grad_norm": 0.9226284872759164, |
|
"learning_rate": 7.59647749479167e-05, |
|
"loss": 0.62, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1519154557463673, |
|
"grad_norm": 0.9215854604914202, |
|
"learning_rate": 7.583375246956648e-05, |
|
"loss": 0.6295, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 1.1624834874504624, |
|
"grad_norm": 1.2986154846688018, |
|
"learning_rate": 7.570075343439526e-05, |
|
"loss": 0.6245, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1730515191545574, |
|
"grad_norm": 0.5049907163605463, |
|
"learning_rate": 7.556578517850747e-05, |
|
"loss": 0.6244, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.1836195508586527, |
|
"grad_norm": 0.8014907412493998, |
|
"learning_rate": 7.542885514662794e-05, |
|
"loss": 0.6233, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.1941875825627477, |
|
"grad_norm": 1.1622767091260562, |
|
"learning_rate": 7.528997089169128e-05, |
|
"loss": 0.6235, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.2047556142668427, |
|
"grad_norm": 0.7380452282238854, |
|
"learning_rate": 7.51491400744252e-05, |
|
"loss": 0.6228, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.215323645970938, |
|
"grad_norm": 0.7597600873008943, |
|
"learning_rate": 7.500637046292803e-05, |
|
"loss": 0.621, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.225891677675033, |
|
"grad_norm": 0.7234884283262764, |
|
"learning_rate": 7.48616699322402e-05, |
|
"loss": 0.6194, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.236459709379128, |
|
"grad_norm": 0.49171596139374824, |
|
"learning_rate": 7.471504646390987e-05, |
|
"loss": 0.6197, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.2470277410832233, |
|
"grad_norm": 0.6952021893115369, |
|
"learning_rate": 7.456650814555267e-05, |
|
"loss": 0.6185, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2575957727873184, |
|
"grad_norm": 0.7911960205209884, |
|
"learning_rate": 7.441606317040558e-05, |
|
"loss": 0.6204, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.2681638044914134, |
|
"grad_norm": 0.5604716814050085, |
|
"learning_rate": 7.426371983687503e-05, |
|
"loss": 0.6136, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2787318361955085, |
|
"grad_norm": 0.5391019590519042, |
|
"learning_rate": 7.410948654807916e-05, |
|
"loss": 0.6141, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.2892998678996037, |
|
"grad_norm": 0.4473178756510967, |
|
"learning_rate": 7.39533718113843e-05, |
|
"loss": 0.618, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.2998678996036988, |
|
"grad_norm": 0.3676324142572552, |
|
"learning_rate": 7.379538423793568e-05, |
|
"loss": 0.6181, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.310435931307794, |
|
"grad_norm": 0.49641800089735266, |
|
"learning_rate": 7.363553254218253e-05, |
|
"loss": 0.6162, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.321003963011889, |
|
"grad_norm": 0.5596386402088841, |
|
"learning_rate": 7.347382554139733e-05, |
|
"loss": 0.6137, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.331571994715984, |
|
"grad_norm": 0.37385072844495554, |
|
"learning_rate": 7.331027215518949e-05, |
|
"loss": 0.6047, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.3421400264200791, |
|
"grad_norm": 0.46801024402125285, |
|
"learning_rate": 7.31448814050133e-05, |
|
"loss": 0.6074, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.3527080581241744, |
|
"grad_norm": 0.38199501822744514, |
|
"learning_rate": 7.297766241367041e-05, |
|
"loss": 0.6081, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3632760898282694, |
|
"grad_norm": 0.2644636432614496, |
|
"learning_rate": 7.280862440480658e-05, |
|
"loss": 0.6083, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.3738441215323647, |
|
"grad_norm": 0.3642023310466234, |
|
"learning_rate": 7.263777670240282e-05, |
|
"loss": 0.6163, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.3844121532364597, |
|
"grad_norm": 0.3286254245460909, |
|
"learning_rate": 7.246512873026125e-05, |
|
"loss": 0.6105, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.3949801849405548, |
|
"grad_norm": 0.27645048682447476, |
|
"learning_rate": 7.229069001148518e-05, |
|
"loss": 0.6047, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.40554821664465, |
|
"grad_norm": 0.31935620714929114, |
|
"learning_rate": 7.211447016795388e-05, |
|
"loss": 0.6159, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.416116248348745, |
|
"grad_norm": 0.27637648695223127, |
|
"learning_rate": 7.193647891979177e-05, |
|
"loss": 0.6076, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.42668428005284, |
|
"grad_norm": 0.2632383089776466, |
|
"learning_rate": 7.17567260848324e-05, |
|
"loss": 0.6134, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.4372523117569354, |
|
"grad_norm": 0.36576847146483216, |
|
"learning_rate": 7.157522157807675e-05, |
|
"loss": 0.6097, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.4478203434610304, |
|
"grad_norm": 0.49829210057276635, |
|
"learning_rate": 7.139197541114645e-05, |
|
"loss": 0.6076, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.4583883751651254, |
|
"grad_norm": 0.6715032042462543, |
|
"learning_rate": 7.120699769173149e-05, |
|
"loss": 0.6079, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4689564068692207, |
|
"grad_norm": 0.8872767611634563, |
|
"learning_rate": 7.10202986230327e-05, |
|
"loss": 0.6106, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.4795244385733157, |
|
"grad_norm": 1.3094337587734477, |
|
"learning_rate": 7.083188850319895e-05, |
|
"loss": 0.6249, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.4900924702774108, |
|
"grad_norm": 0.5732521894924236, |
|
"learning_rate": 7.064177772475912e-05, |
|
"loss": 0.6184, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.500660501981506, |
|
"grad_norm": 0.5968155749547341, |
|
"learning_rate": 7.044997677404888e-05, |
|
"loss": 0.6132, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.511228533685601, |
|
"grad_norm": 1.2108186694176202, |
|
"learning_rate": 7.025649623063223e-05, |
|
"loss": 0.6261, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.521796565389696, |
|
"grad_norm": 0.7492638271367434, |
|
"learning_rate": 7.006134676671791e-05, |
|
"loss": 0.6097, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5323645970937911, |
|
"grad_norm": 0.520895839877394, |
|
"learning_rate": 6.986453914657083e-05, |
|
"loss": 0.6097, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.5429326287978864, |
|
"grad_norm": 0.6782854061408391, |
|
"learning_rate": 6.96660842259183e-05, |
|
"loss": 0.6175, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.5535006605019817, |
|
"grad_norm": 0.5917212461488441, |
|
"learning_rate": 6.946599295135116e-05, |
|
"loss": 0.6142, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5640686922060767, |
|
"grad_norm": 0.5981371046939804, |
|
"learning_rate": 6.926427635972003e-05, |
|
"loss": 0.6083, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.5746367239101717, |
|
"grad_norm": 0.8111312440510982, |
|
"learning_rate": 6.906094557752654e-05, |
|
"loss": 0.6126, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.5852047556142668, |
|
"grad_norm": 0.9645422516897456, |
|
"learning_rate": 6.885601182030958e-05, |
|
"loss": 0.6143, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.5957727873183618, |
|
"grad_norm": 0.8593693771282125, |
|
"learning_rate": 6.864948639202667e-05, |
|
"loss": 0.6165, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.606340819022457, |
|
"grad_norm": 0.6782193407228015, |
|
"learning_rate": 6.844138068443043e-05, |
|
"loss": 0.617, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.6169088507265523, |
|
"grad_norm": 0.5133505641873833, |
|
"learning_rate": 6.823170617644029e-05, |
|
"loss": 0.6097, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.6274768824306474, |
|
"grad_norm": 0.3188527654318755, |
|
"learning_rate": 6.802047443350915e-05, |
|
"loss": 0.6019, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.6380449141347424, |
|
"grad_norm": 0.36046564912487544, |
|
"learning_rate": 6.78076971069857e-05, |
|
"loss": 0.603, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.6486129458388374, |
|
"grad_norm": 0.3434440344937455, |
|
"learning_rate": 6.759338593347148e-05, |
|
"loss": 0.614, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6591809775429325, |
|
"grad_norm": 0.4207528199987266, |
|
"learning_rate": 6.737755273417367e-05, |
|
"loss": 0.6057, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.6697490092470277, |
|
"grad_norm": 0.4376375758003335, |
|
"learning_rate": 6.716020941425302e-05, |
|
"loss": 0.6101, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.680317040951123, |
|
"grad_norm": 0.3670986836675996, |
|
"learning_rate": 6.694136796216706e-05, |
|
"loss": 0.6074, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.690885072655218, |
|
"grad_norm": 0.3716710535382814, |
|
"learning_rate": 6.672104044900901e-05, |
|
"loss": 0.607, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.701453104359313, |
|
"grad_norm": 0.29349567743537014, |
|
"learning_rate": 6.649923902784178e-05, |
|
"loss": 0.6049, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.7120211360634081, |
|
"grad_norm": 0.37774756737109916, |
|
"learning_rate": 6.627597593302772e-05, |
|
"loss": 0.6004, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.7225891677675031, |
|
"grad_norm": 0.34520927764325005, |
|
"learning_rate": 6.605126347955376e-05, |
|
"loss": 0.5987, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.7331571994715984, |
|
"grad_norm": 0.37358546648640356, |
|
"learning_rate": 6.58251140623521e-05, |
|
"loss": 0.6037, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.7437252311756937, |
|
"grad_norm": 0.3803331681549159, |
|
"learning_rate": 6.559754015561655e-05, |
|
"loss": 0.6057, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7542932628797887, |
|
"grad_norm": 0.419367642300496, |
|
"learning_rate": 6.536855431211445e-05, |
|
"loss": 0.6006, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.7648612945838837, |
|
"grad_norm": 0.3700144217552004, |
|
"learning_rate": 6.513816916249427e-05, |
|
"loss": 0.6029, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.7754293262879788, |
|
"grad_norm": 0.3304108780784769, |
|
"learning_rate": 6.490639741458891e-05, |
|
"loss": 0.609, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.7859973579920738, |
|
"grad_norm": 0.38491358241980217, |
|
"learning_rate": 6.46732518527148e-05, |
|
"loss": 0.5992, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.796565389696169, |
|
"grad_norm": 0.41792070367182416, |
|
"learning_rate": 6.443874533696662e-05, |
|
"loss": 0.5997, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8071334214002643, |
|
"grad_norm": 0.31857578884550497, |
|
"learning_rate": 6.420289080250804e-05, |
|
"loss": 0.5976, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.8177014531043594, |
|
"grad_norm": 0.2921197050452141, |
|
"learning_rate": 6.396570125885823e-05, |
|
"loss": 0.6028, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.8282694848084544, |
|
"grad_norm": 0.32956079744279115, |
|
"learning_rate": 6.372718978917421e-05, |
|
"loss": 0.5959, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.8388375165125495, |
|
"grad_norm": 0.3363640406873935, |
|
"learning_rate": 6.348736954952923e-05, |
|
"loss": 0.5979, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.8494055482166445, |
|
"grad_norm": 0.2827997180555132, |
|
"learning_rate": 6.324625376818707e-05, |
|
"loss": 0.5963, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.8599735799207398, |
|
"grad_norm": 0.24952707846951472, |
|
"learning_rate": 6.300385574487242e-05, |
|
"loss": 0.5962, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.870541611624835, |
|
"grad_norm": 0.30585781576809906, |
|
"learning_rate": 6.276018885003727e-05, |
|
"loss": 0.5979, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.88110964332893, |
|
"grad_norm": 0.35383483624278156, |
|
"learning_rate": 6.251526652412335e-05, |
|
"loss": 0.5975, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.891677675033025, |
|
"grad_norm": 0.3057303385681727, |
|
"learning_rate": 6.226910227682087e-05, |
|
"loss": 0.5974, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.9022457067371201, |
|
"grad_norm": 0.3227218227112165, |
|
"learning_rate": 6.202170968632324e-05, |
|
"loss": 0.5984, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9128137384412152, |
|
"grad_norm": 0.3099468501510001, |
|
"learning_rate": 6.177310239857815e-05, |
|
"loss": 0.6006, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.9233817701453104, |
|
"grad_norm": 0.3029754372937815, |
|
"learning_rate": 6.152329412653491e-05, |
|
"loss": 0.5937, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.9339498018494057, |
|
"grad_norm": 0.31003590660857305, |
|
"learning_rate": 6.127229864938798e-05, |
|
"loss": 0.6031, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.9445178335535007, |
|
"grad_norm": 0.4090438325379304, |
|
"learning_rate": 6.1020129811816985e-05, |
|
"loss": 0.5995, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.9550858652575958, |
|
"grad_norm": 0.47046399479064277, |
|
"learning_rate": 6.076680152322302e-05, |
|
"loss": 0.597, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.9656538969616908, |
|
"grad_norm": 0.4389393519962548, |
|
"learning_rate": 6.051232775696143e-05, |
|
"loss": 0.6003, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.9762219286657858, |
|
"grad_norm": 0.3946851961823208, |
|
"learning_rate": 6.025672254957106e-05, |
|
"loss": 0.5961, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.986789960369881, |
|
"grad_norm": 0.3581648113750019, |
|
"learning_rate": 6.000000000000001e-05, |
|
"loss": 0.6014, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.9973579920739764, |
|
"grad_norm": 0.32698044327640924, |
|
"learning_rate": 5.9742174268827936e-05, |
|
"loss": 0.6018, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.0079260237780714, |
|
"grad_norm": 0.3725595521245188, |
|
"learning_rate": 5.948325957748498e-05, |
|
"loss": 0.5527, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.0184940554821664, |
|
"grad_norm": 0.40702510391048585, |
|
"learning_rate": 5.9223270207467355e-05, |
|
"loss": 0.5457, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 2.0290620871862615, |
|
"grad_norm": 0.5137742231756651, |
|
"learning_rate": 5.896222049954951e-05, |
|
"loss": 0.5365, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.0396301188903565, |
|
"grad_norm": 0.5833504408833431, |
|
"learning_rate": 5.870012485299318e-05, |
|
"loss": 0.5339, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 2.050198150594452, |
|
"grad_norm": 0.5020674212672895, |
|
"learning_rate": 5.843699772475312e-05, |
|
"loss": 0.5344, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.060766182298547, |
|
"grad_norm": 0.5109669618772644, |
|
"learning_rate": 5.8172853628679676e-05, |
|
"loss": 0.5373, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.071334214002642, |
|
"grad_norm": 0.6079777296591362, |
|
"learning_rate": 5.790770713471816e-05, |
|
"loss": 0.5338, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.081902245706737, |
|
"grad_norm": 0.7717054202322884, |
|
"learning_rate": 5.764157286810527e-05, |
|
"loss": 0.5413, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 2.092470277410832, |
|
"grad_norm": 1.0263835129177117, |
|
"learning_rate": 5.7374465508562324e-05, |
|
"loss": 0.5419, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.103038309114927, |
|
"grad_norm": 1.1304188828796682, |
|
"learning_rate": 5.710639978948555e-05, |
|
"loss": 0.5401, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 2.1136063408190227, |
|
"grad_norm": 0.5561546704438505, |
|
"learning_rate": 5.6837390497133406e-05, |
|
"loss": 0.5371, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.1241743725231177, |
|
"grad_norm": 0.5094878513846206, |
|
"learning_rate": 5.6567452469810984e-05, |
|
"loss": 0.5307, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.1347424042272127, |
|
"grad_norm": 0.861314347361121, |
|
"learning_rate": 5.629660059705153e-05, |
|
"loss": 0.5405, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.1453104359313078, |
|
"grad_norm": 0.7744305400405808, |
|
"learning_rate": 5.602484981879519e-05, |
|
"loss": 0.5399, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 2.155878467635403, |
|
"grad_norm": 0.5568154422551339, |
|
"learning_rate": 5.5752215124564895e-05, |
|
"loss": 0.534, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.166446499339498, |
|
"grad_norm": 0.4932283903917851, |
|
"learning_rate": 5.547871155263955e-05, |
|
"loss": 0.5427, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 2.1770145310435933, |
|
"grad_norm": 0.6026007057555373, |
|
"learning_rate": 5.5204354189224596e-05, |
|
"loss": 0.5372, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.1875825627476884, |
|
"grad_norm": 0.5907730425638954, |
|
"learning_rate": 5.492915816761979e-05, |
|
"loss": 0.5339, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.1981505944517834, |
|
"grad_norm": 0.38702716198497655, |
|
"learning_rate": 5.465313866738454e-05, |
|
"loss": 0.5399, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.2087186261558784, |
|
"grad_norm": 0.4818332449688006, |
|
"learning_rate": 5.4376310913500514e-05, |
|
"loss": 0.5341, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 2.2192866578599735, |
|
"grad_norm": 0.48527023542347636, |
|
"learning_rate": 5.409869017553199e-05, |
|
"loss": 0.5443, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.2298546895640685, |
|
"grad_norm": 0.38988747831854015, |
|
"learning_rate": 5.382029176678345e-05, |
|
"loss": 0.5325, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 2.240422721268164, |
|
"grad_norm": 0.3382464323770342, |
|
"learning_rate": 5.354113104345503e-05, |
|
"loss": 0.5381, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.250990752972259, |
|
"grad_norm": 0.344596068112714, |
|
"learning_rate": 5.326122340379539e-05, |
|
"loss": 0.5393, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.261558784676354, |
|
"grad_norm": 0.38043335701597286, |
|
"learning_rate": 5.2980584287252456e-05, |
|
"loss": 0.5354, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.272126816380449, |
|
"grad_norm": 0.31330954952683393, |
|
"learning_rate": 5.269922917362171e-05, |
|
"loss": 0.5347, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.282694848084544, |
|
"grad_norm": 0.3394061867349534, |
|
"learning_rate": 5.241717358219239e-05, |
|
"loss": 0.5359, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.293262879788639, |
|
"grad_norm": 0.353132616874619, |
|
"learning_rate": 5.213443307089144e-05, |
|
"loss": 0.53, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 2.3038309114927347, |
|
"grad_norm": 0.2871048446747761, |
|
"learning_rate": 5.1851023235425366e-05, |
|
"loss": 0.5396, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.3143989431968297, |
|
"grad_norm": 0.3676997391938742, |
|
"learning_rate": 5.156695970841997e-05, |
|
"loss": 0.5319, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.3249669749009247, |
|
"grad_norm": 0.3034692647102483, |
|
"learning_rate": 5.128225815855805e-05, |
|
"loss": 0.539, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.33553500660502, |
|
"grad_norm": 0.29670953140630685, |
|
"learning_rate": 5.099693428971522e-05, |
|
"loss": 0.5357, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.346103038309115, |
|
"grad_norm": 0.3462570910330776, |
|
"learning_rate": 5.0711003840093583e-05, |
|
"loss": 0.5382, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.35667107001321, |
|
"grad_norm": 0.30320212822308745, |
|
"learning_rate": 5.042448258135371e-05, |
|
"loss": 0.5398, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 2.3672391017173053, |
|
"grad_norm": 0.3253569159099642, |
|
"learning_rate": 5.013738631774463e-05, |
|
"loss": 0.5403, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.3778071334214004, |
|
"grad_norm": 0.2992173696284864, |
|
"learning_rate": 4.984973088523216e-05, |
|
"loss": 0.5318, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.3883751651254954, |
|
"grad_norm": 0.23806839402955893, |
|
"learning_rate": 4.9561532150625305e-05, |
|
"loss": 0.5295, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.3989431968295905, |
|
"grad_norm": 0.2836158566149504, |
|
"learning_rate": 4.927280601070114e-05, |
|
"loss": 0.5273, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 2.4095112285336855, |
|
"grad_norm": 0.24042182902261217, |
|
"learning_rate": 4.898356839132793e-05, |
|
"loss": 0.5302, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.4200792602377805, |
|
"grad_norm": 0.23090066826959527, |
|
"learning_rate": 4.869383524658668e-05, |
|
"loss": 0.5378, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 2.430647291941876, |
|
"grad_norm": 0.23036421955638248, |
|
"learning_rate": 4.840362255789112e-05, |
|
"loss": 0.5324, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.441215323645971, |
|
"grad_norm": 0.2086043982624101, |
|
"learning_rate": 4.811294633310617e-05, |
|
"loss": 0.5355, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.451783355350066, |
|
"grad_norm": 0.2662049795738657, |
|
"learning_rate": 4.782182260566498e-05, |
|
"loss": 0.5321, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.462351387054161, |
|
"grad_norm": 0.17282690102361992, |
|
"learning_rate": 4.7530267433684546e-05, |
|
"loss": 0.5322, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 2.472919418758256, |
|
"grad_norm": 0.23839732691466814, |
|
"learning_rate": 4.723829689907993e-05, |
|
"loss": 0.5332, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.483487450462351, |
|
"grad_norm": 0.220755685419314, |
|
"learning_rate": 4.694592710667723e-05, |
|
"loss": 0.5289, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.4940554821664467, |
|
"grad_norm": 0.17887769697236458, |
|
"learning_rate": 4.665317418332521e-05, |
|
"loss": 0.5302, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.5046235138705417, |
|
"grad_norm": 0.21214814477629076, |
|
"learning_rate": 4.6360054277005826e-05, |
|
"loss": 0.5311, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.5151915455746368, |
|
"grad_norm": 0.169870987072969, |
|
"learning_rate": 4.606658355594344e-05, |
|
"loss": 0.5308, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.525759577278732, |
|
"grad_norm": 0.23328757071293307, |
|
"learning_rate": 4.577277820771307e-05, |
|
"loss": 0.531, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 2.536327608982827, |
|
"grad_norm": 0.2082393856633939, |
|
"learning_rate": 4.5478654438347414e-05, |
|
"loss": 0.5369, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.5468956406869223, |
|
"grad_norm": 0.14478055321484706, |
|
"learning_rate": 4.518422847144304e-05, |
|
"loss": 0.5323, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 2.557463672391017, |
|
"grad_norm": 0.22316005053591764, |
|
"learning_rate": 4.488951654726539e-05, |
|
"loss": 0.5286, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.5680317040951124, |
|
"grad_norm": 0.17285901709749715, |
|
"learning_rate": 4.4594534921853096e-05, |
|
"loss": 0.5362, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.5785997357992074, |
|
"grad_norm": 0.2122181023189799, |
|
"learning_rate": 4.429929986612125e-05, |
|
"loss": 0.5351, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.5891677675033025, |
|
"grad_norm": 0.19160763459008834, |
|
"learning_rate": 4.400382766496394e-05, |
|
"loss": 0.5322, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.5997357992073975, |
|
"grad_norm": 0.17931339221155976, |
|
"learning_rate": 4.3708134616355934e-05, |
|
"loss": 0.5291, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.6103038309114925, |
|
"grad_norm": 0.18662561546217404, |
|
"learning_rate": 4.341223703045379e-05, |
|
"loss": 0.5348, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 2.620871862615588, |
|
"grad_norm": 0.15830313964721893, |
|
"learning_rate": 4.311615122869613e-05, |
|
"loss": 0.5434, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.631439894319683, |
|
"grad_norm": 0.19402467010937804, |
|
"learning_rate": 4.281989354290341e-05, |
|
"loss": 0.5433, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.642007926023778, |
|
"grad_norm": 0.20042343107408042, |
|
"learning_rate": 4.2523480314376996e-05, |
|
"loss": 0.5327, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.652575957727873, |
|
"grad_norm": 0.165259104609407, |
|
"learning_rate": 4.222692789299794e-05, |
|
"loss": 0.5389, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 2.663143989431968, |
|
"grad_norm": 0.21602254674705257, |
|
"learning_rate": 4.193025263632495e-05, |
|
"loss": 0.5254, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.6737120211360637, |
|
"grad_norm": 0.1679859643732669, |
|
"learning_rate": 4.163347090869227e-05, |
|
"loss": 0.5375, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 2.6842800528401582, |
|
"grad_norm": 0.17225237105164173, |
|
"learning_rate": 4.133659908030699e-05, |
|
"loss": 0.5342, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.6948480845442537, |
|
"grad_norm": 0.1620251583234328, |
|
"learning_rate": 4.103965352634604e-05, |
|
"loss": 0.5328, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.7054161162483488, |
|
"grad_norm": 0.16680355622763784, |
|
"learning_rate": 4.0742650626053004e-05, |
|
"loss": 0.5293, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.715984147952444, |
|
"grad_norm": 0.15521980569419397, |
|
"learning_rate": 4.044560676183462e-05, |
|
"loss": 0.5371, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 2.726552179656539, |
|
"grad_norm": 0.1574325532000935, |
|
"learning_rate": 4.014853831835721e-05, |
|
"loss": 0.5331, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.737120211360634, |
|
"grad_norm": 0.16871282749949595, |
|
"learning_rate": 3.985146168164281e-05, |
|
"loss": 0.5388, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.7476882430647294, |
|
"grad_norm": 0.1553640449532965, |
|
"learning_rate": 3.9554393238165386e-05, |
|
"loss": 0.5284, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.7582562747688244, |
|
"grad_norm": 0.15753872116807552, |
|
"learning_rate": 3.9257349373947016e-05, |
|
"loss": 0.5342, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.7688243064729194, |
|
"grad_norm": 0.1510488137537568, |
|
"learning_rate": 3.896034647365398e-05, |
|
"loss": 0.5329, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.7793923381770145, |
|
"grad_norm": 0.17598454072482386, |
|
"learning_rate": 3.866340091969303e-05, |
|
"loss": 0.5344, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.7899603698811095, |
|
"grad_norm": 0.16623830505420029, |
|
"learning_rate": 3.836652909130774e-05, |
|
"loss": 0.5273, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.800528401585205, |
|
"grad_norm": 0.1959528022905974, |
|
"learning_rate": 3.806974736367507e-05, |
|
"loss": 0.5324, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.8110964332893, |
|
"grad_norm": 0.14859823867573574, |
|
"learning_rate": 3.7773072107002084e-05, |
|
"loss": 0.5334, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.821664464993395, |
|
"grad_norm": 0.19136907727546654, |
|
"learning_rate": 3.747651968562302e-05, |
|
"loss": 0.5283, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.83223249669749, |
|
"grad_norm": 0.1741838181803494, |
|
"learning_rate": 3.718010645709661e-05, |
|
"loss": 0.5374, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.842800528401585, |
|
"grad_norm": 0.15429083348412015, |
|
"learning_rate": 3.688384877130388e-05, |
|
"loss": 0.5348, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.85336856010568, |
|
"grad_norm": 0.19719371528147198, |
|
"learning_rate": 3.658776296954622e-05, |
|
"loss": 0.5339, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.8639365918097752, |
|
"grad_norm": 0.15789471821578627, |
|
"learning_rate": 3.629186538364408e-05, |
|
"loss": 0.5339, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.8745046235138707, |
|
"grad_norm": 0.16444809265187393, |
|
"learning_rate": 3.5996172335036065e-05, |
|
"loss": 0.54, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.8850726552179657, |
|
"grad_norm": 0.18205663719249998, |
|
"learning_rate": 3.570070013387876e-05, |
|
"loss": 0.5326, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.895640686922061, |
|
"grad_norm": 0.1638276415587385, |
|
"learning_rate": 3.540546507814692e-05, |
|
"loss": 0.5336, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.906208718626156, |
|
"grad_norm": 0.14430321990734493, |
|
"learning_rate": 3.5110483452734633e-05, |
|
"loss": 0.5333, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.916776750330251, |
|
"grad_norm": 0.15779067389210275, |
|
"learning_rate": 3.4815771528556976e-05, |
|
"loss": 0.5295, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.9273447820343463, |
|
"grad_norm": 0.16510325621420335, |
|
"learning_rate": 3.452134556165259e-05, |
|
"loss": 0.5322, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.9379128137384414, |
|
"grad_norm": 0.15335282480316584, |
|
"learning_rate": 3.4227221792286945e-05, |
|
"loss": 0.5266, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.9484808454425364, |
|
"grad_norm": 0.14207500884610488, |
|
"learning_rate": 3.393341644405657e-05, |
|
"loss": 0.5284, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.9590488771466315, |
|
"grad_norm": 0.14817401425997304, |
|
"learning_rate": 3.363994572299418e-05, |
|
"loss": 0.5305, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.9696169088507265, |
|
"grad_norm": 0.13417873579347814, |
|
"learning_rate": 3.3346825816674804e-05, |
|
"loss": 0.5285, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.9801849405548215, |
|
"grad_norm": 0.1933271122465297, |
|
"learning_rate": 3.305407289332279e-05, |
|
"loss": 0.5336, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.9907529722589166, |
|
"grad_norm": 0.14082276570620816, |
|
"learning_rate": 3.276170310092008e-05, |
|
"loss": 0.5314, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 3.001321003963012, |
|
"grad_norm": 0.16572738564748216, |
|
"learning_rate": 3.246973256631546e-05, |
|
"loss": 0.5233, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 3.011889035667107, |
|
"grad_norm": 0.27880171253028246, |
|
"learning_rate": 3.217817739433502e-05, |
|
"loss": 0.4778, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 3.022457067371202, |
|
"grad_norm": 0.2862871579036325, |
|
"learning_rate": 3.1887053666893834e-05, |
|
"loss": 0.4793, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 3.033025099075297, |
|
"grad_norm": 0.25228138426980523, |
|
"learning_rate": 3.159637744210888e-05, |
|
"loss": 0.4754, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 3.043593130779392, |
|
"grad_norm": 0.2079690373301947, |
|
"learning_rate": 3.130616475341332e-05, |
|
"loss": 0.4723, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 3.0541611624834872, |
|
"grad_norm": 0.18869207117374362, |
|
"learning_rate": 3.101643160867208e-05, |
|
"loss": 0.4762, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 3.0647291941875827, |
|
"grad_norm": 0.22869432322360056, |
|
"learning_rate": 3.072719398929887e-05, |
|
"loss": 0.4737, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.0752972258916778, |
|
"grad_norm": 0.15890007803087935, |
|
"learning_rate": 3.0438467849374702e-05, |
|
"loss": 0.473, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 3.085865257595773, |
|
"grad_norm": 0.210513738135415, |
|
"learning_rate": 3.0150269114767862e-05, |
|
"loss": 0.4767, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 3.096433289299868, |
|
"grad_norm": 0.18412299124477924, |
|
"learning_rate": 2.9862613682255383e-05, |
|
"loss": 0.4761, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 3.107001321003963, |
|
"grad_norm": 0.16086763510358643, |
|
"learning_rate": 2.957551741864631e-05, |
|
"loss": 0.4743, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 3.117569352708058, |
|
"grad_norm": 0.1836629217850037, |
|
"learning_rate": 2.928899615990643e-05, |
|
"loss": 0.475, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 3.1281373844121534, |
|
"grad_norm": 0.16351418203553833, |
|
"learning_rate": 2.90030657102848e-05, |
|
"loss": 0.478, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 3.1387054161162484, |
|
"grad_norm": 0.1543546082591788, |
|
"learning_rate": 2.8717741841441964e-05, |
|
"loss": 0.4736, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 3.1492734478203435, |
|
"grad_norm": 0.14811662444817492, |
|
"learning_rate": 2.8433040291580053e-05, |
|
"loss": 0.4814, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 3.1598414795244385, |
|
"grad_norm": 0.15161900774346382, |
|
"learning_rate": 2.8148976764574648e-05, |
|
"loss": 0.4723, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 3.1704095112285335, |
|
"grad_norm": 0.14342849635394805, |
|
"learning_rate": 2.7865566929108573e-05, |
|
"loss": 0.4761, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.180977542932629, |
|
"grad_norm": 0.1660043564516897, |
|
"learning_rate": 2.758282641780762e-05, |
|
"loss": 0.4797, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 3.191545574636724, |
|
"grad_norm": 0.141920732705004, |
|
"learning_rate": 2.7300770826378302e-05, |
|
"loss": 0.4782, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 3.202113606340819, |
|
"grad_norm": 0.14950725725564776, |
|
"learning_rate": 2.7019415712747558e-05, |
|
"loss": 0.4748, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 3.212681638044914, |
|
"grad_norm": 0.14208430168695735, |
|
"learning_rate": 2.6738776596204624e-05, |
|
"loss": 0.4748, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 3.223249669749009, |
|
"grad_norm": 0.14961017547836333, |
|
"learning_rate": 2.6458868956544984e-05, |
|
"loss": 0.4803, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 3.233817701453104, |
|
"grad_norm": 0.15645442679040064, |
|
"learning_rate": 2.6179708233216557e-05, |
|
"loss": 0.4758, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 3.2443857331571992, |
|
"grad_norm": 0.1537770287650884, |
|
"learning_rate": 2.590130982446802e-05, |
|
"loss": 0.4737, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 3.2549537648612947, |
|
"grad_norm": 0.14713329423087662, |
|
"learning_rate": 2.5623689086499496e-05, |
|
"loss": 0.4733, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 3.2655217965653898, |
|
"grad_norm": 0.15291508052862732, |
|
"learning_rate": 2.5346861332615476e-05, |
|
"loss": 0.4782, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 3.276089828269485, |
|
"grad_norm": 0.13920928762276993, |
|
"learning_rate": 2.5070841832380212e-05, |
|
"loss": 0.4709, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.28665785997358, |
|
"grad_norm": 0.1465474915994889, |
|
"learning_rate": 2.4795645810775414e-05, |
|
"loss": 0.4826, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 3.297225891677675, |
|
"grad_norm": 0.1316301584217805, |
|
"learning_rate": 2.4521288447360457e-05, |
|
"loss": 0.4731, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 3.3077939233817704, |
|
"grad_norm": 0.1461801975123496, |
|
"learning_rate": 2.424778487543512e-05, |
|
"loss": 0.4734, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 3.3183619550858654, |
|
"grad_norm": 0.12719699667003487, |
|
"learning_rate": 2.3975150181204817e-05, |
|
"loss": 0.4737, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 3.3289299867899604, |
|
"grad_norm": 0.13764467132837127, |
|
"learning_rate": 2.370339940294848e-05, |
|
"loss": 0.4703, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 3.3394980184940555, |
|
"grad_norm": 0.12510614551548813, |
|
"learning_rate": 2.3432547530189033e-05, |
|
"loss": 0.4786, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 3.3500660501981505, |
|
"grad_norm": 0.13577816786562447, |
|
"learning_rate": 2.316260950286661e-05, |
|
"loss": 0.4775, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 3.3606340819022456, |
|
"grad_norm": 0.13089413394472932, |
|
"learning_rate": 2.2893600210514464e-05, |
|
"loss": 0.4756, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 3.3712021136063406, |
|
"grad_norm": 0.1333288679621952, |
|
"learning_rate": 2.2625534491437672e-05, |
|
"loss": 0.4811, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 3.381770145310436, |
|
"grad_norm": 0.13920771603982868, |
|
"learning_rate": 2.2358427131894732e-05, |
|
"loss": 0.4815, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.392338177014531, |
|
"grad_norm": 0.12578897335151773, |
|
"learning_rate": 2.2092292865281845e-05, |
|
"loss": 0.477, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 3.402906208718626, |
|
"grad_norm": 0.1406551312359615, |
|
"learning_rate": 2.1827146371320334e-05, |
|
"loss": 0.4761, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 3.413474240422721, |
|
"grad_norm": 0.11847800398887365, |
|
"learning_rate": 2.156300227524688e-05, |
|
"loss": 0.4794, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 3.4240422721268162, |
|
"grad_norm": 0.1289493089958156, |
|
"learning_rate": 2.1299875147006838e-05, |
|
"loss": 0.4803, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 3.4346103038309117, |
|
"grad_norm": 0.12509059453118254, |
|
"learning_rate": 2.10377795004505e-05, |
|
"loss": 0.4774, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.4451783355350067, |
|
"grad_norm": 0.13580138707758002, |
|
"learning_rate": 2.0776729792532652e-05, |
|
"loss": 0.4733, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 3.455746367239102, |
|
"grad_norm": 0.12405647314750252, |
|
"learning_rate": 2.0516740422515022e-05, |
|
"loss": 0.4762, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 3.466314398943197, |
|
"grad_norm": 0.14580809381579368, |
|
"learning_rate": 2.0257825731172077e-05, |
|
"loss": 0.4764, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 3.476882430647292, |
|
"grad_norm": 0.1281590923924517, |
|
"learning_rate": 2.0000000000000012e-05, |
|
"loss": 0.4706, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 3.487450462351387, |
|
"grad_norm": 0.1298579667890172, |
|
"learning_rate": 1.9743277450428962e-05, |
|
"loss": 0.4725, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.498018494055482, |
|
"grad_norm": 0.13901143297798416, |
|
"learning_rate": 1.9487672243038594e-05, |
|
"loss": 0.4811, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 3.5085865257595774, |
|
"grad_norm": 0.12006662533988818, |
|
"learning_rate": 1.9233198476777003e-05, |
|
"loss": 0.4755, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 3.5191545574636725, |
|
"grad_norm": 0.13509800282835904, |
|
"learning_rate": 1.897987018818302e-05, |
|
"loss": 0.4736, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.5297225891677675, |
|
"grad_norm": 0.12372407695325513, |
|
"learning_rate": 1.8727701350612026e-05, |
|
"loss": 0.4771, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 3.5402906208718625, |
|
"grad_norm": 0.13277097757447007, |
|
"learning_rate": 1.8476705873465097e-05, |
|
"loss": 0.477, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.5508586525759576, |
|
"grad_norm": 0.1190584597907601, |
|
"learning_rate": 1.8226897601421858e-05, |
|
"loss": 0.472, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 3.561426684280053, |
|
"grad_norm": 0.13360122007454706, |
|
"learning_rate": 1.7978290313676774e-05, |
|
"loss": 0.4812, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 3.571994715984148, |
|
"grad_norm": 0.12353280522144573, |
|
"learning_rate": 1.7730897723179144e-05, |
|
"loss": 0.4735, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 3.582562747688243, |
|
"grad_norm": 0.12197612268234642, |
|
"learning_rate": 1.748473347587666e-05, |
|
"loss": 0.4758, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 3.593130779392338, |
|
"grad_norm": 0.13321021249065187, |
|
"learning_rate": 1.7239811149962756e-05, |
|
"loss": 0.4777, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.603698811096433, |
|
"grad_norm": 0.11696601002999722, |
|
"learning_rate": 1.6996144255127586e-05, |
|
"loss": 0.473, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 3.6142668428005287, |
|
"grad_norm": 0.12376625663933627, |
|
"learning_rate": 1.675374623181294e-05, |
|
"loss": 0.475, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 3.6248348745046233, |
|
"grad_norm": 0.11118193452823608, |
|
"learning_rate": 1.6512630450470784e-05, |
|
"loss": 0.4679, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 3.6354029062087188, |
|
"grad_norm": 0.1194452215053386, |
|
"learning_rate": 1.6272810210825794e-05, |
|
"loss": 0.4748, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 3.645970937912814, |
|
"grad_norm": 0.12011227175979526, |
|
"learning_rate": 1.6034298741141768e-05, |
|
"loss": 0.4766, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.656538969616909, |
|
"grad_norm": 0.10988264863047191, |
|
"learning_rate": 1.579710919749196e-05, |
|
"loss": 0.4735, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 3.667107001321004, |
|
"grad_norm": 0.11947936011899343, |
|
"learning_rate": 1.5561254663033393e-05, |
|
"loss": 0.4801, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 3.677675033025099, |
|
"grad_norm": 0.11387001376651971, |
|
"learning_rate": 1.53267481472852e-05, |
|
"loss": 0.4761, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 3.6882430647291944, |
|
"grad_norm": 0.10828606028620547, |
|
"learning_rate": 1.5093602585411078e-05, |
|
"loss": 0.4745, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 3.6988110964332894, |
|
"grad_norm": 0.1158137481167655, |
|
"learning_rate": 1.4861830837505733e-05, |
|
"loss": 0.4739, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.7093791281373845, |
|
"grad_norm": 0.11883853717597995, |
|
"learning_rate": 1.4631445687885553e-05, |
|
"loss": 0.479, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 3.7199471598414795, |
|
"grad_norm": 0.10711524384366743, |
|
"learning_rate": 1.4402459844383451e-05, |
|
"loss": 0.4751, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 3.7305151915455745, |
|
"grad_norm": 0.1179009653082842, |
|
"learning_rate": 1.4174885937647905e-05, |
|
"loss": 0.4782, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 3.74108322324967, |
|
"grad_norm": 0.11366521654185796, |
|
"learning_rate": 1.3948736520446246e-05, |
|
"loss": 0.4727, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 3.7516512549537646, |
|
"grad_norm": 0.10920060902225881, |
|
"learning_rate": 1.372402406697229e-05, |
|
"loss": 0.4725, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.76221928665786, |
|
"grad_norm": 0.12313189801852109, |
|
"learning_rate": 1.3500760972158223e-05, |
|
"loss": 0.4809, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 3.772787318361955, |
|
"grad_norm": 0.11049761112743338, |
|
"learning_rate": 1.3278959550991011e-05, |
|
"loss": 0.4782, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 3.78335535006605, |
|
"grad_norm": 0.10430827010366775, |
|
"learning_rate": 1.3058632037832957e-05, |
|
"loss": 0.4743, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 3.793923381770145, |
|
"grad_norm": 0.11402898251067686, |
|
"learning_rate": 1.2839790585747008e-05, |
|
"loss": 0.4767, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 3.8044914134742402, |
|
"grad_norm": 0.11162325433379153, |
|
"learning_rate": 1.2622447265826345e-05, |
|
"loss": 0.4812, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.8150594451783357, |
|
"grad_norm": 0.10511190972273543, |
|
"learning_rate": 1.2406614066528543e-05, |
|
"loss": 0.4768, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 3.8256274768824308, |
|
"grad_norm": 0.10681643814143399, |
|
"learning_rate": 1.219230289301431e-05, |
|
"loss": 0.4717, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 3.836195508586526, |
|
"grad_norm": 0.11345833004338865, |
|
"learning_rate": 1.1979525566490845e-05, |
|
"loss": 0.4694, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 3.846763540290621, |
|
"grad_norm": 0.10629326422923135, |
|
"learning_rate": 1.176829382355973e-05, |
|
"loss": 0.475, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 3.857331571994716, |
|
"grad_norm": 0.10487140068250281, |
|
"learning_rate": 1.1558619315569572e-05, |
|
"loss": 0.4727, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.8678996036988114, |
|
"grad_norm": 0.10353148274468497, |
|
"learning_rate": 1.1350513607973351e-05, |
|
"loss": 0.4749, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 3.878467635402906, |
|
"grad_norm": 0.11029487277113502, |
|
"learning_rate": 1.1143988179690441e-05, |
|
"loss": 0.4756, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 3.8890356671070014, |
|
"grad_norm": 0.10444430423294958, |
|
"learning_rate": 1.093905442247348e-05, |
|
"loss": 0.4715, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 3.8996036988110965, |
|
"grad_norm": 0.10279747300542211, |
|
"learning_rate": 1.073572364027999e-05, |
|
"loss": 0.4701, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 3.9101717305151915, |
|
"grad_norm": 0.10814918538118634, |
|
"learning_rate": 1.0534007048648846e-05, |
|
"loss": 0.4773, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.9207397622192866, |
|
"grad_norm": 0.10568724216385697, |
|
"learning_rate": 1.0333915774081698e-05, |
|
"loss": 0.4742, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 3.9313077939233816, |
|
"grad_norm": 0.10446279448267286, |
|
"learning_rate": 1.0135460853429166e-05, |
|
"loss": 0.4703, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 3.941875825627477, |
|
"grad_norm": 0.10539141636554224, |
|
"learning_rate": 9.938653233282105e-06, |
|
"loss": 0.4788, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 3.952443857331572, |
|
"grad_norm": 0.10404420155804553, |
|
"learning_rate": 9.74350376936779e-06, |
|
"loss": 0.4766, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.963011889035667, |
|
"grad_norm": 0.11291514851116513, |
|
"learning_rate": 9.550023225951124e-06, |
|
"loss": 0.4786, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.973579920739762, |
|
"grad_norm": 0.11054229268613286, |
|
"learning_rate": 9.358222275240884e-06, |
|
"loss": 0.4715, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 3.984147952443857, |
|
"grad_norm": 0.10047846272456959, |
|
"learning_rate": 9.168111496801071e-06, |
|
"loss": 0.4767, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 3.9947159841479527, |
|
"grad_norm": 0.10427667029654042, |
|
"learning_rate": 8.979701376967313e-06, |
|
"loss": 0.472, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 4.005284015852047, |
|
"grad_norm": 0.15160742471081606, |
|
"learning_rate": 8.793002308268521e-06, |
|
"loss": 0.4599, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 4.015852047556143, |
|
"grad_norm": 0.18761042591080226, |
|
"learning_rate": 8.608024588853561e-06, |
|
"loss": 0.4482, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.026420079260237, |
|
"grad_norm": 0.12647969828019093, |
|
"learning_rate": 8.424778421923258e-06, |
|
"loss": 0.4377, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 4.036988110964333, |
|
"grad_norm": 0.1707261073721691, |
|
"learning_rate": 8.24327391516761e-06, |
|
"loss": 0.4438, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 4.047556142668428, |
|
"grad_norm": 0.20005656576433853, |
|
"learning_rate": 8.06352108020823e-06, |
|
"loss": 0.4427, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 4.058124174372523, |
|
"grad_norm": 0.14945783232518764, |
|
"learning_rate": 7.885529832046134e-06, |
|
"loss": 0.4404, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 4.068692206076618, |
|
"grad_norm": 0.13331270252131155, |
|
"learning_rate": 7.709309988514824e-06, |
|
"loss": 0.438, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 4.079260237780713, |
|
"grad_norm": 0.1373177545653382, |
|
"learning_rate": 7.534871269738753e-06, |
|
"loss": 0.442, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 4.0898282694848085, |
|
"grad_norm": 0.14428406658074794, |
|
"learning_rate": 7.362223297597184e-06, |
|
"loss": 0.444, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 4.100396301188904, |
|
"grad_norm": 0.1403209142453575, |
|
"learning_rate": 7.191375595193433e-06, |
|
"loss": 0.4423, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 4.110964332892999, |
|
"grad_norm": 0.12198908241232029, |
|
"learning_rate": 7.022337586329597e-06, |
|
"loss": 0.4436, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 4.121532364597094, |
|
"grad_norm": 0.11000767534504179, |
|
"learning_rate": 6.855118594986718e-06, |
|
"loss": 0.4443, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.132100396301189, |
|
"grad_norm": 0.1243765198168875, |
|
"learning_rate": 6.6897278448105405e-06, |
|
"loss": 0.4389, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 4.142668428005284, |
|
"grad_norm": 0.1326492943126363, |
|
"learning_rate": 6.526174458602681e-06, |
|
"loss": 0.4461, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 4.153236459709379, |
|
"grad_norm": 0.11686776812880073, |
|
"learning_rate": 6.364467457817482e-06, |
|
"loss": 0.4384, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 4.163804491413474, |
|
"grad_norm": 0.10054607780091933, |
|
"learning_rate": 6.20461576206433e-06, |
|
"loss": 0.4389, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 4.17437252311757, |
|
"grad_norm": 0.11593044593000827, |
|
"learning_rate": 6.046628188615718e-06, |
|
"loss": 0.4398, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 4.184940554821664, |
|
"grad_norm": 0.11430820998845044, |
|
"learning_rate": 5.890513451920843e-06, |
|
"loss": 0.4367, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 4.19550858652576, |
|
"grad_norm": 0.11287261939767183, |
|
"learning_rate": 5.736280163124974e-06, |
|
"loss": 0.4444, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 4.206076618229854, |
|
"grad_norm": 0.10321211605129811, |
|
"learning_rate": 5.583936829594434e-06, |
|
"loss": 0.4436, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 4.21664464993395, |
|
"grad_norm": 0.10361621115480053, |
|
"learning_rate": 5.4334918544473436e-06, |
|
"loss": 0.4405, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 4.227212681638045, |
|
"grad_norm": 0.10863502177296244, |
|
"learning_rate": 5.284953536090131e-06, |
|
"loss": 0.443, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.23778071334214, |
|
"grad_norm": 0.10536497157476482, |
|
"learning_rate": 5.1383300677598024e-06, |
|
"loss": 0.4497, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 4.248348745046235, |
|
"grad_norm": 0.10402891881587309, |
|
"learning_rate": 4.993629537071978e-06, |
|
"loss": 0.4396, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 4.25891677675033, |
|
"grad_norm": 0.10369536487473348, |
|
"learning_rate": 4.850859925574809e-06, |
|
"loss": 0.4393, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 4.2694848084544255, |
|
"grad_norm": 0.10111203263697728, |
|
"learning_rate": 4.710029108308733e-06, |
|
"loss": 0.441, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 4.28005284015852, |
|
"grad_norm": 0.09912151235410817, |
|
"learning_rate": 4.571144853372063e-06, |
|
"loss": 0.4403, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 4.2906208718626155, |
|
"grad_norm": 0.09757157331199569, |
|
"learning_rate": 4.434214821492542e-06, |
|
"loss": 0.4473, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 4.301188903566711, |
|
"grad_norm": 0.1002102756021715, |
|
"learning_rate": 4.299246565604756e-06, |
|
"loss": 0.4383, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 4.311756935270806, |
|
"grad_norm": 0.10119754294494109, |
|
"learning_rate": 4.166247530433531e-06, |
|
"loss": 0.4363, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 4.322324966974901, |
|
"grad_norm": 0.10100539004557267, |
|
"learning_rate": 4.035225052083309e-06, |
|
"loss": 0.4406, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 4.332892998678996, |
|
"grad_norm": 0.09823672032855243, |
|
"learning_rate": 3.906186357633455e-06, |
|
"loss": 0.4432, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.343461030383091, |
|
"grad_norm": 0.09520265410670203, |
|
"learning_rate": 3.779138564739646e-06, |
|
"loss": 0.436, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 4.354029062087187, |
|
"grad_norm": 0.10024695357191145, |
|
"learning_rate": 3.6540886812412547e-06, |
|
"loss": 0.4405, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 4.364597093791281, |
|
"grad_norm": 0.10079785013841713, |
|
"learning_rate": 3.5310436047748263e-06, |
|
"loss": 0.4403, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 4.375165125495377, |
|
"grad_norm": 0.09516762618496769, |
|
"learning_rate": 3.4100101223935743e-06, |
|
"loss": 0.4408, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 4.385733157199471, |
|
"grad_norm": 0.09404930151960895, |
|
"learning_rate": 3.290994910193037e-06, |
|
"loss": 0.4442, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 4.396301188903567, |
|
"grad_norm": 0.09531190586303208, |
|
"learning_rate": 3.174004532942845e-06, |
|
"loss": 0.4451, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 4.406869220607662, |
|
"grad_norm": 0.0951071973632443, |
|
"learning_rate": 3.059045443724582e-06, |
|
"loss": 0.4423, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 4.417437252311757, |
|
"grad_norm": 0.09705103462701054, |
|
"learning_rate": 2.946123983575846e-06, |
|
"loss": 0.4372, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 4.428005284015852, |
|
"grad_norm": 0.09365563642961511, |
|
"learning_rate": 2.8352463811404952e-06, |
|
"loss": 0.4397, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 4.438573315719947, |
|
"grad_norm": 0.09694934512919057, |
|
"learning_rate": 2.726418752325084e-06, |
|
"loss": 0.4465, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 4.449141347424042, |
|
"grad_norm": 0.09546394710630239, |
|
"learning_rate": 2.619647099961502e-06, |
|
"loss": 0.4457, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 4.459709379128137, |
|
"grad_norm": 0.09255730663301787, |
|
"learning_rate": 2.514937313475865e-06, |
|
"loss": 0.4405, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 4.4702774108322325, |
|
"grad_norm": 0.09338235711048254, |
|
"learning_rate": 2.4122951685636674e-06, |
|
"loss": 0.4395, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 4.480845442536328, |
|
"grad_norm": 0.09435019171855656, |
|
"learning_rate": 2.3117263268712e-06, |
|
"loss": 0.44, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 4.491413474240423, |
|
"grad_norm": 0.09413409031470825, |
|
"learning_rate": 2.2132363356832532e-06, |
|
"loss": 0.4432, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 4.501981505944518, |
|
"grad_norm": 0.09468855592447012, |
|
"learning_rate": 2.11683062761713e-06, |
|
"loss": 0.4422, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 4.512549537648613, |
|
"grad_norm": 0.09055553100177934, |
|
"learning_rate": 2.0225145203230044e-06, |
|
"loss": 0.4401, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 4.523117569352708, |
|
"grad_norm": 0.09082079754983816, |
|
"learning_rate": 1.930293216190586e-06, |
|
"loss": 0.4367, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 4.533685601056803, |
|
"grad_norm": 0.09114595588995444, |
|
"learning_rate": 1.8401718020621694e-06, |
|
"loss": 0.4383, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 4.544253632760898, |
|
"grad_norm": 0.09147630992219807, |
|
"learning_rate": 1.7521552489520566e-06, |
|
"loss": 0.4455, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 4.554821664464994, |
|
"grad_norm": 0.09327852060770747, |
|
"learning_rate": 1.666248411772342e-06, |
|
"loss": 0.441, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 4.565389696169088, |
|
"grad_norm": 0.09307772965424148, |
|
"learning_rate": 1.5824560290651404e-06, |
|
"loss": 0.444, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 4.575957727873184, |
|
"grad_norm": 0.09415356635848661, |
|
"learning_rate": 1.5007827227411942e-06, |
|
"loss": 0.4417, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 4.586525759577278, |
|
"grad_norm": 0.09176517213906957, |
|
"learning_rate": 1.4212329978249417e-06, |
|
"loss": 0.4425, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 4.597093791281374, |
|
"grad_norm": 0.09036761344520468, |
|
"learning_rate": 1.3438112422060256e-06, |
|
"loss": 0.4416, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 4.607661822985469, |
|
"grad_norm": 0.0920008670611069, |
|
"learning_rate": 1.2685217263972693e-06, |
|
"loss": 0.4433, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 4.618229854689564, |
|
"grad_norm": 0.08954221258416922, |
|
"learning_rate": 1.1953686032990964e-06, |
|
"loss": 0.4431, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 4.628797886393659, |
|
"grad_norm": 0.0897729908602636, |
|
"learning_rate": 1.124355907970487e-06, |
|
"loss": 0.4451, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 4.639365918097754, |
|
"grad_norm": 0.09192361815684204, |
|
"learning_rate": 1.0554875574063784e-06, |
|
"loss": 0.4423, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 4.6499339498018495, |
|
"grad_norm": 0.09033170522999512, |
|
"learning_rate": 9.887673503216422e-07, |
|
"loss": 0.4394, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.660501981505945, |
|
"grad_norm": 0.09133489415949655, |
|
"learning_rate": 9.241989669415097e-07, |
|
"loss": 0.4447, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 4.67107001321004, |
|
"grad_norm": 0.09425014878930163, |
|
"learning_rate": 8.61785968798623e-07, |
|
"loss": 0.4397, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 4.681638044914135, |
|
"grad_norm": 0.09161607478315605, |
|
"learning_rate": 8.015317985365301e-07, |
|
"loss": 0.4435, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 4.69220607661823, |
|
"grad_norm": 0.0915131016919999, |
|
"learning_rate": 7.434397797198367e-07, |
|
"loss": 0.4392, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 4.702774108322325, |
|
"grad_norm": 0.09011982457022855, |
|
"learning_rate": 6.875131166508553e-07, |
|
"loss": 0.4429, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.71334214002642, |
|
"grad_norm": 0.09105298330396881, |
|
"learning_rate": 6.337548941928839e-07, |
|
"loss": 0.4452, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 4.723910171730515, |
|
"grad_norm": 0.08866673271037578, |
|
"learning_rate": 5.821680776000049e-07, |
|
"loss": 0.4422, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 4.734478203434611, |
|
"grad_norm": 0.08801181304315894, |
|
"learning_rate": 5.32755512353571e-07, |
|
"loss": 0.436, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 4.745046235138705, |
|
"grad_norm": 0.0892771815810736, |
|
"learning_rate": 4.8551992400522e-07, |
|
"loss": 0.4452, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 4.755614266842801, |
|
"grad_norm": 0.08829094089099762, |
|
"learning_rate": 4.4046391802655463e-07, |
|
"loss": 0.4422, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.766182298546895, |
|
"grad_norm": 0.09129909001389838, |
|
"learning_rate": 3.975899796654137e-07, |
|
"loss": 0.443, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 4.776750330250991, |
|
"grad_norm": 0.0898229105447953, |
|
"learning_rate": 3.569004738087989e-07, |
|
"loss": 0.4408, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 4.787318361955085, |
|
"grad_norm": 0.08880239621448514, |
|
"learning_rate": 3.183976448524106e-07, |
|
"loss": 0.4375, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 4.797886393659181, |
|
"grad_norm": 0.08857557909002102, |
|
"learning_rate": 2.8208361657688474e-07, |
|
"loss": 0.4422, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 4.808454425363276, |
|
"grad_norm": 0.08766599065540119, |
|
"learning_rate": 2.479603920306106e-07, |
|
"loss": 0.4358, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.819022457067371, |
|
"grad_norm": 0.09087441316527663, |
|
"learning_rate": 2.1602985341925953e-07, |
|
"loss": 0.4366, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 4.8295904887714665, |
|
"grad_norm": 0.0907938224300551, |
|
"learning_rate": 1.8629376200197004e-07, |
|
"loss": 0.442, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 4.840158520475561, |
|
"grad_norm": 0.08888968582423885, |
|
"learning_rate": 1.5875375799419e-07, |
|
"loss": 0.4358, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 4.8507265521796565, |
|
"grad_norm": 0.08900710504460911, |
|
"learning_rate": 1.3341136047719805e-07, |
|
"loss": 0.4367, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 4.861294583883752, |
|
"grad_norm": 0.08986360536362852, |
|
"learning_rate": 1.1026796731433475e-07, |
|
"loss": 0.4414, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.871862615587847, |
|
"grad_norm": 0.08885451973141226, |
|
"learning_rate": 8.932485507387345e-08, |
|
"loss": 0.4368, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 4.882430647291942, |
|
"grad_norm": 0.08728339714966311, |
|
"learning_rate": 7.058317895861866e-08, |
|
"loss": 0.445, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 4.892998678996037, |
|
"grad_norm": 0.08758661927375506, |
|
"learning_rate": 5.40439727421882e-08, |
|
"loss": 0.442, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 4.903566710700132, |
|
"grad_norm": 0.0906444113153008, |
|
"learning_rate": 3.970814871197437e-08, |
|
"loss": 0.4387, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 4.914134742404228, |
|
"grad_norm": 0.08754952023250234, |
|
"learning_rate": 2.75764976188464e-08, |
|
"loss": 0.4401, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.924702774108322, |
|
"grad_norm": 0.08803257540350204, |
|
"learning_rate": 1.764968863351424e-08, |
|
"loss": 0.4401, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 4.935270805812418, |
|
"grad_norm": 0.08863746420839871, |
|
"learning_rate": 9.928269309638083e-09, |
|
"loss": 0.4398, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 4.945838837516512, |
|
"grad_norm": 0.08825690980969667, |
|
"learning_rate": 4.412665553594764e-09, |
|
"loss": 0.4478, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 4.956406869220608, |
|
"grad_norm": 0.09184591938217317, |
|
"learning_rate": 1.103181601020964e-09, |
|
"loss": 0.4439, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 4.966974900924702, |
|
"grad_norm": 0.08777993584647581, |
|
"learning_rate": 0.0, |
|
"loss": 0.4386, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.966974900924702, |
|
"step": 470, |
|
"total_flos": 1.2094498954511647e+19, |
|
"train_loss": 0.5709579251548077, |
|
"train_runtime": 47190.5732, |
|
"train_samples_per_second": 5.131, |
|
"train_steps_per_second": 0.01 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 470, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.2094498954511647e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|