no_pipeline_science_300k / trainer_state.json
neginr's picture
End of training
cb6889c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.966974900924702,
"eval_steps": 500,
"global_step": 470,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010568031704095112,
"grad_norm": 7.083772151723263,
"learning_rate": 1.7021276595744682e-06,
"loss": 1.1953,
"step": 1
},
{
"epoch": 0.021136063408190225,
"grad_norm": 7.131321562784017,
"learning_rate": 3.4042553191489363e-06,
"loss": 1.1964,
"step": 2
},
{
"epoch": 0.031704095112285335,
"grad_norm": 7.0823963229107525,
"learning_rate": 5.106382978723404e-06,
"loss": 1.2092,
"step": 3
},
{
"epoch": 0.04227212681638045,
"grad_norm": 5.141917283385601,
"learning_rate": 6.808510638297873e-06,
"loss": 1.1295,
"step": 4
},
{
"epoch": 0.052840158520475564,
"grad_norm": 2.56300415286007,
"learning_rate": 8.510638297872341e-06,
"loss": 1.077,
"step": 5
},
{
"epoch": 0.06340819022457067,
"grad_norm": 4.615211256351488,
"learning_rate": 1.0212765957446808e-05,
"loss": 1.0452,
"step": 6
},
{
"epoch": 0.07397622192866579,
"grad_norm": 4.783764272721008,
"learning_rate": 1.1914893617021277e-05,
"loss": 1.0356,
"step": 7
},
{
"epoch": 0.0845442536327609,
"grad_norm": 3.8154182115223816,
"learning_rate": 1.3617021276595745e-05,
"loss": 0.9744,
"step": 8
},
{
"epoch": 0.095112285336856,
"grad_norm": 3.8637000758137723,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.9714,
"step": 9
},
{
"epoch": 0.10568031704095113,
"grad_norm": 2.366476473063925,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.9574,
"step": 10
},
{
"epoch": 0.11624834874504623,
"grad_norm": 2.994957368837667,
"learning_rate": 1.872340425531915e-05,
"loss": 0.9216,
"step": 11
},
{
"epoch": 0.12681638044914134,
"grad_norm": 2.6457297468520014,
"learning_rate": 2.0425531914893616e-05,
"loss": 0.9039,
"step": 12
},
{
"epoch": 0.13738441215323646,
"grad_norm": 1.6507084509279148,
"learning_rate": 2.2127659574468088e-05,
"loss": 0.8965,
"step": 13
},
{
"epoch": 0.14795244385733158,
"grad_norm": 1.726225549814637,
"learning_rate": 2.3829787234042553e-05,
"loss": 0.8849,
"step": 14
},
{
"epoch": 0.15852047556142668,
"grad_norm": 1.6588209807553262,
"learning_rate": 2.5531914893617025e-05,
"loss": 0.871,
"step": 15
},
{
"epoch": 0.1690885072655218,
"grad_norm": 1.2143860211983681,
"learning_rate": 2.723404255319149e-05,
"loss": 0.8617,
"step": 16
},
{
"epoch": 0.17965653896961692,
"grad_norm": 1.2868680178333174,
"learning_rate": 2.8936170212765963e-05,
"loss": 0.8567,
"step": 17
},
{
"epoch": 0.190224570673712,
"grad_norm": 1.2037257451837027,
"learning_rate": 3.063829787234043e-05,
"loss": 0.8434,
"step": 18
},
{
"epoch": 0.20079260237780713,
"grad_norm": 1.041214478985964,
"learning_rate": 3.234042553191489e-05,
"loss": 0.8371,
"step": 19
},
{
"epoch": 0.21136063408190225,
"grad_norm": 0.9037174605431649,
"learning_rate": 3.4042553191489365e-05,
"loss": 0.825,
"step": 20
},
{
"epoch": 0.22192866578599735,
"grad_norm": 1.3140952321772827,
"learning_rate": 3.574468085106383e-05,
"loss": 0.8314,
"step": 21
},
{
"epoch": 0.23249669749009247,
"grad_norm": 1.3446525845289001,
"learning_rate": 3.74468085106383e-05,
"loss": 0.8274,
"step": 22
},
{
"epoch": 0.2430647291941876,
"grad_norm": 1.202878622232685,
"learning_rate": 3.914893617021277e-05,
"loss": 0.8126,
"step": 23
},
{
"epoch": 0.2536327608982827,
"grad_norm": 0.9767090732567484,
"learning_rate": 4.085106382978723e-05,
"loss": 0.806,
"step": 24
},
{
"epoch": 0.26420079260237783,
"grad_norm": 1.7139214140351846,
"learning_rate": 4.2553191489361704e-05,
"loss": 0.8059,
"step": 25
},
{
"epoch": 0.2747688243064729,
"grad_norm": 1.0610822836711675,
"learning_rate": 4.4255319148936176e-05,
"loss": 0.7991,
"step": 26
},
{
"epoch": 0.285336856010568,
"grad_norm": 0.9499105971028553,
"learning_rate": 4.595744680851065e-05,
"loss": 0.7972,
"step": 27
},
{
"epoch": 0.29590488771466317,
"grad_norm": 1.7217798288469996,
"learning_rate": 4.765957446808511e-05,
"loss": 0.7937,
"step": 28
},
{
"epoch": 0.30647291941875826,
"grad_norm": 1.0574506769778116,
"learning_rate": 4.936170212765958e-05,
"loss": 0.7897,
"step": 29
},
{
"epoch": 0.31704095112285335,
"grad_norm": 1.9202057568924744,
"learning_rate": 5.106382978723405e-05,
"loss": 0.7991,
"step": 30
},
{
"epoch": 0.3276089828269485,
"grad_norm": 1.345605492975133,
"learning_rate": 5.276595744680851e-05,
"loss": 0.7846,
"step": 31
},
{
"epoch": 0.3381770145310436,
"grad_norm": 1.6956356026369743,
"learning_rate": 5.446808510638298e-05,
"loss": 0.791,
"step": 32
},
{
"epoch": 0.3487450462351387,
"grad_norm": 1.5678728230382326,
"learning_rate": 5.617021276595745e-05,
"loss": 0.7865,
"step": 33
},
{
"epoch": 0.35931307793923384,
"grad_norm": 1.1508014915876,
"learning_rate": 5.7872340425531925e-05,
"loss": 0.7796,
"step": 34
},
{
"epoch": 0.36988110964332893,
"grad_norm": 1.5510984064838533,
"learning_rate": 5.9574468085106384e-05,
"loss": 0.7771,
"step": 35
},
{
"epoch": 0.380449141347424,
"grad_norm": 1.400660302660845,
"learning_rate": 6.127659574468086e-05,
"loss": 0.7693,
"step": 36
},
{
"epoch": 0.3910171730515192,
"grad_norm": 1.390600461038752,
"learning_rate": 6.297872340425533e-05,
"loss": 0.7647,
"step": 37
},
{
"epoch": 0.40158520475561427,
"grad_norm": 1.3619693240026056,
"learning_rate": 6.468085106382979e-05,
"loss": 0.7704,
"step": 38
},
{
"epoch": 0.41215323645970936,
"grad_norm": 1.4593264335764384,
"learning_rate": 6.638297872340426e-05,
"loss": 0.7534,
"step": 39
},
{
"epoch": 0.4227212681638045,
"grad_norm": 1.3716746153165715,
"learning_rate": 6.808510638297873e-05,
"loss": 0.7614,
"step": 40
},
{
"epoch": 0.4332892998678996,
"grad_norm": 1.6638664908483813,
"learning_rate": 6.97872340425532e-05,
"loss": 0.7562,
"step": 41
},
{
"epoch": 0.4438573315719947,
"grad_norm": 1.1519315113645128,
"learning_rate": 7.148936170212766e-05,
"loss": 0.7493,
"step": 42
},
{
"epoch": 0.45442536327608984,
"grad_norm": 1.2801946128196509,
"learning_rate": 7.319148936170213e-05,
"loss": 0.7488,
"step": 43
},
{
"epoch": 0.46499339498018494,
"grad_norm": 63.9505110490317,
"learning_rate": 7.48936170212766e-05,
"loss": 0.7697,
"step": 44
},
{
"epoch": 0.47556142668428003,
"grad_norm": 2.589351765723454,
"learning_rate": 7.659574468085108e-05,
"loss": 0.7888,
"step": 45
},
{
"epoch": 0.4861294583883752,
"grad_norm": 1.302706513183853,
"learning_rate": 7.829787234042553e-05,
"loss": 0.7518,
"step": 46
},
{
"epoch": 0.4966974900924703,
"grad_norm": 1.968548255582277,
"learning_rate": 8e-05,
"loss": 0.771,
"step": 47
},
{
"epoch": 0.5072655217965654,
"grad_norm": 1.4317196073460219,
"learning_rate": 7.999889681839899e-05,
"loss": 0.7447,
"step": 48
},
{
"epoch": 0.5178335535006605,
"grad_norm": 1.4264803539125535,
"learning_rate": 7.999558733444641e-05,
"loss": 0.7495,
"step": 49
},
{
"epoch": 0.5284015852047557,
"grad_norm": 1.8250063568248835,
"learning_rate": 7.999007173069037e-05,
"loss": 0.7507,
"step": 50
},
{
"epoch": 0.5389696169088507,
"grad_norm": 32.42595517441195,
"learning_rate": 7.998235031136648e-05,
"loss": 0.8633,
"step": 51
},
{
"epoch": 0.5495376486129459,
"grad_norm": 2.457580825395341,
"learning_rate": 7.997242350238117e-05,
"loss": 0.7888,
"step": 52
},
{
"epoch": 0.560105680317041,
"grad_norm": 1.073949064797192,
"learning_rate": 7.996029185128804e-05,
"loss": 0.7334,
"step": 53
},
{
"epoch": 0.570673712021136,
"grad_norm": 1.7771625101062152,
"learning_rate": 7.994595602725781e-05,
"loss": 0.7529,
"step": 54
},
{
"epoch": 0.5812417437252312,
"grad_norm": 1.461447259519702,
"learning_rate": 7.992941682104139e-05,
"loss": 0.7563,
"step": 55
},
{
"epoch": 0.5918097754293263,
"grad_norm": 1.3789190576230315,
"learning_rate": 7.991067514492614e-05,
"loss": 0.7376,
"step": 56
},
{
"epoch": 0.6023778071334214,
"grad_norm": 1.0632278085259756,
"learning_rate": 7.988973203268567e-05,
"loss": 0.7418,
"step": 57
},
{
"epoch": 0.6129458388375165,
"grad_norm": 1.3753612784385743,
"learning_rate": 7.986658863952281e-05,
"loss": 0.7302,
"step": 58
},
{
"epoch": 0.6235138705416117,
"grad_norm": 1.3066179113984184,
"learning_rate": 7.984124624200583e-05,
"loss": 0.7397,
"step": 59
},
{
"epoch": 0.6340819022457067,
"grad_norm": 1.1484727063789566,
"learning_rate": 7.981370623799803e-05,
"loss": 0.7255,
"step": 60
},
{
"epoch": 0.6446499339498019,
"grad_norm": 0.992784588525566,
"learning_rate": 7.978397014658075e-05,
"loss": 0.7272,
"step": 61
},
{
"epoch": 0.655217965653897,
"grad_norm": 0.9709816709205683,
"learning_rate": 7.97520396079694e-05,
"loss": 0.7244,
"step": 62
},
{
"epoch": 0.665785997357992,
"grad_norm": 1.410223766064041,
"learning_rate": 7.971791638342313e-05,
"loss": 0.7277,
"step": 63
},
{
"epoch": 0.6763540290620872,
"grad_norm": 0.9859003082910416,
"learning_rate": 7.96816023551476e-05,
"loss": 0.7149,
"step": 64
},
{
"epoch": 0.6869220607661823,
"grad_norm": 1.0627873899695728,
"learning_rate": 7.964309952619121e-05,
"loss": 0.7076,
"step": 65
},
{
"epoch": 0.6974900924702774,
"grad_norm": 1.2987953031941966,
"learning_rate": 7.96024100203346e-05,
"loss": 0.7148,
"step": 66
},
{
"epoch": 0.7080581241743725,
"grad_norm": 0.9847502480003636,
"learning_rate": 7.955953608197345e-05,
"loss": 0.7192,
"step": 67
},
{
"epoch": 0.7186261558784677,
"grad_norm": 1.2534186109561034,
"learning_rate": 7.951448007599478e-05,
"loss": 0.7187,
"step": 68
},
{
"epoch": 0.7291941875825627,
"grad_norm": 1.1180727067911638,
"learning_rate": 7.946724448764644e-05,
"loss": 0.7093,
"step": 69
},
{
"epoch": 0.7397622192866579,
"grad_norm": 1.3232754557732016,
"learning_rate": 7.94178319224e-05,
"loss": 0.7124,
"step": 70
},
{
"epoch": 0.750330250990753,
"grad_norm": 0.9127404976454525,
"learning_rate": 7.936624510580712e-05,
"loss": 0.7075,
"step": 71
},
{
"epoch": 0.760898282694848,
"grad_norm": 0.769180480882775,
"learning_rate": 7.931248688334915e-05,
"loss": 0.6912,
"step": 72
},
{
"epoch": 0.7714663143989432,
"grad_norm": 0.8883585292926546,
"learning_rate": 7.925656022028017e-05,
"loss": 0.6987,
"step": 73
},
{
"epoch": 0.7820343461030383,
"grad_norm": 0.963883950564473,
"learning_rate": 7.919846820146348e-05,
"loss": 0.704,
"step": 74
},
{
"epoch": 0.7926023778071334,
"grad_norm": 0.8863497475275227,
"learning_rate": 7.913821403120139e-05,
"loss": 0.6945,
"step": 75
},
{
"epoch": 0.8031704095112285,
"grad_norm": 1.0426799292571676,
"learning_rate": 7.90758010330585e-05,
"loss": 0.6849,
"step": 76
},
{
"epoch": 0.8137384412153237,
"grad_norm": 0.8243890076961332,
"learning_rate": 7.901123264967836e-05,
"loss": 0.6979,
"step": 77
},
{
"epoch": 0.8243064729194187,
"grad_norm": 1.1652167740266006,
"learning_rate": 7.894451244259363e-05,
"loss": 0.6905,
"step": 78
},
{
"epoch": 0.8348745046235139,
"grad_norm": 1.2465660396622233,
"learning_rate": 7.887564409202953e-05,
"loss": 0.6872,
"step": 79
},
{
"epoch": 0.845442536327609,
"grad_norm": 0.5355690010674495,
"learning_rate": 7.880463139670091e-05,
"loss": 0.6848,
"step": 80
},
{
"epoch": 0.8560105680317041,
"grad_norm": 1.419104255807733,
"learning_rate": 7.873147827360273e-05,
"loss": 0.6878,
"step": 81
},
{
"epoch": 0.8665785997357992,
"grad_norm": 0.6749893210811997,
"learning_rate": 7.865618875779398e-05,
"loss": 0.678,
"step": 82
},
{
"epoch": 0.8771466314398944,
"grad_norm": 0.7732270985786504,
"learning_rate": 7.857876700217508e-05,
"loss": 0.6848,
"step": 83
},
{
"epoch": 0.8877146631439894,
"grad_norm": 0.6755156642812311,
"learning_rate": 7.849921727725882e-05,
"loss": 0.679,
"step": 84
},
{
"epoch": 0.8982826948480845,
"grad_norm": 0.6421189276197394,
"learning_rate": 7.841754397093487e-05,
"loss": 0.6728,
"step": 85
},
{
"epoch": 0.9088507265521797,
"grad_norm": 0.49153205280258144,
"learning_rate": 7.833375158822766e-05,
"loss": 0.6827,
"step": 86
},
{
"epoch": 0.9194187582562747,
"grad_norm": 0.7083774145182355,
"learning_rate": 7.824784475104795e-05,
"loss": 0.6813,
"step": 87
},
{
"epoch": 0.9299867899603699,
"grad_norm": 1.1312836778877742,
"learning_rate": 7.815982819793784e-05,
"loss": 0.6776,
"step": 88
},
{
"epoch": 0.940554821664465,
"grad_norm": 1.4566456596110349,
"learning_rate": 7.806970678380943e-05,
"loss": 0.6828,
"step": 89
},
{
"epoch": 0.9511228533685601,
"grad_norm": 0.641648090796948,
"learning_rate": 7.797748547967701e-05,
"loss": 0.6689,
"step": 90
},
{
"epoch": 0.9616908850726552,
"grad_norm": 1.2689027097316996,
"learning_rate": 7.788316937238287e-05,
"loss": 0.6887,
"step": 91
},
{
"epoch": 0.9722589167767504,
"grad_norm": 1.0288877116771369,
"learning_rate": 7.778676366431676e-05,
"loss": 0.6731,
"step": 92
},
{
"epoch": 0.9828269484808454,
"grad_norm": 1.111134713608855,
"learning_rate": 7.768827367312882e-05,
"loss": 0.6772,
"step": 93
},
{
"epoch": 0.9933949801849405,
"grad_norm": 0.9415602963246132,
"learning_rate": 7.758770483143634e-05,
"loss": 0.6764,
"step": 94
},
{
"epoch": 1.0039630118890357,
"grad_norm": 0.766369432032264,
"learning_rate": 7.748506268652415e-05,
"loss": 0.6581,
"step": 95
},
{
"epoch": 1.0145310435931307,
"grad_norm": 0.6235125904372976,
"learning_rate": 7.73803529000385e-05,
"loss": 0.639,
"step": 96
},
{
"epoch": 1.025099075297226,
"grad_norm": 0.5687108049822664,
"learning_rate": 7.727358124767491e-05,
"loss": 0.6307,
"step": 97
},
{
"epoch": 1.035667107001321,
"grad_norm": 0.7768949803155286,
"learning_rate": 7.71647536188595e-05,
"loss": 0.6287,
"step": 98
},
{
"epoch": 1.046235138705416,
"grad_norm": 1.0822134265138843,
"learning_rate": 7.705387601642416e-05,
"loss": 0.6316,
"step": 99
},
{
"epoch": 1.0568031704095113,
"grad_norm": 0.6600814825591903,
"learning_rate": 7.694095455627542e-05,
"loss": 0.6209,
"step": 100
},
{
"epoch": 1.0673712021136064,
"grad_norm": 0.5092321091554813,
"learning_rate": 7.682599546705716e-05,
"loss": 0.6268,
"step": 101
},
{
"epoch": 1.0779392338177014,
"grad_norm": 0.5538152767155732,
"learning_rate": 7.670900508980697e-05,
"loss": 0.629,
"step": 102
},
{
"epoch": 1.0885072655217967,
"grad_norm": 0.7759780022913368,
"learning_rate": 7.658998987760645e-05,
"loss": 0.6266,
"step": 103
},
{
"epoch": 1.0990752972258917,
"grad_norm": 1.0071580985859072,
"learning_rate": 7.646895639522518e-05,
"loss": 0.634,
"step": 104
},
{
"epoch": 1.1096433289299867,
"grad_norm": 0.9913392623454966,
"learning_rate": 7.634591131875875e-05,
"loss": 0.6334,
"step": 105
},
{
"epoch": 1.120211360634082,
"grad_norm": 0.9887888884996264,
"learning_rate": 7.622086143526036e-05,
"loss": 0.6271,
"step": 106
},
{
"epoch": 1.130779392338177,
"grad_norm": 1.0219278471051096,
"learning_rate": 7.609381364236655e-05,
"loss": 0.6224,
"step": 107
},
{
"epoch": 1.141347424042272,
"grad_norm": 0.9226284872759164,
"learning_rate": 7.59647749479167e-05,
"loss": 0.62,
"step": 108
},
{
"epoch": 1.1519154557463673,
"grad_norm": 0.9215854604914202,
"learning_rate": 7.583375246956648e-05,
"loss": 0.6295,
"step": 109
},
{
"epoch": 1.1624834874504624,
"grad_norm": 1.2986154846688018,
"learning_rate": 7.570075343439526e-05,
"loss": 0.6245,
"step": 110
},
{
"epoch": 1.1730515191545574,
"grad_norm": 0.5049907163605463,
"learning_rate": 7.556578517850747e-05,
"loss": 0.6244,
"step": 111
},
{
"epoch": 1.1836195508586527,
"grad_norm": 0.8014907412493998,
"learning_rate": 7.542885514662794e-05,
"loss": 0.6233,
"step": 112
},
{
"epoch": 1.1941875825627477,
"grad_norm": 1.1622767091260562,
"learning_rate": 7.528997089169128e-05,
"loss": 0.6235,
"step": 113
},
{
"epoch": 1.2047556142668427,
"grad_norm": 0.7380452282238854,
"learning_rate": 7.51491400744252e-05,
"loss": 0.6228,
"step": 114
},
{
"epoch": 1.215323645970938,
"grad_norm": 0.7597600873008943,
"learning_rate": 7.500637046292803e-05,
"loss": 0.621,
"step": 115
},
{
"epoch": 1.225891677675033,
"grad_norm": 0.7234884283262764,
"learning_rate": 7.48616699322402e-05,
"loss": 0.6194,
"step": 116
},
{
"epoch": 1.236459709379128,
"grad_norm": 0.49171596139374824,
"learning_rate": 7.471504646390987e-05,
"loss": 0.6197,
"step": 117
},
{
"epoch": 1.2470277410832233,
"grad_norm": 0.6952021893115369,
"learning_rate": 7.456650814555267e-05,
"loss": 0.6185,
"step": 118
},
{
"epoch": 1.2575957727873184,
"grad_norm": 0.7911960205209884,
"learning_rate": 7.441606317040558e-05,
"loss": 0.6204,
"step": 119
},
{
"epoch": 1.2681638044914134,
"grad_norm": 0.5604716814050085,
"learning_rate": 7.426371983687503e-05,
"loss": 0.6136,
"step": 120
},
{
"epoch": 1.2787318361955085,
"grad_norm": 0.5391019590519042,
"learning_rate": 7.410948654807916e-05,
"loss": 0.6141,
"step": 121
},
{
"epoch": 1.2892998678996037,
"grad_norm": 0.4473178756510967,
"learning_rate": 7.39533718113843e-05,
"loss": 0.618,
"step": 122
},
{
"epoch": 1.2998678996036988,
"grad_norm": 0.3676324142572552,
"learning_rate": 7.379538423793568e-05,
"loss": 0.6181,
"step": 123
},
{
"epoch": 1.310435931307794,
"grad_norm": 0.49641800089735266,
"learning_rate": 7.363553254218253e-05,
"loss": 0.6162,
"step": 124
},
{
"epoch": 1.321003963011889,
"grad_norm": 0.5596386402088841,
"learning_rate": 7.347382554139733e-05,
"loss": 0.6137,
"step": 125
},
{
"epoch": 1.331571994715984,
"grad_norm": 0.37385072844495554,
"learning_rate": 7.331027215518949e-05,
"loss": 0.6047,
"step": 126
},
{
"epoch": 1.3421400264200791,
"grad_norm": 0.46801024402125285,
"learning_rate": 7.31448814050133e-05,
"loss": 0.6074,
"step": 127
},
{
"epoch": 1.3527080581241744,
"grad_norm": 0.38199501822744514,
"learning_rate": 7.297766241367041e-05,
"loss": 0.6081,
"step": 128
},
{
"epoch": 1.3632760898282694,
"grad_norm": 0.2644636432614496,
"learning_rate": 7.280862440480658e-05,
"loss": 0.6083,
"step": 129
},
{
"epoch": 1.3738441215323647,
"grad_norm": 0.3642023310466234,
"learning_rate": 7.263777670240282e-05,
"loss": 0.6163,
"step": 130
},
{
"epoch": 1.3844121532364597,
"grad_norm": 0.3286254245460909,
"learning_rate": 7.246512873026125e-05,
"loss": 0.6105,
"step": 131
},
{
"epoch": 1.3949801849405548,
"grad_norm": 0.27645048682447476,
"learning_rate": 7.229069001148518e-05,
"loss": 0.6047,
"step": 132
},
{
"epoch": 1.40554821664465,
"grad_norm": 0.31935620714929114,
"learning_rate": 7.211447016795388e-05,
"loss": 0.6159,
"step": 133
},
{
"epoch": 1.416116248348745,
"grad_norm": 0.27637648695223127,
"learning_rate": 7.193647891979177e-05,
"loss": 0.6076,
"step": 134
},
{
"epoch": 1.42668428005284,
"grad_norm": 0.2632383089776466,
"learning_rate": 7.17567260848324e-05,
"loss": 0.6134,
"step": 135
},
{
"epoch": 1.4372523117569354,
"grad_norm": 0.36576847146483216,
"learning_rate": 7.157522157807675e-05,
"loss": 0.6097,
"step": 136
},
{
"epoch": 1.4478203434610304,
"grad_norm": 0.49829210057276635,
"learning_rate": 7.139197541114645e-05,
"loss": 0.6076,
"step": 137
},
{
"epoch": 1.4583883751651254,
"grad_norm": 0.6715032042462543,
"learning_rate": 7.120699769173149e-05,
"loss": 0.6079,
"step": 138
},
{
"epoch": 1.4689564068692207,
"grad_norm": 0.8872767611634563,
"learning_rate": 7.10202986230327e-05,
"loss": 0.6106,
"step": 139
},
{
"epoch": 1.4795244385733157,
"grad_norm": 1.3094337587734477,
"learning_rate": 7.083188850319895e-05,
"loss": 0.6249,
"step": 140
},
{
"epoch": 1.4900924702774108,
"grad_norm": 0.5732521894924236,
"learning_rate": 7.064177772475912e-05,
"loss": 0.6184,
"step": 141
},
{
"epoch": 1.500660501981506,
"grad_norm": 0.5968155749547341,
"learning_rate": 7.044997677404888e-05,
"loss": 0.6132,
"step": 142
},
{
"epoch": 1.511228533685601,
"grad_norm": 1.2108186694176202,
"learning_rate": 7.025649623063223e-05,
"loss": 0.6261,
"step": 143
},
{
"epoch": 1.521796565389696,
"grad_norm": 0.7492638271367434,
"learning_rate": 7.006134676671791e-05,
"loss": 0.6097,
"step": 144
},
{
"epoch": 1.5323645970937911,
"grad_norm": 0.520895839877394,
"learning_rate": 6.986453914657083e-05,
"loss": 0.6097,
"step": 145
},
{
"epoch": 1.5429326287978864,
"grad_norm": 0.6782854061408391,
"learning_rate": 6.96660842259183e-05,
"loss": 0.6175,
"step": 146
},
{
"epoch": 1.5535006605019817,
"grad_norm": 0.5917212461488441,
"learning_rate": 6.946599295135116e-05,
"loss": 0.6142,
"step": 147
},
{
"epoch": 1.5640686922060767,
"grad_norm": 0.5981371046939804,
"learning_rate": 6.926427635972003e-05,
"loss": 0.6083,
"step": 148
},
{
"epoch": 1.5746367239101717,
"grad_norm": 0.8111312440510982,
"learning_rate": 6.906094557752654e-05,
"loss": 0.6126,
"step": 149
},
{
"epoch": 1.5852047556142668,
"grad_norm": 0.9645422516897456,
"learning_rate": 6.885601182030958e-05,
"loss": 0.6143,
"step": 150
},
{
"epoch": 1.5957727873183618,
"grad_norm": 0.8593693771282125,
"learning_rate": 6.864948639202667e-05,
"loss": 0.6165,
"step": 151
},
{
"epoch": 1.606340819022457,
"grad_norm": 0.6782193407228015,
"learning_rate": 6.844138068443043e-05,
"loss": 0.617,
"step": 152
},
{
"epoch": 1.6169088507265523,
"grad_norm": 0.5133505641873833,
"learning_rate": 6.823170617644029e-05,
"loss": 0.6097,
"step": 153
},
{
"epoch": 1.6274768824306474,
"grad_norm": 0.3188527654318755,
"learning_rate": 6.802047443350915e-05,
"loss": 0.6019,
"step": 154
},
{
"epoch": 1.6380449141347424,
"grad_norm": 0.36046564912487544,
"learning_rate": 6.78076971069857e-05,
"loss": 0.603,
"step": 155
},
{
"epoch": 1.6486129458388374,
"grad_norm": 0.3434440344937455,
"learning_rate": 6.759338593347148e-05,
"loss": 0.614,
"step": 156
},
{
"epoch": 1.6591809775429325,
"grad_norm": 0.4207528199987266,
"learning_rate": 6.737755273417367e-05,
"loss": 0.6057,
"step": 157
},
{
"epoch": 1.6697490092470277,
"grad_norm": 0.4376375758003335,
"learning_rate": 6.716020941425302e-05,
"loss": 0.6101,
"step": 158
},
{
"epoch": 1.680317040951123,
"grad_norm": 0.3670986836675996,
"learning_rate": 6.694136796216706e-05,
"loss": 0.6074,
"step": 159
},
{
"epoch": 1.690885072655218,
"grad_norm": 0.3716710535382814,
"learning_rate": 6.672104044900901e-05,
"loss": 0.607,
"step": 160
},
{
"epoch": 1.701453104359313,
"grad_norm": 0.29349567743537014,
"learning_rate": 6.649923902784178e-05,
"loss": 0.6049,
"step": 161
},
{
"epoch": 1.7120211360634081,
"grad_norm": 0.37774756737109916,
"learning_rate": 6.627597593302772e-05,
"loss": 0.6004,
"step": 162
},
{
"epoch": 1.7225891677675031,
"grad_norm": 0.34520927764325005,
"learning_rate": 6.605126347955376e-05,
"loss": 0.5987,
"step": 163
},
{
"epoch": 1.7331571994715984,
"grad_norm": 0.37358546648640356,
"learning_rate": 6.58251140623521e-05,
"loss": 0.6037,
"step": 164
},
{
"epoch": 1.7437252311756937,
"grad_norm": 0.3803331681549159,
"learning_rate": 6.559754015561655e-05,
"loss": 0.6057,
"step": 165
},
{
"epoch": 1.7542932628797887,
"grad_norm": 0.419367642300496,
"learning_rate": 6.536855431211445e-05,
"loss": 0.6006,
"step": 166
},
{
"epoch": 1.7648612945838837,
"grad_norm": 0.3700144217552004,
"learning_rate": 6.513816916249427e-05,
"loss": 0.6029,
"step": 167
},
{
"epoch": 1.7754293262879788,
"grad_norm": 0.3304108780784769,
"learning_rate": 6.490639741458891e-05,
"loss": 0.609,
"step": 168
},
{
"epoch": 1.7859973579920738,
"grad_norm": 0.38491358241980217,
"learning_rate": 6.46732518527148e-05,
"loss": 0.5992,
"step": 169
},
{
"epoch": 1.796565389696169,
"grad_norm": 0.41792070367182416,
"learning_rate": 6.443874533696662e-05,
"loss": 0.5997,
"step": 170
},
{
"epoch": 1.8071334214002643,
"grad_norm": 0.31857578884550497,
"learning_rate": 6.420289080250804e-05,
"loss": 0.5976,
"step": 171
},
{
"epoch": 1.8177014531043594,
"grad_norm": 0.2921197050452141,
"learning_rate": 6.396570125885823e-05,
"loss": 0.6028,
"step": 172
},
{
"epoch": 1.8282694848084544,
"grad_norm": 0.32956079744279115,
"learning_rate": 6.372718978917421e-05,
"loss": 0.5959,
"step": 173
},
{
"epoch": 1.8388375165125495,
"grad_norm": 0.3363640406873935,
"learning_rate": 6.348736954952923e-05,
"loss": 0.5979,
"step": 174
},
{
"epoch": 1.8494055482166445,
"grad_norm": 0.2827997180555132,
"learning_rate": 6.324625376818707e-05,
"loss": 0.5963,
"step": 175
},
{
"epoch": 1.8599735799207398,
"grad_norm": 0.24952707846951472,
"learning_rate": 6.300385574487242e-05,
"loss": 0.5962,
"step": 176
},
{
"epoch": 1.870541611624835,
"grad_norm": 0.30585781576809906,
"learning_rate": 6.276018885003727e-05,
"loss": 0.5979,
"step": 177
},
{
"epoch": 1.88110964332893,
"grad_norm": 0.35383483624278156,
"learning_rate": 6.251526652412335e-05,
"loss": 0.5975,
"step": 178
},
{
"epoch": 1.891677675033025,
"grad_norm": 0.3057303385681727,
"learning_rate": 6.226910227682087e-05,
"loss": 0.5974,
"step": 179
},
{
"epoch": 1.9022457067371201,
"grad_norm": 0.3227218227112165,
"learning_rate": 6.202170968632324e-05,
"loss": 0.5984,
"step": 180
},
{
"epoch": 1.9128137384412152,
"grad_norm": 0.3099468501510001,
"learning_rate": 6.177310239857815e-05,
"loss": 0.6006,
"step": 181
},
{
"epoch": 1.9233817701453104,
"grad_norm": 0.3029754372937815,
"learning_rate": 6.152329412653491e-05,
"loss": 0.5937,
"step": 182
},
{
"epoch": 1.9339498018494057,
"grad_norm": 0.31003590660857305,
"learning_rate": 6.127229864938798e-05,
"loss": 0.6031,
"step": 183
},
{
"epoch": 1.9445178335535007,
"grad_norm": 0.4090438325379304,
"learning_rate": 6.1020129811816985e-05,
"loss": 0.5995,
"step": 184
},
{
"epoch": 1.9550858652575958,
"grad_norm": 0.47046399479064277,
"learning_rate": 6.076680152322302e-05,
"loss": 0.597,
"step": 185
},
{
"epoch": 1.9656538969616908,
"grad_norm": 0.4389393519962548,
"learning_rate": 6.051232775696143e-05,
"loss": 0.6003,
"step": 186
},
{
"epoch": 1.9762219286657858,
"grad_norm": 0.3946851961823208,
"learning_rate": 6.025672254957106e-05,
"loss": 0.5961,
"step": 187
},
{
"epoch": 1.986789960369881,
"grad_norm": 0.3581648113750019,
"learning_rate": 6.000000000000001e-05,
"loss": 0.6014,
"step": 188
},
{
"epoch": 1.9973579920739764,
"grad_norm": 0.32698044327640924,
"learning_rate": 5.9742174268827936e-05,
"loss": 0.6018,
"step": 189
},
{
"epoch": 2.0079260237780714,
"grad_norm": 0.3725595521245188,
"learning_rate": 5.948325957748498e-05,
"loss": 0.5527,
"step": 190
},
{
"epoch": 2.0184940554821664,
"grad_norm": 0.40702510391048585,
"learning_rate": 5.9223270207467355e-05,
"loss": 0.5457,
"step": 191
},
{
"epoch": 2.0290620871862615,
"grad_norm": 0.5137742231756651,
"learning_rate": 5.896222049954951e-05,
"loss": 0.5365,
"step": 192
},
{
"epoch": 2.0396301188903565,
"grad_norm": 0.5833504408833431,
"learning_rate": 5.870012485299318e-05,
"loss": 0.5339,
"step": 193
},
{
"epoch": 2.050198150594452,
"grad_norm": 0.5020674212672895,
"learning_rate": 5.843699772475312e-05,
"loss": 0.5344,
"step": 194
},
{
"epoch": 2.060766182298547,
"grad_norm": 0.5109669618772644,
"learning_rate": 5.8172853628679676e-05,
"loss": 0.5373,
"step": 195
},
{
"epoch": 2.071334214002642,
"grad_norm": 0.6079777296591362,
"learning_rate": 5.790770713471816e-05,
"loss": 0.5338,
"step": 196
},
{
"epoch": 2.081902245706737,
"grad_norm": 0.7717054202322884,
"learning_rate": 5.764157286810527e-05,
"loss": 0.5413,
"step": 197
},
{
"epoch": 2.092470277410832,
"grad_norm": 1.0263835129177117,
"learning_rate": 5.7374465508562324e-05,
"loss": 0.5419,
"step": 198
},
{
"epoch": 2.103038309114927,
"grad_norm": 1.1304188828796682,
"learning_rate": 5.710639978948555e-05,
"loss": 0.5401,
"step": 199
},
{
"epoch": 2.1136063408190227,
"grad_norm": 0.5561546704438505,
"learning_rate": 5.6837390497133406e-05,
"loss": 0.5371,
"step": 200
},
{
"epoch": 2.1241743725231177,
"grad_norm": 0.5094878513846206,
"learning_rate": 5.6567452469810984e-05,
"loss": 0.5307,
"step": 201
},
{
"epoch": 2.1347424042272127,
"grad_norm": 0.861314347361121,
"learning_rate": 5.629660059705153e-05,
"loss": 0.5405,
"step": 202
},
{
"epoch": 2.1453104359313078,
"grad_norm": 0.7744305400405808,
"learning_rate": 5.602484981879519e-05,
"loss": 0.5399,
"step": 203
},
{
"epoch": 2.155878467635403,
"grad_norm": 0.5568154422551339,
"learning_rate": 5.5752215124564895e-05,
"loss": 0.534,
"step": 204
},
{
"epoch": 2.166446499339498,
"grad_norm": 0.4932283903917851,
"learning_rate": 5.547871155263955e-05,
"loss": 0.5427,
"step": 205
},
{
"epoch": 2.1770145310435933,
"grad_norm": 0.6026007057555373,
"learning_rate": 5.5204354189224596e-05,
"loss": 0.5372,
"step": 206
},
{
"epoch": 2.1875825627476884,
"grad_norm": 0.5907730425638954,
"learning_rate": 5.492915816761979e-05,
"loss": 0.5339,
"step": 207
},
{
"epoch": 2.1981505944517834,
"grad_norm": 0.38702716198497655,
"learning_rate": 5.465313866738454e-05,
"loss": 0.5399,
"step": 208
},
{
"epoch": 2.2087186261558784,
"grad_norm": 0.4818332449688006,
"learning_rate": 5.4376310913500514e-05,
"loss": 0.5341,
"step": 209
},
{
"epoch": 2.2192866578599735,
"grad_norm": 0.48527023542347636,
"learning_rate": 5.409869017553199e-05,
"loss": 0.5443,
"step": 210
},
{
"epoch": 2.2298546895640685,
"grad_norm": 0.38988747831854015,
"learning_rate": 5.382029176678345e-05,
"loss": 0.5325,
"step": 211
},
{
"epoch": 2.240422721268164,
"grad_norm": 0.3382464323770342,
"learning_rate": 5.354113104345503e-05,
"loss": 0.5381,
"step": 212
},
{
"epoch": 2.250990752972259,
"grad_norm": 0.344596068112714,
"learning_rate": 5.326122340379539e-05,
"loss": 0.5393,
"step": 213
},
{
"epoch": 2.261558784676354,
"grad_norm": 0.38043335701597286,
"learning_rate": 5.2980584287252456e-05,
"loss": 0.5354,
"step": 214
},
{
"epoch": 2.272126816380449,
"grad_norm": 0.31330954952683393,
"learning_rate": 5.269922917362171e-05,
"loss": 0.5347,
"step": 215
},
{
"epoch": 2.282694848084544,
"grad_norm": 0.3394061867349534,
"learning_rate": 5.241717358219239e-05,
"loss": 0.5359,
"step": 216
},
{
"epoch": 2.293262879788639,
"grad_norm": 0.353132616874619,
"learning_rate": 5.213443307089144e-05,
"loss": 0.53,
"step": 217
},
{
"epoch": 2.3038309114927347,
"grad_norm": 0.2871048446747761,
"learning_rate": 5.1851023235425366e-05,
"loss": 0.5396,
"step": 218
},
{
"epoch": 2.3143989431968297,
"grad_norm": 0.3676997391938742,
"learning_rate": 5.156695970841997e-05,
"loss": 0.5319,
"step": 219
},
{
"epoch": 2.3249669749009247,
"grad_norm": 0.3034692647102483,
"learning_rate": 5.128225815855805e-05,
"loss": 0.539,
"step": 220
},
{
"epoch": 2.33553500660502,
"grad_norm": 0.29670953140630685,
"learning_rate": 5.099693428971522e-05,
"loss": 0.5357,
"step": 221
},
{
"epoch": 2.346103038309115,
"grad_norm": 0.3462570910330776,
"learning_rate": 5.0711003840093583e-05,
"loss": 0.5382,
"step": 222
},
{
"epoch": 2.35667107001321,
"grad_norm": 0.30320212822308745,
"learning_rate": 5.042448258135371e-05,
"loss": 0.5398,
"step": 223
},
{
"epoch": 2.3672391017173053,
"grad_norm": 0.3253569159099642,
"learning_rate": 5.013738631774463e-05,
"loss": 0.5403,
"step": 224
},
{
"epoch": 2.3778071334214004,
"grad_norm": 0.2992173696284864,
"learning_rate": 4.984973088523216e-05,
"loss": 0.5318,
"step": 225
},
{
"epoch": 2.3883751651254954,
"grad_norm": 0.23806839402955893,
"learning_rate": 4.9561532150625305e-05,
"loss": 0.5295,
"step": 226
},
{
"epoch": 2.3989431968295905,
"grad_norm": 0.2836158566149504,
"learning_rate": 4.927280601070114e-05,
"loss": 0.5273,
"step": 227
},
{
"epoch": 2.4095112285336855,
"grad_norm": 0.24042182902261217,
"learning_rate": 4.898356839132793e-05,
"loss": 0.5302,
"step": 228
},
{
"epoch": 2.4200792602377805,
"grad_norm": 0.23090066826959527,
"learning_rate": 4.869383524658668e-05,
"loss": 0.5378,
"step": 229
},
{
"epoch": 2.430647291941876,
"grad_norm": 0.23036421955638248,
"learning_rate": 4.840362255789112e-05,
"loss": 0.5324,
"step": 230
},
{
"epoch": 2.441215323645971,
"grad_norm": 0.2086043982624101,
"learning_rate": 4.811294633310617e-05,
"loss": 0.5355,
"step": 231
},
{
"epoch": 2.451783355350066,
"grad_norm": 0.2662049795738657,
"learning_rate": 4.782182260566498e-05,
"loss": 0.5321,
"step": 232
},
{
"epoch": 2.462351387054161,
"grad_norm": 0.17282690102361992,
"learning_rate": 4.7530267433684546e-05,
"loss": 0.5322,
"step": 233
},
{
"epoch": 2.472919418758256,
"grad_norm": 0.23839732691466814,
"learning_rate": 4.723829689907993e-05,
"loss": 0.5332,
"step": 234
},
{
"epoch": 2.483487450462351,
"grad_norm": 0.220755685419314,
"learning_rate": 4.694592710667723e-05,
"loss": 0.5289,
"step": 235
},
{
"epoch": 2.4940554821664467,
"grad_norm": 0.17887769697236458,
"learning_rate": 4.665317418332521e-05,
"loss": 0.5302,
"step": 236
},
{
"epoch": 2.5046235138705417,
"grad_norm": 0.21214814477629076,
"learning_rate": 4.6360054277005826e-05,
"loss": 0.5311,
"step": 237
},
{
"epoch": 2.5151915455746368,
"grad_norm": 0.169870987072969,
"learning_rate": 4.606658355594344e-05,
"loss": 0.5308,
"step": 238
},
{
"epoch": 2.525759577278732,
"grad_norm": 0.23328757071293307,
"learning_rate": 4.577277820771307e-05,
"loss": 0.531,
"step": 239
},
{
"epoch": 2.536327608982827,
"grad_norm": 0.2082393856633939,
"learning_rate": 4.5478654438347414e-05,
"loss": 0.5369,
"step": 240
},
{
"epoch": 2.5468956406869223,
"grad_norm": 0.14478055321484706,
"learning_rate": 4.518422847144304e-05,
"loss": 0.5323,
"step": 241
},
{
"epoch": 2.557463672391017,
"grad_norm": 0.22316005053591764,
"learning_rate": 4.488951654726539e-05,
"loss": 0.5286,
"step": 242
},
{
"epoch": 2.5680317040951124,
"grad_norm": 0.17285901709749715,
"learning_rate": 4.4594534921853096e-05,
"loss": 0.5362,
"step": 243
},
{
"epoch": 2.5785997357992074,
"grad_norm": 0.2122181023189799,
"learning_rate": 4.429929986612125e-05,
"loss": 0.5351,
"step": 244
},
{
"epoch": 2.5891677675033025,
"grad_norm": 0.19160763459008834,
"learning_rate": 4.400382766496394e-05,
"loss": 0.5322,
"step": 245
},
{
"epoch": 2.5997357992073975,
"grad_norm": 0.17931339221155976,
"learning_rate": 4.3708134616355934e-05,
"loss": 0.5291,
"step": 246
},
{
"epoch": 2.6103038309114925,
"grad_norm": 0.18662561546217404,
"learning_rate": 4.341223703045379e-05,
"loss": 0.5348,
"step": 247
},
{
"epoch": 2.620871862615588,
"grad_norm": 0.15830313964721893,
"learning_rate": 4.311615122869613e-05,
"loss": 0.5434,
"step": 248
},
{
"epoch": 2.631439894319683,
"grad_norm": 0.19402467010937804,
"learning_rate": 4.281989354290341e-05,
"loss": 0.5433,
"step": 249
},
{
"epoch": 2.642007926023778,
"grad_norm": 0.20042343107408042,
"learning_rate": 4.2523480314376996e-05,
"loss": 0.5327,
"step": 250
},
{
"epoch": 2.652575957727873,
"grad_norm": 0.165259104609407,
"learning_rate": 4.222692789299794e-05,
"loss": 0.5389,
"step": 251
},
{
"epoch": 2.663143989431968,
"grad_norm": 0.21602254674705257,
"learning_rate": 4.193025263632495e-05,
"loss": 0.5254,
"step": 252
},
{
"epoch": 2.6737120211360637,
"grad_norm": 0.1679859643732669,
"learning_rate": 4.163347090869227e-05,
"loss": 0.5375,
"step": 253
},
{
"epoch": 2.6842800528401582,
"grad_norm": 0.17225237105164173,
"learning_rate": 4.133659908030699e-05,
"loss": 0.5342,
"step": 254
},
{
"epoch": 2.6948480845442537,
"grad_norm": 0.1620251583234328,
"learning_rate": 4.103965352634604e-05,
"loss": 0.5328,
"step": 255
},
{
"epoch": 2.7054161162483488,
"grad_norm": 0.16680355622763784,
"learning_rate": 4.0742650626053004e-05,
"loss": 0.5293,
"step": 256
},
{
"epoch": 2.715984147952444,
"grad_norm": 0.15521980569419397,
"learning_rate": 4.044560676183462e-05,
"loss": 0.5371,
"step": 257
},
{
"epoch": 2.726552179656539,
"grad_norm": 0.1574325532000935,
"learning_rate": 4.014853831835721e-05,
"loss": 0.5331,
"step": 258
},
{
"epoch": 2.737120211360634,
"grad_norm": 0.16871282749949595,
"learning_rate": 3.985146168164281e-05,
"loss": 0.5388,
"step": 259
},
{
"epoch": 2.7476882430647294,
"grad_norm": 0.1553640449532965,
"learning_rate": 3.9554393238165386e-05,
"loss": 0.5284,
"step": 260
},
{
"epoch": 2.7582562747688244,
"grad_norm": 0.15753872116807552,
"learning_rate": 3.9257349373947016e-05,
"loss": 0.5342,
"step": 261
},
{
"epoch": 2.7688243064729194,
"grad_norm": 0.1510488137537568,
"learning_rate": 3.896034647365398e-05,
"loss": 0.5329,
"step": 262
},
{
"epoch": 2.7793923381770145,
"grad_norm": 0.17598454072482386,
"learning_rate": 3.866340091969303e-05,
"loss": 0.5344,
"step": 263
},
{
"epoch": 2.7899603698811095,
"grad_norm": 0.16623830505420029,
"learning_rate": 3.836652909130774e-05,
"loss": 0.5273,
"step": 264
},
{
"epoch": 2.800528401585205,
"grad_norm": 0.1959528022905974,
"learning_rate": 3.806974736367507e-05,
"loss": 0.5324,
"step": 265
},
{
"epoch": 2.8110964332893,
"grad_norm": 0.14859823867573574,
"learning_rate": 3.7773072107002084e-05,
"loss": 0.5334,
"step": 266
},
{
"epoch": 2.821664464993395,
"grad_norm": 0.19136907727546654,
"learning_rate": 3.747651968562302e-05,
"loss": 0.5283,
"step": 267
},
{
"epoch": 2.83223249669749,
"grad_norm": 0.1741838181803494,
"learning_rate": 3.718010645709661e-05,
"loss": 0.5374,
"step": 268
},
{
"epoch": 2.842800528401585,
"grad_norm": 0.15429083348412015,
"learning_rate": 3.688384877130388e-05,
"loss": 0.5348,
"step": 269
},
{
"epoch": 2.85336856010568,
"grad_norm": 0.19719371528147198,
"learning_rate": 3.658776296954622e-05,
"loss": 0.5339,
"step": 270
},
{
"epoch": 2.8639365918097752,
"grad_norm": 0.15789471821578627,
"learning_rate": 3.629186538364408e-05,
"loss": 0.5339,
"step": 271
},
{
"epoch": 2.8745046235138707,
"grad_norm": 0.16444809265187393,
"learning_rate": 3.5996172335036065e-05,
"loss": 0.54,
"step": 272
},
{
"epoch": 2.8850726552179657,
"grad_norm": 0.18205663719249998,
"learning_rate": 3.570070013387876e-05,
"loss": 0.5326,
"step": 273
},
{
"epoch": 2.895640686922061,
"grad_norm": 0.1638276415587385,
"learning_rate": 3.540546507814692e-05,
"loss": 0.5336,
"step": 274
},
{
"epoch": 2.906208718626156,
"grad_norm": 0.14430321990734493,
"learning_rate": 3.5110483452734633e-05,
"loss": 0.5333,
"step": 275
},
{
"epoch": 2.916776750330251,
"grad_norm": 0.15779067389210275,
"learning_rate": 3.4815771528556976e-05,
"loss": 0.5295,
"step": 276
},
{
"epoch": 2.9273447820343463,
"grad_norm": 0.16510325621420335,
"learning_rate": 3.452134556165259e-05,
"loss": 0.5322,
"step": 277
},
{
"epoch": 2.9379128137384414,
"grad_norm": 0.15335282480316584,
"learning_rate": 3.4227221792286945e-05,
"loss": 0.5266,
"step": 278
},
{
"epoch": 2.9484808454425364,
"grad_norm": 0.14207500884610488,
"learning_rate": 3.393341644405657e-05,
"loss": 0.5284,
"step": 279
},
{
"epoch": 2.9590488771466315,
"grad_norm": 0.14817401425997304,
"learning_rate": 3.363994572299418e-05,
"loss": 0.5305,
"step": 280
},
{
"epoch": 2.9696169088507265,
"grad_norm": 0.13417873579347814,
"learning_rate": 3.3346825816674804e-05,
"loss": 0.5285,
"step": 281
},
{
"epoch": 2.9801849405548215,
"grad_norm": 0.1933271122465297,
"learning_rate": 3.305407289332279e-05,
"loss": 0.5336,
"step": 282
},
{
"epoch": 2.9907529722589166,
"grad_norm": 0.14082276570620816,
"learning_rate": 3.276170310092008e-05,
"loss": 0.5314,
"step": 283
},
{
"epoch": 3.001321003963012,
"grad_norm": 0.16572738564748216,
"learning_rate": 3.246973256631546e-05,
"loss": 0.5233,
"step": 284
},
{
"epoch": 3.011889035667107,
"grad_norm": 0.27880171253028246,
"learning_rate": 3.217817739433502e-05,
"loss": 0.4778,
"step": 285
},
{
"epoch": 3.022457067371202,
"grad_norm": 0.2862871579036325,
"learning_rate": 3.1887053666893834e-05,
"loss": 0.4793,
"step": 286
},
{
"epoch": 3.033025099075297,
"grad_norm": 0.25228138426980523,
"learning_rate": 3.159637744210888e-05,
"loss": 0.4754,
"step": 287
},
{
"epoch": 3.043593130779392,
"grad_norm": 0.2079690373301947,
"learning_rate": 3.130616475341332e-05,
"loss": 0.4723,
"step": 288
},
{
"epoch": 3.0541611624834872,
"grad_norm": 0.18869207117374362,
"learning_rate": 3.101643160867208e-05,
"loss": 0.4762,
"step": 289
},
{
"epoch": 3.0647291941875827,
"grad_norm": 0.22869432322360056,
"learning_rate": 3.072719398929887e-05,
"loss": 0.4737,
"step": 290
},
{
"epoch": 3.0752972258916778,
"grad_norm": 0.15890007803087935,
"learning_rate": 3.0438467849374702e-05,
"loss": 0.473,
"step": 291
},
{
"epoch": 3.085865257595773,
"grad_norm": 0.210513738135415,
"learning_rate": 3.0150269114767862e-05,
"loss": 0.4767,
"step": 292
},
{
"epoch": 3.096433289299868,
"grad_norm": 0.18412299124477924,
"learning_rate": 2.9862613682255383e-05,
"loss": 0.4761,
"step": 293
},
{
"epoch": 3.107001321003963,
"grad_norm": 0.16086763510358643,
"learning_rate": 2.957551741864631e-05,
"loss": 0.4743,
"step": 294
},
{
"epoch": 3.117569352708058,
"grad_norm": 0.1836629217850037,
"learning_rate": 2.928899615990643e-05,
"loss": 0.475,
"step": 295
},
{
"epoch": 3.1281373844121534,
"grad_norm": 0.16351418203553833,
"learning_rate": 2.90030657102848e-05,
"loss": 0.478,
"step": 296
},
{
"epoch": 3.1387054161162484,
"grad_norm": 0.1543546082591788,
"learning_rate": 2.8717741841441964e-05,
"loss": 0.4736,
"step": 297
},
{
"epoch": 3.1492734478203435,
"grad_norm": 0.14811662444817492,
"learning_rate": 2.8433040291580053e-05,
"loss": 0.4814,
"step": 298
},
{
"epoch": 3.1598414795244385,
"grad_norm": 0.15161900774346382,
"learning_rate": 2.8148976764574648e-05,
"loss": 0.4723,
"step": 299
},
{
"epoch": 3.1704095112285335,
"grad_norm": 0.14342849635394805,
"learning_rate": 2.7865566929108573e-05,
"loss": 0.4761,
"step": 300
},
{
"epoch": 3.180977542932629,
"grad_norm": 0.1660043564516897,
"learning_rate": 2.758282641780762e-05,
"loss": 0.4797,
"step": 301
},
{
"epoch": 3.191545574636724,
"grad_norm": 0.141920732705004,
"learning_rate": 2.7300770826378302e-05,
"loss": 0.4782,
"step": 302
},
{
"epoch": 3.202113606340819,
"grad_norm": 0.14950725725564776,
"learning_rate": 2.7019415712747558e-05,
"loss": 0.4748,
"step": 303
},
{
"epoch": 3.212681638044914,
"grad_norm": 0.14208430168695735,
"learning_rate": 2.6738776596204624e-05,
"loss": 0.4748,
"step": 304
},
{
"epoch": 3.223249669749009,
"grad_norm": 0.14961017547836333,
"learning_rate": 2.6458868956544984e-05,
"loss": 0.4803,
"step": 305
},
{
"epoch": 3.233817701453104,
"grad_norm": 0.15645442679040064,
"learning_rate": 2.6179708233216557e-05,
"loss": 0.4758,
"step": 306
},
{
"epoch": 3.2443857331571992,
"grad_norm": 0.1537770287650884,
"learning_rate": 2.590130982446802e-05,
"loss": 0.4737,
"step": 307
},
{
"epoch": 3.2549537648612947,
"grad_norm": 0.14713329423087662,
"learning_rate": 2.5623689086499496e-05,
"loss": 0.4733,
"step": 308
},
{
"epoch": 3.2655217965653898,
"grad_norm": 0.15291508052862732,
"learning_rate": 2.5346861332615476e-05,
"loss": 0.4782,
"step": 309
},
{
"epoch": 3.276089828269485,
"grad_norm": 0.13920928762276993,
"learning_rate": 2.5070841832380212e-05,
"loss": 0.4709,
"step": 310
},
{
"epoch": 3.28665785997358,
"grad_norm": 0.1465474915994889,
"learning_rate": 2.4795645810775414e-05,
"loss": 0.4826,
"step": 311
},
{
"epoch": 3.297225891677675,
"grad_norm": 0.1316301584217805,
"learning_rate": 2.4521288447360457e-05,
"loss": 0.4731,
"step": 312
},
{
"epoch": 3.3077939233817704,
"grad_norm": 0.1461801975123496,
"learning_rate": 2.424778487543512e-05,
"loss": 0.4734,
"step": 313
},
{
"epoch": 3.3183619550858654,
"grad_norm": 0.12719699667003487,
"learning_rate": 2.3975150181204817e-05,
"loss": 0.4737,
"step": 314
},
{
"epoch": 3.3289299867899604,
"grad_norm": 0.13764467132837127,
"learning_rate": 2.370339940294848e-05,
"loss": 0.4703,
"step": 315
},
{
"epoch": 3.3394980184940555,
"grad_norm": 0.12510614551548813,
"learning_rate": 2.3432547530189033e-05,
"loss": 0.4786,
"step": 316
},
{
"epoch": 3.3500660501981505,
"grad_norm": 0.13577816786562447,
"learning_rate": 2.316260950286661e-05,
"loss": 0.4775,
"step": 317
},
{
"epoch": 3.3606340819022456,
"grad_norm": 0.13089413394472932,
"learning_rate": 2.2893600210514464e-05,
"loss": 0.4756,
"step": 318
},
{
"epoch": 3.3712021136063406,
"grad_norm": 0.1333288679621952,
"learning_rate": 2.2625534491437672e-05,
"loss": 0.4811,
"step": 319
},
{
"epoch": 3.381770145310436,
"grad_norm": 0.13920771603982868,
"learning_rate": 2.2358427131894732e-05,
"loss": 0.4815,
"step": 320
},
{
"epoch": 3.392338177014531,
"grad_norm": 0.12578897335151773,
"learning_rate": 2.2092292865281845e-05,
"loss": 0.477,
"step": 321
},
{
"epoch": 3.402906208718626,
"grad_norm": 0.1406551312359615,
"learning_rate": 2.1827146371320334e-05,
"loss": 0.4761,
"step": 322
},
{
"epoch": 3.413474240422721,
"grad_norm": 0.11847800398887365,
"learning_rate": 2.156300227524688e-05,
"loss": 0.4794,
"step": 323
},
{
"epoch": 3.4240422721268162,
"grad_norm": 0.1289493089958156,
"learning_rate": 2.1299875147006838e-05,
"loss": 0.4803,
"step": 324
},
{
"epoch": 3.4346103038309117,
"grad_norm": 0.12509059453118254,
"learning_rate": 2.10377795004505e-05,
"loss": 0.4774,
"step": 325
},
{
"epoch": 3.4451783355350067,
"grad_norm": 0.13580138707758002,
"learning_rate": 2.0776729792532652e-05,
"loss": 0.4733,
"step": 326
},
{
"epoch": 3.455746367239102,
"grad_norm": 0.12405647314750252,
"learning_rate": 2.0516740422515022e-05,
"loss": 0.4762,
"step": 327
},
{
"epoch": 3.466314398943197,
"grad_norm": 0.14580809381579368,
"learning_rate": 2.0257825731172077e-05,
"loss": 0.4764,
"step": 328
},
{
"epoch": 3.476882430647292,
"grad_norm": 0.1281590923924517,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.4706,
"step": 329
},
{
"epoch": 3.487450462351387,
"grad_norm": 0.1298579667890172,
"learning_rate": 1.9743277450428962e-05,
"loss": 0.4725,
"step": 330
},
{
"epoch": 3.498018494055482,
"grad_norm": 0.13901143297798416,
"learning_rate": 1.9487672243038594e-05,
"loss": 0.4811,
"step": 331
},
{
"epoch": 3.5085865257595774,
"grad_norm": 0.12006662533988818,
"learning_rate": 1.9233198476777003e-05,
"loss": 0.4755,
"step": 332
},
{
"epoch": 3.5191545574636725,
"grad_norm": 0.13509800282835904,
"learning_rate": 1.897987018818302e-05,
"loss": 0.4736,
"step": 333
},
{
"epoch": 3.5297225891677675,
"grad_norm": 0.12372407695325513,
"learning_rate": 1.8727701350612026e-05,
"loss": 0.4771,
"step": 334
},
{
"epoch": 3.5402906208718625,
"grad_norm": 0.13277097757447007,
"learning_rate": 1.8476705873465097e-05,
"loss": 0.477,
"step": 335
},
{
"epoch": 3.5508586525759576,
"grad_norm": 0.1190584597907601,
"learning_rate": 1.8226897601421858e-05,
"loss": 0.472,
"step": 336
},
{
"epoch": 3.561426684280053,
"grad_norm": 0.13360122007454706,
"learning_rate": 1.7978290313676774e-05,
"loss": 0.4812,
"step": 337
},
{
"epoch": 3.571994715984148,
"grad_norm": 0.12353280522144573,
"learning_rate": 1.7730897723179144e-05,
"loss": 0.4735,
"step": 338
},
{
"epoch": 3.582562747688243,
"grad_norm": 0.12197612268234642,
"learning_rate": 1.748473347587666e-05,
"loss": 0.4758,
"step": 339
},
{
"epoch": 3.593130779392338,
"grad_norm": 0.13321021249065187,
"learning_rate": 1.7239811149962756e-05,
"loss": 0.4777,
"step": 340
},
{
"epoch": 3.603698811096433,
"grad_norm": 0.11696601002999722,
"learning_rate": 1.6996144255127586e-05,
"loss": 0.473,
"step": 341
},
{
"epoch": 3.6142668428005287,
"grad_norm": 0.12376625663933627,
"learning_rate": 1.675374623181294e-05,
"loss": 0.475,
"step": 342
},
{
"epoch": 3.6248348745046233,
"grad_norm": 0.11118193452823608,
"learning_rate": 1.6512630450470784e-05,
"loss": 0.4679,
"step": 343
},
{
"epoch": 3.6354029062087188,
"grad_norm": 0.1194452215053386,
"learning_rate": 1.6272810210825794e-05,
"loss": 0.4748,
"step": 344
},
{
"epoch": 3.645970937912814,
"grad_norm": 0.12011227175979526,
"learning_rate": 1.6034298741141768e-05,
"loss": 0.4766,
"step": 345
},
{
"epoch": 3.656538969616909,
"grad_norm": 0.10988264863047191,
"learning_rate": 1.579710919749196e-05,
"loss": 0.4735,
"step": 346
},
{
"epoch": 3.667107001321004,
"grad_norm": 0.11947936011899343,
"learning_rate": 1.5561254663033393e-05,
"loss": 0.4801,
"step": 347
},
{
"epoch": 3.677675033025099,
"grad_norm": 0.11387001376651971,
"learning_rate": 1.53267481472852e-05,
"loss": 0.4761,
"step": 348
},
{
"epoch": 3.6882430647291944,
"grad_norm": 0.10828606028620547,
"learning_rate": 1.5093602585411078e-05,
"loss": 0.4745,
"step": 349
},
{
"epoch": 3.6988110964332894,
"grad_norm": 0.1158137481167655,
"learning_rate": 1.4861830837505733e-05,
"loss": 0.4739,
"step": 350
},
{
"epoch": 3.7093791281373845,
"grad_norm": 0.11883853717597995,
"learning_rate": 1.4631445687885553e-05,
"loss": 0.479,
"step": 351
},
{
"epoch": 3.7199471598414795,
"grad_norm": 0.10711524384366743,
"learning_rate": 1.4402459844383451e-05,
"loss": 0.4751,
"step": 352
},
{
"epoch": 3.7305151915455745,
"grad_norm": 0.1179009653082842,
"learning_rate": 1.4174885937647905e-05,
"loss": 0.4782,
"step": 353
},
{
"epoch": 3.74108322324967,
"grad_norm": 0.11366521654185796,
"learning_rate": 1.3948736520446246e-05,
"loss": 0.4727,
"step": 354
},
{
"epoch": 3.7516512549537646,
"grad_norm": 0.10920060902225881,
"learning_rate": 1.372402406697229e-05,
"loss": 0.4725,
"step": 355
},
{
"epoch": 3.76221928665786,
"grad_norm": 0.12313189801852109,
"learning_rate": 1.3500760972158223e-05,
"loss": 0.4809,
"step": 356
},
{
"epoch": 3.772787318361955,
"grad_norm": 0.11049761112743338,
"learning_rate": 1.3278959550991011e-05,
"loss": 0.4782,
"step": 357
},
{
"epoch": 3.78335535006605,
"grad_norm": 0.10430827010366775,
"learning_rate": 1.3058632037832957e-05,
"loss": 0.4743,
"step": 358
},
{
"epoch": 3.793923381770145,
"grad_norm": 0.11402898251067686,
"learning_rate": 1.2839790585747008e-05,
"loss": 0.4767,
"step": 359
},
{
"epoch": 3.8044914134742402,
"grad_norm": 0.11162325433379153,
"learning_rate": 1.2622447265826345e-05,
"loss": 0.4812,
"step": 360
},
{
"epoch": 3.8150594451783357,
"grad_norm": 0.10511190972273543,
"learning_rate": 1.2406614066528543e-05,
"loss": 0.4768,
"step": 361
},
{
"epoch": 3.8256274768824308,
"grad_norm": 0.10681643814143399,
"learning_rate": 1.219230289301431e-05,
"loss": 0.4717,
"step": 362
},
{
"epoch": 3.836195508586526,
"grad_norm": 0.11345833004338865,
"learning_rate": 1.1979525566490845e-05,
"loss": 0.4694,
"step": 363
},
{
"epoch": 3.846763540290621,
"grad_norm": 0.10629326422923135,
"learning_rate": 1.176829382355973e-05,
"loss": 0.475,
"step": 364
},
{
"epoch": 3.857331571994716,
"grad_norm": 0.10487140068250281,
"learning_rate": 1.1558619315569572e-05,
"loss": 0.4727,
"step": 365
},
{
"epoch": 3.8678996036988114,
"grad_norm": 0.10353148274468497,
"learning_rate": 1.1350513607973351e-05,
"loss": 0.4749,
"step": 366
},
{
"epoch": 3.878467635402906,
"grad_norm": 0.11029487277113502,
"learning_rate": 1.1143988179690441e-05,
"loss": 0.4756,
"step": 367
},
{
"epoch": 3.8890356671070014,
"grad_norm": 0.10444430423294958,
"learning_rate": 1.093905442247348e-05,
"loss": 0.4715,
"step": 368
},
{
"epoch": 3.8996036988110965,
"grad_norm": 0.10279747300542211,
"learning_rate": 1.073572364027999e-05,
"loss": 0.4701,
"step": 369
},
{
"epoch": 3.9101717305151915,
"grad_norm": 0.10814918538118634,
"learning_rate": 1.0534007048648846e-05,
"loss": 0.4773,
"step": 370
},
{
"epoch": 3.9207397622192866,
"grad_norm": 0.10568724216385697,
"learning_rate": 1.0333915774081698e-05,
"loss": 0.4742,
"step": 371
},
{
"epoch": 3.9313077939233816,
"grad_norm": 0.10446279448267286,
"learning_rate": 1.0135460853429166e-05,
"loss": 0.4703,
"step": 372
},
{
"epoch": 3.941875825627477,
"grad_norm": 0.10539141636554224,
"learning_rate": 9.938653233282105e-06,
"loss": 0.4788,
"step": 373
},
{
"epoch": 3.952443857331572,
"grad_norm": 0.10404420155804553,
"learning_rate": 9.74350376936779e-06,
"loss": 0.4766,
"step": 374
},
{
"epoch": 3.963011889035667,
"grad_norm": 0.11291514851116513,
"learning_rate": 9.550023225951124e-06,
"loss": 0.4786,
"step": 375
},
{
"epoch": 3.973579920739762,
"grad_norm": 0.11054229268613286,
"learning_rate": 9.358222275240884e-06,
"loss": 0.4715,
"step": 376
},
{
"epoch": 3.984147952443857,
"grad_norm": 0.10047846272456959,
"learning_rate": 9.168111496801071e-06,
"loss": 0.4767,
"step": 377
},
{
"epoch": 3.9947159841479527,
"grad_norm": 0.10427667029654042,
"learning_rate": 8.979701376967313e-06,
"loss": 0.472,
"step": 378
},
{
"epoch": 4.005284015852047,
"grad_norm": 0.15160742471081606,
"learning_rate": 8.793002308268521e-06,
"loss": 0.4599,
"step": 379
},
{
"epoch": 4.015852047556143,
"grad_norm": 0.18761042591080226,
"learning_rate": 8.608024588853561e-06,
"loss": 0.4482,
"step": 380
},
{
"epoch": 4.026420079260237,
"grad_norm": 0.12647969828019093,
"learning_rate": 8.424778421923258e-06,
"loss": 0.4377,
"step": 381
},
{
"epoch": 4.036988110964333,
"grad_norm": 0.1707261073721691,
"learning_rate": 8.24327391516761e-06,
"loss": 0.4438,
"step": 382
},
{
"epoch": 4.047556142668428,
"grad_norm": 0.20005656576433853,
"learning_rate": 8.06352108020823e-06,
"loss": 0.4427,
"step": 383
},
{
"epoch": 4.058124174372523,
"grad_norm": 0.14945783232518764,
"learning_rate": 7.885529832046134e-06,
"loss": 0.4404,
"step": 384
},
{
"epoch": 4.068692206076618,
"grad_norm": 0.13331270252131155,
"learning_rate": 7.709309988514824e-06,
"loss": 0.438,
"step": 385
},
{
"epoch": 4.079260237780713,
"grad_norm": 0.1373177545653382,
"learning_rate": 7.534871269738753e-06,
"loss": 0.442,
"step": 386
},
{
"epoch": 4.0898282694848085,
"grad_norm": 0.14428406658074794,
"learning_rate": 7.362223297597184e-06,
"loss": 0.444,
"step": 387
},
{
"epoch": 4.100396301188904,
"grad_norm": 0.1403209142453575,
"learning_rate": 7.191375595193433e-06,
"loss": 0.4423,
"step": 388
},
{
"epoch": 4.110964332892999,
"grad_norm": 0.12198908241232029,
"learning_rate": 7.022337586329597e-06,
"loss": 0.4436,
"step": 389
},
{
"epoch": 4.121532364597094,
"grad_norm": 0.11000767534504179,
"learning_rate": 6.855118594986718e-06,
"loss": 0.4443,
"step": 390
},
{
"epoch": 4.132100396301189,
"grad_norm": 0.1243765198168875,
"learning_rate": 6.6897278448105405e-06,
"loss": 0.4389,
"step": 391
},
{
"epoch": 4.142668428005284,
"grad_norm": 0.1326492943126363,
"learning_rate": 6.526174458602681e-06,
"loss": 0.4461,
"step": 392
},
{
"epoch": 4.153236459709379,
"grad_norm": 0.11686776812880073,
"learning_rate": 6.364467457817482e-06,
"loss": 0.4384,
"step": 393
},
{
"epoch": 4.163804491413474,
"grad_norm": 0.10054607780091933,
"learning_rate": 6.20461576206433e-06,
"loss": 0.4389,
"step": 394
},
{
"epoch": 4.17437252311757,
"grad_norm": 0.11593044593000827,
"learning_rate": 6.046628188615718e-06,
"loss": 0.4398,
"step": 395
},
{
"epoch": 4.184940554821664,
"grad_norm": 0.11430820998845044,
"learning_rate": 5.890513451920843e-06,
"loss": 0.4367,
"step": 396
},
{
"epoch": 4.19550858652576,
"grad_norm": 0.11287261939767183,
"learning_rate": 5.736280163124974e-06,
"loss": 0.4444,
"step": 397
},
{
"epoch": 4.206076618229854,
"grad_norm": 0.10321211605129811,
"learning_rate": 5.583936829594434e-06,
"loss": 0.4436,
"step": 398
},
{
"epoch": 4.21664464993395,
"grad_norm": 0.10361621115480053,
"learning_rate": 5.4334918544473436e-06,
"loss": 0.4405,
"step": 399
},
{
"epoch": 4.227212681638045,
"grad_norm": 0.10863502177296244,
"learning_rate": 5.284953536090131e-06,
"loss": 0.443,
"step": 400
},
{
"epoch": 4.23778071334214,
"grad_norm": 0.10536497157476482,
"learning_rate": 5.1383300677598024e-06,
"loss": 0.4497,
"step": 401
},
{
"epoch": 4.248348745046235,
"grad_norm": 0.10402891881587309,
"learning_rate": 4.993629537071978e-06,
"loss": 0.4396,
"step": 402
},
{
"epoch": 4.25891677675033,
"grad_norm": 0.10369536487473348,
"learning_rate": 4.850859925574809e-06,
"loss": 0.4393,
"step": 403
},
{
"epoch": 4.2694848084544255,
"grad_norm": 0.10111203263697728,
"learning_rate": 4.710029108308733e-06,
"loss": 0.441,
"step": 404
},
{
"epoch": 4.28005284015852,
"grad_norm": 0.09912151235410817,
"learning_rate": 4.571144853372063e-06,
"loss": 0.4403,
"step": 405
},
{
"epoch": 4.2906208718626155,
"grad_norm": 0.09757157331199569,
"learning_rate": 4.434214821492542e-06,
"loss": 0.4473,
"step": 406
},
{
"epoch": 4.301188903566711,
"grad_norm": 0.1002102756021715,
"learning_rate": 4.299246565604756e-06,
"loss": 0.4383,
"step": 407
},
{
"epoch": 4.311756935270806,
"grad_norm": 0.10119754294494109,
"learning_rate": 4.166247530433531e-06,
"loss": 0.4363,
"step": 408
},
{
"epoch": 4.322324966974901,
"grad_norm": 0.10100539004557267,
"learning_rate": 4.035225052083309e-06,
"loss": 0.4406,
"step": 409
},
{
"epoch": 4.332892998678996,
"grad_norm": 0.09823672032855243,
"learning_rate": 3.906186357633455e-06,
"loss": 0.4432,
"step": 410
},
{
"epoch": 4.343461030383091,
"grad_norm": 0.09520265410670203,
"learning_rate": 3.779138564739646e-06,
"loss": 0.436,
"step": 411
},
{
"epoch": 4.354029062087187,
"grad_norm": 0.10024695357191145,
"learning_rate": 3.6540886812412547e-06,
"loss": 0.4405,
"step": 412
},
{
"epoch": 4.364597093791281,
"grad_norm": 0.10079785013841713,
"learning_rate": 3.5310436047748263e-06,
"loss": 0.4403,
"step": 413
},
{
"epoch": 4.375165125495377,
"grad_norm": 0.09516762618496769,
"learning_rate": 3.4100101223935743e-06,
"loss": 0.4408,
"step": 414
},
{
"epoch": 4.385733157199471,
"grad_norm": 0.09404930151960895,
"learning_rate": 3.290994910193037e-06,
"loss": 0.4442,
"step": 415
},
{
"epoch": 4.396301188903567,
"grad_norm": 0.09531190586303208,
"learning_rate": 3.174004532942845e-06,
"loss": 0.4451,
"step": 416
},
{
"epoch": 4.406869220607662,
"grad_norm": 0.0951071973632443,
"learning_rate": 3.059045443724582e-06,
"loss": 0.4423,
"step": 417
},
{
"epoch": 4.417437252311757,
"grad_norm": 0.09705103462701054,
"learning_rate": 2.946123983575846e-06,
"loss": 0.4372,
"step": 418
},
{
"epoch": 4.428005284015852,
"grad_norm": 0.09365563642961511,
"learning_rate": 2.8352463811404952e-06,
"loss": 0.4397,
"step": 419
},
{
"epoch": 4.438573315719947,
"grad_norm": 0.09694934512919057,
"learning_rate": 2.726418752325084e-06,
"loss": 0.4465,
"step": 420
},
{
"epoch": 4.449141347424042,
"grad_norm": 0.09546394710630239,
"learning_rate": 2.619647099961502e-06,
"loss": 0.4457,
"step": 421
},
{
"epoch": 4.459709379128137,
"grad_norm": 0.09255730663301787,
"learning_rate": 2.514937313475865e-06,
"loss": 0.4405,
"step": 422
},
{
"epoch": 4.4702774108322325,
"grad_norm": 0.09338235711048254,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.4395,
"step": 423
},
{
"epoch": 4.480845442536328,
"grad_norm": 0.09435019171855656,
"learning_rate": 2.3117263268712e-06,
"loss": 0.44,
"step": 424
},
{
"epoch": 4.491413474240423,
"grad_norm": 0.09413409031470825,
"learning_rate": 2.2132363356832532e-06,
"loss": 0.4432,
"step": 425
},
{
"epoch": 4.501981505944518,
"grad_norm": 0.09468855592447012,
"learning_rate": 2.11683062761713e-06,
"loss": 0.4422,
"step": 426
},
{
"epoch": 4.512549537648613,
"grad_norm": 0.09055553100177934,
"learning_rate": 2.0225145203230044e-06,
"loss": 0.4401,
"step": 427
},
{
"epoch": 4.523117569352708,
"grad_norm": 0.09082079754983816,
"learning_rate": 1.930293216190586e-06,
"loss": 0.4367,
"step": 428
},
{
"epoch": 4.533685601056803,
"grad_norm": 0.09114595588995444,
"learning_rate": 1.8401718020621694e-06,
"loss": 0.4383,
"step": 429
},
{
"epoch": 4.544253632760898,
"grad_norm": 0.09147630992219807,
"learning_rate": 1.7521552489520566e-06,
"loss": 0.4455,
"step": 430
},
{
"epoch": 4.554821664464994,
"grad_norm": 0.09327852060770747,
"learning_rate": 1.666248411772342e-06,
"loss": 0.441,
"step": 431
},
{
"epoch": 4.565389696169088,
"grad_norm": 0.09307772965424148,
"learning_rate": 1.5824560290651404e-06,
"loss": 0.444,
"step": 432
},
{
"epoch": 4.575957727873184,
"grad_norm": 0.09415356635848661,
"learning_rate": 1.5007827227411942e-06,
"loss": 0.4417,
"step": 433
},
{
"epoch": 4.586525759577278,
"grad_norm": 0.09176517213906957,
"learning_rate": 1.4212329978249417e-06,
"loss": 0.4425,
"step": 434
},
{
"epoch": 4.597093791281374,
"grad_norm": 0.09036761344520468,
"learning_rate": 1.3438112422060256e-06,
"loss": 0.4416,
"step": 435
},
{
"epoch": 4.607661822985469,
"grad_norm": 0.0920008670611069,
"learning_rate": 1.2685217263972693e-06,
"loss": 0.4433,
"step": 436
},
{
"epoch": 4.618229854689564,
"grad_norm": 0.08954221258416922,
"learning_rate": 1.1953686032990964e-06,
"loss": 0.4431,
"step": 437
},
{
"epoch": 4.628797886393659,
"grad_norm": 0.0897729908602636,
"learning_rate": 1.124355907970487e-06,
"loss": 0.4451,
"step": 438
},
{
"epoch": 4.639365918097754,
"grad_norm": 0.09192361815684204,
"learning_rate": 1.0554875574063784e-06,
"loss": 0.4423,
"step": 439
},
{
"epoch": 4.6499339498018495,
"grad_norm": 0.09033170522999512,
"learning_rate": 9.887673503216422e-07,
"loss": 0.4394,
"step": 440
},
{
"epoch": 4.660501981505945,
"grad_norm": 0.09133489415949655,
"learning_rate": 9.241989669415097e-07,
"loss": 0.4447,
"step": 441
},
{
"epoch": 4.67107001321004,
"grad_norm": 0.09425014878930163,
"learning_rate": 8.61785968798623e-07,
"loss": 0.4397,
"step": 442
},
{
"epoch": 4.681638044914135,
"grad_norm": 0.09161607478315605,
"learning_rate": 8.015317985365301e-07,
"loss": 0.4435,
"step": 443
},
{
"epoch": 4.69220607661823,
"grad_norm": 0.0915131016919999,
"learning_rate": 7.434397797198367e-07,
"loss": 0.4392,
"step": 444
},
{
"epoch": 4.702774108322325,
"grad_norm": 0.09011982457022855,
"learning_rate": 6.875131166508553e-07,
"loss": 0.4429,
"step": 445
},
{
"epoch": 4.71334214002642,
"grad_norm": 0.09105298330396881,
"learning_rate": 6.337548941928839e-07,
"loss": 0.4452,
"step": 446
},
{
"epoch": 4.723910171730515,
"grad_norm": 0.08866673271037578,
"learning_rate": 5.821680776000049e-07,
"loss": 0.4422,
"step": 447
},
{
"epoch": 4.734478203434611,
"grad_norm": 0.08801181304315894,
"learning_rate": 5.32755512353571e-07,
"loss": 0.436,
"step": 448
},
{
"epoch": 4.745046235138705,
"grad_norm": 0.0892771815810736,
"learning_rate": 4.8551992400522e-07,
"loss": 0.4452,
"step": 449
},
{
"epoch": 4.755614266842801,
"grad_norm": 0.08829094089099762,
"learning_rate": 4.4046391802655463e-07,
"loss": 0.4422,
"step": 450
},
{
"epoch": 4.766182298546895,
"grad_norm": 0.09129909001389838,
"learning_rate": 3.975899796654137e-07,
"loss": 0.443,
"step": 451
},
{
"epoch": 4.776750330250991,
"grad_norm": 0.0898229105447953,
"learning_rate": 3.569004738087989e-07,
"loss": 0.4408,
"step": 452
},
{
"epoch": 4.787318361955085,
"grad_norm": 0.08880239621448514,
"learning_rate": 3.183976448524106e-07,
"loss": 0.4375,
"step": 453
},
{
"epoch": 4.797886393659181,
"grad_norm": 0.08857557909002102,
"learning_rate": 2.8208361657688474e-07,
"loss": 0.4422,
"step": 454
},
{
"epoch": 4.808454425363276,
"grad_norm": 0.08766599065540119,
"learning_rate": 2.479603920306106e-07,
"loss": 0.4358,
"step": 455
},
{
"epoch": 4.819022457067371,
"grad_norm": 0.09087441316527663,
"learning_rate": 2.1602985341925953e-07,
"loss": 0.4366,
"step": 456
},
{
"epoch": 4.8295904887714665,
"grad_norm": 0.0907938224300551,
"learning_rate": 1.8629376200197004e-07,
"loss": 0.442,
"step": 457
},
{
"epoch": 4.840158520475561,
"grad_norm": 0.08888968582423885,
"learning_rate": 1.5875375799419e-07,
"loss": 0.4358,
"step": 458
},
{
"epoch": 4.8507265521796565,
"grad_norm": 0.08900710504460911,
"learning_rate": 1.3341136047719805e-07,
"loss": 0.4367,
"step": 459
},
{
"epoch": 4.861294583883752,
"grad_norm": 0.08986360536362852,
"learning_rate": 1.1026796731433475e-07,
"loss": 0.4414,
"step": 460
},
{
"epoch": 4.871862615587847,
"grad_norm": 0.08885451973141226,
"learning_rate": 8.932485507387345e-08,
"loss": 0.4368,
"step": 461
},
{
"epoch": 4.882430647291942,
"grad_norm": 0.08728339714966311,
"learning_rate": 7.058317895861866e-08,
"loss": 0.445,
"step": 462
},
{
"epoch": 4.892998678996037,
"grad_norm": 0.08758661927375506,
"learning_rate": 5.40439727421882e-08,
"loss": 0.442,
"step": 463
},
{
"epoch": 4.903566710700132,
"grad_norm": 0.0906444113153008,
"learning_rate": 3.970814871197437e-08,
"loss": 0.4387,
"step": 464
},
{
"epoch": 4.914134742404228,
"grad_norm": 0.08754952023250234,
"learning_rate": 2.75764976188464e-08,
"loss": 0.4401,
"step": 465
},
{
"epoch": 4.924702774108322,
"grad_norm": 0.08803257540350204,
"learning_rate": 1.764968863351424e-08,
"loss": 0.4401,
"step": 466
},
{
"epoch": 4.935270805812418,
"grad_norm": 0.08863746420839871,
"learning_rate": 9.928269309638083e-09,
"loss": 0.4398,
"step": 467
},
{
"epoch": 4.945838837516512,
"grad_norm": 0.08825690980969667,
"learning_rate": 4.412665553594764e-09,
"loss": 0.4478,
"step": 468
},
{
"epoch": 4.956406869220608,
"grad_norm": 0.09184591938217317,
"learning_rate": 1.103181601020964e-09,
"loss": 0.4439,
"step": 469
},
{
"epoch": 4.966974900924702,
"grad_norm": 0.08777993584647581,
"learning_rate": 0.0,
"loss": 0.4386,
"step": 470
},
{
"epoch": 4.966974900924702,
"step": 470,
"total_flos": 1.2094498954511647e+19,
"train_loss": 0.5709579251548077,
"train_runtime": 47190.5732,
"train_samples_per_second": 5.131,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 470,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2094498954511647e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}